Révision 50

Pi/GPU/Pi-GPU.py (revision 50)
148 148
   __syncthreads();
149 149

  
150 150
}
151

  
152
__global__ void MainLoopBlocks64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
153
{
154
   uint z=seed_z/(blockIdx.x+1);
155
   uint w=seed_w/(blockIdx.x+1);
156

  
157
   ulong total=0;
158

  
159
   for (ulong i=0;i<iterations;i++) {
160

  
161
      double x=(double)MWCfp ;
162
      double y=(double)MWCfp ;
163

  
164
      // Matching test
165
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
166
      total+=inside;
167

  
168
   }
169

  
170
   s[blockIdx.x]=total;
171
   __syncthreads();
172

  
173
}
174

  
175
__global__ void MainLoopThreads64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
176
{
177
   uint z=seed_z/(threadIdx.x+1);
178
   uint w=seed_w/(threadIdx.x+1);
179

  
180
   ulong total=0;
181

  
182
   for (ulong i=0;i<iterations;i++) {
183

  
184
      double x=(double)MWCfp ;
185
      double y=(double)MWCfp ;
186

  
187
      // Matching test
188
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
189
      total+=inside;
190

  
191
   }
192

  
193
   s[threadIdx.x]=total;
194
   __syncthreads();
195

  
196
}
197

  
198
__global__ void MainLoopHybrid64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
199
{
200
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
201
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
202

  
203
   ulong total=0;
204

  
205
   for (ulong i=0;i<iterations;i++) {
206

  
207
      double x=(double)MWCfp ;
208
      double y=(double)MWCfp ;
209

  
210
      // Matching test
211
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
212
      total+=inside;
213

  
214
   }
215

  
216
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
217
   __syncthreads();
218

  
219
}
151 220
"""
152 221

  
153 222
KERNEL_CODE_OPENCL="""
223
#pragma OPENCL EXTENSION cl_khr_fp64: enable
154 224

  
155 225
// Marsaglia RNG very simple implementation
156 226
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
......
225 295
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
226 296
      
227 297
}
298

  
299
__kernel void MainLoopGlobal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
300
{
301
   uint z=seed_z/(get_global_id(0)+1);
302
   uint w=seed_w/(get_global_id(0)+1);
303

  
304
   ulong total=0;
305

  
306
   for (ulong i=0;i<iterations;i++) {
307

  
308
      double x=(double)MWCfp ;
309
      double y=(double)MWCfp ;
310

  
311
      // Matching test
312
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
313
      total+=inside;
314
   }
315
   s[get_global_id(0)]=total;
316
   barrier(CLK_GLOBAL_MEM_FENCE);
317
      
318
}
319

  
320
__kernel void MainLoopLocal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
321
{
322
   uint z=seed_z/(get_local_id(0)+1);
323
   uint w=seed_w/(get_local_id(0)+1);
324

  
325
   ulong total=0;
326

  
327
   for (ulong i=0;i<iterations;i++) {
328

  
329
      double x=(double)MWCfp ;
330
      double y=(double)MWCfp ;
331

  
332
      // Matching test
333
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
334
      total+=inside;
335
   }
336
   s[get_local_id(0)]=total;
337
   barrier(CLK_LOCAL_MEM_FENCE);
338
      
339
}
340

  
341
__kernel void MainLoopHybrid64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
342
{
343
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
344
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
345

  
346
   ulong total=0;
347

  
348
   for (uint i=0;i<iterations;i++) {
349

  
350
      double x=(double)MWCfp ;
351
      double y=(double)MWCfp ;
352

  
353
      // Matching test
354
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
355
      total+=inside;
356
   }
357
   barrier(CLK_LOCAL_MEM_FENCE);
358
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
359
      
360
}
228 361
"""
229 362

  
230
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
363
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,DoublePrecision):
231 364

  
232 365
  # Avec PyCUDA autoinit, rien a faire !
233 366
  
......
238 371
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
239 372
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
240 373
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
374
  MetropolisBlocks64CU=mod.get_function("MainLoopBlocks64")
375
  MetropolisJobs64CU=mod.get_function("MainLoopThreads64")
376
  MetropolisHybrid64CU=mod.get_function("MainLoopHybrid64")
241 377
  
242 378
  start = pycuda.driver.Event()
243 379
  stop = pycuda.driver.Event()
244 380
  
245 381
  MyPi=numpy.zeros(steps)
246 382
  MyDuration=numpy.zeros(steps)
247
  
383

  
248 384
  if iterations%jobs==0:
249 385
    iterationsCL=numpy.uint64(iterations/jobs)
250 386
    iterationsNew=iterationsCL*jobs
......
256 392
    start.record()
257 393
    start.synchronize()
258 394
    if ParaStyle=='Blocks':
259
      MetropolisBlocksCU(circleCU,
260
                         numpy.uint64(iterationsCL),
261
                         numpy.uint32(nprnd(2**30/jobs)),
262
                         numpy.uint32(nprnd(2**30/jobs)),
263
                         grid=(jobs,1),
264
                         block=(1,1,1))
395
      if DoublePrecision:
396
        MetropolisBlocksCU(circleCU,
397
                           numpy.uint64(iterationsCL),
398
                           numpy.uint32(nprnd(2**30/jobs)),
399
                           numpy.uint32(nprnd(2**30/jobs)),
400
                           grid=(jobs,1),
401
                           block=(1,1,1))
402
      else:
403
        MetropolisBlocks64CU(circleCU,
404
                             numpy.uint64(iterationsCL),
405
                             numpy.uint32(nprnd(2**30/jobs)),
406
                             numpy.uint32(nprnd(2**30/jobs)),
407
                             grid=(jobs,1),
408
                             block=(1,1,1))
409
        
265 410
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
266 411
            (Alu,jobs,1,ParaStyle)      
267 412
    elif ParaStyle=='Hybrid':
268 413
      threads=BestThreadsNumber(jobs)
269
      MetropolisHybridCU(circleCU,
270
                          numpy.uint64(iterationsCL),
271
                          numpy.uint32(nprnd(2**30/jobs)),
272
                          numpy.uint32(nprnd(2**30/jobs)),
273
                          grid=(jobs,1),
274
                          block=(threads,1,1))
414
      if DoublePrecision:
415
        MetropolisHybrid64CU(circleCU,
416
                             numpy.uint64(iterationsCL),
417
                             numpy.uint32(nprnd(2**30/jobs)),
418
                             numpy.uint32(nprnd(2**30/jobs)),
419
                             grid=(jobs,1),
420
                             block=(threads,1,1))
421
      else:
422
        MetropolisHybridCU(circleCU,
423
                           numpy.uint64(iterationsCL),
424
                           numpy.uint32(nprnd(2**30/jobs)),
425
                           numpy.uint32(nprnd(2**30/jobs)),
426
                           grid=(jobs,1),
427
                           block=(threads,1,1))
275 428
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
276 429
            (Alu,jobs/threads,threads,ParaStyle)
277 430
    else:
278
      MetropolisJobsCU(circleCU,
279
                          numpy.uint64(iterationsCL),
280
                          numpy.uint32(nprnd(2**30/jobs)),
281
                          numpy.uint32(nprnd(2**30/jobs)),
282
                          grid=(1,1),
283
                          block=(jobs,1,1))
431
      if DoublePrecision:
432
        MetropolisJobs64CU(circleCU,
433
                           numpy.uint64(iterationsCL),
434
                           numpy.uint32(nprnd(2**30/jobs)),
435
                           numpy.uint32(nprnd(2**30/jobs)),
436
                           grid=(1,1),
437
                           block=(jobs,1,1))
438
      else:
439
        MetropolisJobsCU(circleCU,
440
                         numpy.uint64(iterationsCL),
441
                         numpy.uint32(nprnd(2**30/jobs)),
442
                         numpy.uint32(nprnd(2**30/jobs)),
443
                         grid=(1,1),
444
                         block=(jobs,1,1))
284 445
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
285 446
            (Alu,jobs,1,ParaStyle)
286 447
    stop.record()
287 448
    stop.synchronize()
288 449
                
289
    #elapsed = stop.time_since(start)*1e-3
290 450
    elapsed = start.time_till(stop)*1e-3
291 451

  
292
    #print circle,float(numpy.sum(circle))
293
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
294 452
    MyDuration[i]=elapsed
295
    #print MyPi[i],MyDuration[i]
296
    #time.sleep(1)
453
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
454
    MyPi[i]=numpy.median(AllPi)
455
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
297 456

  
457

  
298 458
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
299 459

  
300 460
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
301 461

  
302 462

  
303
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
463
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
464
                     DoublePrecision):
304 465
	
305 466
  # Initialisation des variables en les CASTant correctement
306 467
    
......
338 499
      return(0,0,0)
339 500
				
340 501
  # Je cree le contexte et la queue pour son execution
341
  #ctx = cl.create_some_context()
342 502
  ctx = cl.Context([XPU])
343 503
  queue = cl.CommandQueue(ctx,
344 504
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
......
351 511
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
352 512
    options = "-cl-mad-enable -cl-fast-relaxed-math")
353 513

  
354
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
355

  
356 514
  i=0
357 515

  
358 516
  MyPi=numpy.zeros(steps)
......
375 533
      # SeedZCL is lattice translated in CL format
376 534
      # SeedWCL is lattice translated in CL format
377 535
      # step is number of iterations
378
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
379
                                           circleCL,
380
                                           numpy.uint64(iterationsCL),
381
                                           numpy.uint32(nprnd(2**30/jobs)),
382
                                           numpy.uint32(nprnd(2**30/jobs)))
536
      if DoublePrecision:
537
        CLLaunch=MetropolisCL.MainLoopGlobal64(queue,(jobs,),None,
538
                                               circleCL,
539
                                               numpy.uint64(iterationsCL),
540
                                               numpy.uint32(nprnd(2**30/jobs)),
541
                                               numpy.uint32(nprnd(2**30/jobs)))
542
      else:
543
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
544
                                             circleCL,
545
                                             numpy.uint64(iterationsCL),
546
                                             numpy.uint32(nprnd(2**30/jobs)),
547
                                             numpy.uint32(nprnd(2**30/jobs)))
383 548
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
384 549
            (Alu,jobs,1,ParaStyle)
385 550
    elif ParaStyle=='Hybrid':
386 551
      threads=BestThreadsNumber(jobs)
387 552
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
388
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
389
                                          circleCL,
390
                                          numpy.uint64(iterationsCL),
391
                                          numpy.uint32(nprnd(2**30/jobs)),
392
                                          numpy.uint32(nprnd(2**30/jobs)))
553
      if DoublePrecision:
554
        CLLaunch=MetropolisCL.MainLoopHybrid64(queue,(jobs,),(threads,),
555
                                               circleCL,
556
                                               numpy.uint64(iterationsCL),
557
                                               numpy.uint32(nprnd(2**30/jobs)),
558
                                               numpy.uint32(nprnd(2**30/jobs)))
559
      else:
560
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
561
                                             circleCL,
562
                                             numpy.uint64(iterationsCL),
563
                                             numpy.uint32(nprnd(2**30/jobs)),
564
                                             numpy.uint32(nprnd(2**30/jobs)))
565
        
393 566
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
394 567
            (Alu,jobs/threads,threads,ParaStyle)
395 568
    else:
396 569
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
397
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
398
                                          circleCL,
399
                                          numpy.uint64(iterationsCL),
400
                                          numpy.uint32(nprnd(2**30/jobs)),
401
                                          numpy.uint32(nprnd(2**30/jobs)))
570
      if DoublePrecision:
571
        CLLaunch=MetropolisCL.MainLoopLocal64(queue,(jobs,),(jobs,),
572
                                              circleCL,
573
                                              numpy.uint64(iterationsCL),
574
                                              numpy.uint32(nprnd(2**30/jobs)),
575
                                              numpy.uint32(nprnd(2**30/jobs)))
576
      else:
577
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
578
                                            circleCL,
579
                                            numpy.uint64(iterationsCL),
580
                                            numpy.uint32(nprnd(2**30/jobs)),
581
                                            numpy.uint32(nprnd(2**30/jobs)))
402 582
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
403 583

  
404 584
    CLLaunch.wait()
......
515 695
  Metrology='InMetro'
516 696
  # Curves is True to print the curves
517 697
  Curves=False
698
  # DoublePrecision on FP calculus
699
  DoublePrecision=False
518 700

  
519 701
  try:
520
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
702
    opts, args = getopt.getopt(sys.argv[1:],"hocla:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
521 703
  except getopt.GetoptError:
522
    print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
704
    print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
523 705
    sys.exit(2)
524 706
    
525 707
  for opt, arg in opts:
526 708
    if opt == '-h':
527
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
709
      print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
528 710

  
529 711
      print "\nInformations about devices detected under OpenCL:"
530 712
      # For PyOpenCL import
......
540 722
    elif opt == '-o':
541 723
      OutMetrology=True
542 724
      Metrology='OutMetro'
725
    elif opt == '-l':
726
      DoublePrecision=True
543 727
    elif opt == '-c':
544 728
      Curves=True
545 729
    elif opt in ("-a", "--alu"):
......
578 762
  print "Number of threads on end : %s" % JobEnd
579 763
  print "Number of redo : %s" % Redo
580 764
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
765
  print "Double Precision in Kernels : %r" % DoublePrecision
581 766

  
582 767
  if GpuStyle=='CUDA':
583 768
    # For PyCUDA import
......
594 779
      for device in platform.get_devices():
595 780
        deviceType=cl.device_type.to_string(device.type)
596 781
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
782
        if Id == Device:
783
          # Set the Alu as detected Device Type
784
          Alu=deviceType
597 785
        Id=Id+1
598 786

  
599 787
  average=numpy.array([]).astype(numpy.float32)
......
615 803
        start=time.time()
616 804
        if GpuStyle=='CUDA':
617 805
          try:
618
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
806
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
807
                                 DoublePrecision)
619 808
          except:
620 809
            print "Problem with %i // computations on Cuda" % Jobs
621 810
        elif GpuStyle=='OpenCL':
622 811
          try:
623
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device)
812
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
813
                                   Alu,Device,DoublePrecision)
624 814
          except:
625 815
            print "Problem with %i // computations on OpenCL" % Jobs            
626 816
        duration=numpy.append(duration,time.time()-start)
......
633 823
    else:
634 824
      if GpuStyle=='CUDA':
635 825
        try:
636
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
826
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,
827
                                     DoublePrecision)
637 828
        except:
638 829
          print "Problem with %i // computations on Cuda" % Jobs
639 830
      elif GpuStyle=='OpenCL':
......
641 832
        #   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
642 833
        # except:
643 834
        #   print "Problem with %i // computations on OpenCL" % Jobs            
644
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
835
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,DoublePrecision)
645 836

  
646 837
    if (avg,med,std) != (0,0,0):
647 838
      print "jobs,avg,med,std",Jobs,avg,med,std
......
651 842
    else:
652 843
      print "Values seem to be wrong..."
653 844
    #THREADS*=2
845
    if DoublePrecision:
846
      Precision='DP'
847
    else:
848
      Precision='SP'
654 849
    if len(average)!=0:
655
      numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
656
      numpy.savetxt("Pi_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
850
      numpy.savez("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
851
      ToSave=[ ExploredJobs,average,median,stddev ]
852
      numpy.savetxt("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
657 853
    Jobs+=JobStep
658 854

  
659 855
  FitAndPrint(ExploredJobs,median,Curves)

Formats disponibles : Unified diff