Révision 68

Splutter/GPU/SplutterGPU.py (revision 68)
25 25
import math
26 26
from socket import gethostname
27 27

  
28
# find prime factors of a number
29
# Taken from the web:
30
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
31
def PrimeFactors(x):
  """Return the prime factors of the integer x, smallest first.

  A repeated factor appears once per multiplicity (12 gives 2, 2, 3).
  Values below 2 have no prime factors and yield an empty array.

  x      -- integer to factorise
  return -- numpy array; it starts as an empty 'uint32' array and the
            factors are added to it with numpy.append
  """
  factors=[]
  loop=2
  # Trial division: a composite x always has a divisor <= sqrt(x), so the
  # search can stop there instead of walking all the way up to x.
  while loop*loop<=x:
    if x%loop==0:
      # Floor division keeps x an exact integer; plain '/' turns it into a
      # float under Python 3 or "from __future__ import division".
      x//=loop
      factors.append(loop)
    else:
      loop+=1
  # Whatever is left above 1 is the last (largest) prime factor.
  if x>1:
    factors.append(x)
  factorlist=numpy.array([]).astype('uint32')
  if factors:
    # One append for the whole list instead of one array copy per factor.
    factorlist=numpy.append(factorlist,factors)
  return factorlist
41
  
28 42
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
29 43
# output is thread number
30 44
def BestThreadsNumber(jobs):
......
53 67
def Mylq2(N, T1,s,c1,c2,p):
  """Evaluate T1*(s+p/N) + c1*N + c2*N*N for the given arguments."""
  # Same terms, same left-to-right summation order as the one-line form.
  scaled=T1*(s+p/N)
  linear=c1*N
  quadratic=c2*N*N
  return (scaled+linear+quadratic)
55 69

  
56
prout="""
57

  
58
"""
59

  
60

  
61 70
KERNEL_CODE_CUDA="""
62 71

  
63 72
// Marsaglia RNG very simple implementation
......
275 284
__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
276 285
{
277 286
   __private const ulong id=(ulong)get_global_id(0);
278
   __private const ulong size=(ulong)get_global_size(0);
279
   __private const ulong block=(ulong)space/(ulong)size;
280 287
   
281 288
   __private uint z=seed_z-(uint)id;
282 289
   __private uint w=seed_w+(uint)id;
......
287 294
   for (__private ulong i=0;i<iterations;i++) {
288 295

  
289 296
      // Dense version
290
      __private size_t position=(size_t)( ((ulong)MWC*(ulong)space)/MAX );
297
      __private size_t position=(size_t)( MWC%space );
291 298

  
292 299
      atomic_inc(&s[position]);
293 300
   }
......
296 303

  
297 304
}
298 305

  
299
__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
306
__kernel void SplutterLocal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
300 307
{
301
   __private const ulong id=(ulong)get_global_id(0);
302
   __private const ulong size=(ulong)get_global_size(0);
303
   __private const ulong block=(ulong)space/(ulong)size;
304
   
305
   __private uint z=seed_z-(uint)id;
306
   __private uint w=seed_w+(uint)id;
307

  
308
   __private uint jsr=seed_z;
309
   __private uint jcong=seed_w;
310

  
311
   for (__private ulong i=0;i<iterations;i++) {
312

  
313
      // Dense version
314
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
315

  
316
      s[position]++;
317
   }
318

  
319
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
320

  
321
}
322

  
323
// SplutterGlobalSparse: every work item performs `iterations` increments on
// the counter array `s` (declared length argument: `space`).
//   s          - global array of uint counters, incremented in place
//   space      - used to derive the per-work-item `block` (space/size)
//   iterations - number of increments done by each work item
//   seed_w, seed_z - seeds copied into the private generator state below
// The index is (...)*size+id, so, assuming MWC and MAX expand to integer
// expressions (they are defined outside this view - TODO confirm), each
// work item only touches indices congruent to its own id modulo size.
// NOTE(review): the increment is a plain s[position]++ (not atomic), and if
// space < size then block is 0 and position == id, which may lie beyond
// `space` - confirm the caller guarantees space >= global size.
__kernel void SplutterGlobalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
324
{
325
   __private const ulong id=(ulong)get_global_id(0); // global index of this work item
326
   __private const ulong size=(ulong)get_global_size(0); // total number of work items
327
   __private const ulong block=(ulong)space/(ulong)size; // integer division: slots per work item
328
   
329
   __private uint z=seed_z-(uint)id; // per-work-item state derived from seed_z
330
   __private uint w=seed_w+(uint)id; // per-work-item state derived from seed_w
331

  
332
   __private uint jsr=seed_z; // not referenced in the visible body; presumably read by a generator macro
333
   __private uint jcong=seed_w; // not referenced in the visible body; presumably read by a generator macro
334

  
335
   for (__private ulong i=0;i<iterations;i++) {
336

  
337
      // Sparse version
338
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id ); // evaluated left to right: ((MWC*block)/MAX)*size+id
339

  
340
      s[position]++; // plain read-modify-write on global memory
341
   }
342

  
343
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); // work-group barrier issued after the loop, just before the kernel returns
344

  
345
}
346

  
347
__kernel void SplutterLocalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
348
{
349 308
   __private const ulong id=(ulong)get_local_id(0);
350
   __private const ulong size=(ulong)get_local_size(0);
351
   __private const ulong block=(ulong)space/(ulong)size;
352 309
   
353 310
   __private uint z=seed_z-(uint)id;
354 311
   __private uint w=seed_w+(uint)id;
......
359 316
   for (__private ulong i=0;i<iterations;i++) {
360 317

  
361 318
      // Dense version
362
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
319
      //__private size_t position=(size_t)( (MWC+id*block)%space );
320
      __private size_t position=(size_t)( MWC%space );
363 321

  
364
      s[position]++;
322
      atomic_inc(&s[position]);
365 323
   }
366 324

  
367 325
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
368 326

  
369 327
}
370 328

  
371
__kernel void SplutterLocalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
329
__kernel void SplutterHybrid(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
372 330
{
373
   __private const ulong id=(ulong)get_local_id(0);
374
   __private const ulong size=(ulong)get_local_size(0);
375
   __private const ulong block=(ulong)space/(ulong)size;
331
   __private const ulong id=(ulong)(get_global_id(0)+get_local_id(0));
376 332
   
377 333
   __private uint z=seed_z-(uint)id;
378 334
   __private uint w=seed_w+(uint)id;
......
382 338

  
383 339
   for (__private ulong i=0;i<iterations;i++) {
384 340

  
385
      // Sparse version
386
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
387

  
388
      s[position]++;
389
   }
390

  
391
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
392

  
393
}
394

  
395
__kernel void SplutterHybridDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
396
{
397
   __private const ulong id=(ulong)(get_global_id(0));
398
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
399
   __private const ulong block=(ulong)space/(ulong)size;
400
   
401
   __private uint z=seed_z-(uint)id;
402
   __private uint w=seed_w+(uint)id;
403

  
404
   __private uint jsr=seed_z;
405
   __private uint jcong=seed_w;
406

  
407
   for (__private ulong i=0;i<iterations;i++) {
408

  
409 341
      // Dense version
410
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
342
      __private size_t position=(size_t)( MWC%space );
411 343

  
412
      s[position]++;
344
      atomic_inc(&s[position]);
413 345
   }
414 346
      
415 347
}
416 348

  
417
// SplutterHybridSparse: every work item performs `iterations` increments on
// the counter array `s` (declared length argument: `space`).
//   s          - global array of uint counters, incremented in place
//   space      - used to derive the per-work-item `block` (space/size)
//   iterations - number of increments done by each work item
//   seed_w, seed_z - seeds copied into the private generator state below
// Here `size` is computed as local size times number of work-groups.
// The index is (...)*size+id, so, assuming MWC and MAX expand to integer
// expressions (they are defined outside this view - TODO confirm), each
// work item only touches indices congruent to its own id modulo size.
// NOTE(review): the increment is a plain s[position]++ (not atomic), and if
// space < size then block is 0 and position == id, which may lie beyond
// `space` - confirm the caller guarantees space >= size.
__kernel void SplutterHybridSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
418
{
419
   __private const ulong id=(ulong)(get_global_id(0)); // global index of this work item
420
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0)); // work-group size times number of groups
421
   __private const ulong block=(ulong)space/(ulong)size; // integer division: slots per work item
422
   
423
   __private uint z=seed_z-(uint)id; // per-work-item state derived from seed_z
424
   __private uint w=seed_w+(uint)id; // per-work-item state derived from seed_w
425

  
426
   __private uint jsr=seed_z; // not referenced in the visible body; presumably read by a generator macro
427
   __private uint jcong=seed_w; // not referenced in the visible body; presumably read by a generator macro
428

  
429
   for (__private ulong i=0;i<iterations;i++) {
430

  
431
      // Sparse version
432
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id ); // evaluated left to right: ((MWC*block)/MAX)*size+id
433

  
434
      s[position]++; // plain read-modify-write on global memory
435
   }
436
      
437
}
438

  
439 349
"""
440 350

  
441
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density):
351
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density,Memory):
442 352

  
443 353
  # Avec PyCUDA autoinit, rien a faire !
444 354

  
......
536 446

  
537 447

  
538 448
def MetropolisOpenCL(circle,iterations,steps,jobs,
539
                     ParaStyle,Alu,Device,Density):
449
                     ParaStyle,Alu,Device,Memory):
540 450
	
541 451
  # Initialisation des variables en les CASTant correctement
542 452

  
......
616 526
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))
617 527
  
618 528
  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
619
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
529
  #Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
530
  Splutter=numpy.zeros(Memory).astype(numpy.uint32)
620 531

  
621 532
  for i in range(steps):
622 533
		
......
645 556
      #                                      numpy.uint64(iterationsCL),
646 557
      #                                      numpy.uint32(nprnd(2**30/jobs)),
647 558
      #                                      numpy.uint32(nprnd(2**30/jobs)))
648
      if Density=='Dense':
649
        CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None,
650
                                                  SplutterCL,
651
                                                  numpy.uint32(len(Splutter)),
652
                                                  numpy.uint64(iterationsCL),
653
                                                  numpy.uint32(521288629),
654
                                                  numpy.uint32(362436069))
655
      elif Density=='Sparse':
656
        CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None,
657
                                                   SplutterCL,
658
                                                   numpy.uint32(len(Splutter)),
659
                                                   numpy.uint64(iterationsCL),
660
                                                   numpy.uint32(521288629),
661
                                                   numpy.uint32(362436069))
559
      CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
560
                                           SplutterCL,
561
                                           numpy.uint32(len(Splutter)),
562
                                           numpy.uint64(iterationsCL),
563
                                           numpy.uint32(nprnd(2**30/jobs)),
564
                                           numpy.uint32(nprnd(2**30/jobs)))
662 565
        
663
      else:
664
        CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
665
                                             SplutterCL,
666
                                             numpy.uint32(len(Splutter)),
667
                                             numpy.uint64(iterationsCL),
668
                                             numpy.uint32(521288629),
669
                                             numpy.uint32(362436069))
670
        
671 566
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
672 567
            (Alu,jobs,1,ParaStyle)
673 568
    elif ParaStyle=='Hybrid':
674
      threads=BestThreadsNumber(jobs)
569
      #threads=BestThreadsNumber(jobs)
570
      threads=BestThreadsNumber(256)
571
      print "print",threads      
675 572
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
676
      if Density=='Dense':
677
        CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,),
678
                                                  SplutterCL,
679
                                                  numpy.uint32(len(Splutter)),
680
                                                  numpy.uint64(iterationsCL),
681
                                                  numpy.uint32(nprnd(2**30/jobs)),
682
                                                  numpy.uint32(nprnd(2**30/jobs)))
683
      elif Density=='Sparse':
684
        CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,),
685
                                                   SplutterCL,
686
                                                   numpy.uint32(len(Splutter)),
687
                                                   numpy.uint64(iterationsCL),
688
                                                   numpy.uint32(nprnd(2**30/jobs)),
689
                                                   numpy.uint32(nprnd(2**30/jobs)))
573
      CLLaunch=MetropolisCL.SplutterHybrid(queue,(jobs,),(threads,),
574
                                           SplutterCL,
575
                                           numpy.uint32(len(Splutter)),
576
                                           numpy.uint64(iterationsCL),
577
                                           numpy.uint32(nprnd(2**30/jobs)),
578
                                           numpy.uint32(nprnd(2**30/jobs)))
690 579
        
691 580
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
692 581
            (Alu,jobs/threads,threads,ParaStyle)
693 582
    else:
694 583
      # en OpenCL, necessaire de mettre un global_id identique au local_id
695
      if Density=='Dense':
696
        CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,),
697
                                                 SplutterCL,
698
                                                 numpy.uint32(len(Splutter)),
699
                                                 numpy.uint64(iterationsCL),
700
                                                 numpy.uint32(nprnd(2**30/jobs)),
701
                                                 numpy.uint32(nprnd(2**30/jobs)))
702
      elif Density=='Sparse':
703
        CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,),
704
                                                  SplutterCL,
705
                                                  numpy.uint32(len(Splutter)),
706
                                                  numpy.uint64(iterationsCL),
707
                                                  numpy.uint32(nprnd(2**30/jobs)),
708
                                                  numpy.uint32(nprnd(2**30/jobs)))
584
      CLLaunch=MetropolisCL.SplutterLocal(queue,(jobs,),(jobs,),
585
                                          SplutterCL,
586
                                          numpy.uint32(len(Splutter)),
587
                                          numpy.uint64(iterationsCL),
588
                                          numpy.uint32(nprnd(2**30/jobs)),
589
                                          numpy.uint32(nprnd(2**30/jobs)))
709 590
        
710 591
        
711 592
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
......
719 600
    print Splutter,sum(Splutter)
720 601
    #MySplutter[i]=numpy.median(Splutter)
721 602
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)
603
    
604
  SplutterCL.release()
722 605

  
723
    SplutterCL.release()
724

  
725 606
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
726 607
	
727 608
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
......
830 711
  Curves=False
831 712
  # Fit is True to print the curves
832 713
  Fit=False
833
  # Spluttering is Dense by default
834
  Density='All'
714
  # Memory of vector explored
715
  Memory=1024
835 716

  
836 717
  try:
837
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:y:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","density="])
718
    # "memory=" has to be declared here: the option loop tests for "--memory",
    # but getopt rejects any long option missing from this list.
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","memory="])
838 719
  except getopt.GetoptError:
839
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
720
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]
840 721
    sys.exit(2)
841 722
    
842 723
  for opt, arg in opts:
843 724
    if opt == '-h':
844
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law)  -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
725
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]
845 726

  
846 727
      print "\nInformations about devices detected under OpenCL:"
847 728
      # For PyOpenCL import
......
865 746
      Metrology='OutMetro'
866 747
    elif opt == '-c':
867 748
      Curves=True
868
    elif opt in ("-y", "--density"):
869
      Density = arg
870 749
    elif opt == '-f':
871 750
      Fit=True
872 751
    elif opt in ("-a", "--alu"):
......
887 766
      JobStep = int(arg)
888 767
    elif opt in ("-r", "--redo"):
889 768
      Redo = int(arg)
769
    elif opt in ("-m", "--memory"):
770
      Memory = int(arg)
890 771

  
891 772
  print "Toto %s" % Alu
892 773

  
......
902 783
  print "Device Identification : %s" % Device
903 784
  print "GpuStyle used : %s" % GpuStyle
904 785
  print "Parallel Style used : %s" % ParaStyle
905
  print "Density Spluttering : %s" % Density
906 786
  print "Iterations : %s" % Iterations
907 787
  print "Number of threads on start : %s" % JobStart
908 788
  print "Number of threads on end : %s" % JobEnd
909 789
  print "Number of redo : %s" % Redo
790
  print "Memory  : %s" % Memory
910 791
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
911 792

  
912 793
  if GpuStyle=='CUDA':
......
954 835
        start=time.time()
955 836
        if GpuStyle=='CUDA':
956 837
          try:
957
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Density)
838
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
839
                                 Memory)
958 840
          except:
959 841
            print "Problem with %i // computations on Cuda" % Jobs
960 842
        elif GpuStyle=='OpenCL':
961 843
          try:
962 844
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
963
                                   Alu,Device,Density)
845
                                   Alu,Device,Memory)
964 846
          except:
965 847
            print "Problem with %i // computations on OpenCL" % Jobs            
966 848
        duration=numpy.append(duration,time.time()-start)
......
974 856
      if GpuStyle=='CUDA':
975 857
        try:
976 858
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,
977
                                     Jobs,ParaStyle,Density)
859
                                     Jobs,ParaStyle,Memory)
978 860
        except:
979 861
          print "Problem with %i // computations on Cuda" % Jobs
980 862
      elif GpuStyle=='OpenCL':
981 863
        try:
982
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,
983
                                       Jobs,ParaStyle,Alu,Device,Density)
864
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,
865
                                       ParaStyle,Alu,Device,Memory)
984 866
        except:
985 867
          print "Problem with %i // computations on OpenCL" % Jobs            
986 868

  

Formats disponibles : Unified diff