Révision 66 Splutter/GPU/SplutterGPU.py

SplutterGPU.py (revision 66)
7 7
#
8 8
# Thanks to Andreas Klöckner for PyCUDA and PyOpenCL:
9 9
# http://mathema.tician.de/software/pycuda
10
# http://mathema.tician.de/software/pyopencl
10 11
# 
11 12

  
12 13
# 2013-01-01 : problems with launch timeout
13 14
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14 15
# Workaround: set Option "Interactive" "0" in /etc/X11/xorg.conf
15 16

  
17
# Notes on Marsaglia's random number generators (RNG)
18

  
16 19
# Common tools
17 20
import numpy
18 21
from numpy.random import randint as nprnd
......
64 67
def Mylq2(N, T1,s,c1,c2,p):
65 68
  return (T1*(s+p/N)+c1*N+c2*N*N)
66 69

  
70
prout="""
71

  
72
"""
73

  
74

  
67 75
KERNEL_CODE_CUDA="""
68 76

  
69 77
// Very simple implementation of Marsaglia's RNGs
......
82 90
#define KISSfp KISS * 2.328306435454494e-10f
83 91

  
84 92
#define MAX (ulong)4294967296
93
#define UMAX (uint)2147483648
85 94

  
95
__global__ void SplutterGlobal(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
96
{
97
    const ulong id=(ulong)(blockIdx.x);
98
   
99
    uint z=seed_z-(uint)id;
100
    uint w=seed_w+(uint)id;
101

  
102
    uint jsr=seed_z;
103
    uint jcong=seed_w;
104

  
105
   for ( ulong i=0;i<iterations;i++) {
106

  
107
      // Generic ("All") version: draw a random position over the whole space
108
      uint position=(uint)( ((ulong)MWC*(ulong)space)/MAX );
109

  
110
      // UMAX bounds the counter: atomicInc wraps it back to 0 once the stored value reaches UMAX
111
      atomicInc(&s[position],UMAX);
112
   }
113

  
114
   __syncthreads();
115
}
116

  
86 117
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
87 118
{
88 119
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
......
229 260
"""
230 261

  
231 262
KERNEL_CODE_OPENCL="""
263
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
264

  
232 265
// Very simple implementation of Marsaglia's RNGs
233 266
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
234 267
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
......
253 286
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
254 287
}
255 288

  
289
__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
290
{
291
   __private const ulong id=(ulong)get_global_id(0);
292
   __private const ulong size=(ulong)get_global_size(0);
293
   __private const ulong block=(ulong)space/(ulong)size;
294
   
295
   __private uint z=seed_z-(uint)id;
296
   __private uint w=seed_w+(uint)id;
297

  
298
   __private uint jsr=seed_z;
299
   __private uint jcong=seed_w;
300

  
301
   for (__private ulong i=0;i<iterations;i++) {
302

  
303
      // Generic ("All") version: draw a random position over the whole space
304
      __private size_t position=(size_t)( ((ulong)MWC*(ulong)space)/MAX );
305

  
306
      atomic_inc(&s[position]);
307
   }
308

  
309
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
310

  
311
}
312

  
256 313
__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
257 314
{
258 315
   __private const ulong id=(ulong)get_global_id(0);
......
395 452

  
396 453
"""
397 454

  
398
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Dense):
455
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density):
399 456

  
400 457
  # With PyCUDA autoinit, nothing to do!
401
  
458

  
402 459
  circleCU = cuda.InOut(circle)
403 460
  
404
  print "prout"
405

  
406 461
  mod = SourceModule(KERNEL_CODE_CUDA)
407 462

  
408
  print "prout 2"
409

  
410
  if Dense:
463
  if Density=='Dense':
411 464
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
412 465
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
413 466
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
414
  else:
467
  elif Density=='Sparse':
415 468
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
416 469
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
417 470
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")
418
  
419
  print "prout 3"
420

  
471
  else:
472
    MetropolisBlocksCU=mod.get_function("SplutterGlobal")
473
    
421 474
  start = pycuda.driver.Event()
422 475
  stop = pycuda.driver.Event()
423 476
  
......
496 549
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
497 550

  
498 551

  
499
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,Dense):
552
def MetropolisOpenCL(circle,iterations,steps,jobs,
553
                     ParaStyle,Alu,Device,Density):
500 554
	
501 555
  # Initialize the variables, casting them to the proper types
502 556

  
......
605 659
      #                                      numpy.uint64(iterationsCL),
606 660
      #                                      numpy.uint32(nprnd(2**30/jobs)),
607 661
      #                                      numpy.uint32(nprnd(2**30/jobs)))
608
      if Dense:
662
      if Density=='Dense':
609 663
        CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None,
610 664
                                                  SplutterCL,
611 665
                                                  numpy.uint32(len(Splutter)),
612 666
                                                  numpy.uint64(iterationsCL),
613 667
                                                  numpy.uint32(521288629),
614 668
                                                  numpy.uint32(362436069))
615
      else:
669
      elif Density=='Sparse':
616 670
        CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None,
617 671
                                                   SplutterCL,
618 672
                                                   numpy.uint32(len(Splutter)),
......
620 674
                                                   numpy.uint32(521288629),
621 675
                                                   numpy.uint32(362436069))
622 676
        
677
      else:
678
        CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
679
                                             SplutterCL,
680
                                             numpy.uint32(len(Splutter)),
681
                                             numpy.uint64(iterationsCL),
682
                                             numpy.uint32(521288629),
683
                                             numpy.uint32(362436069))
684
        
623 685
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
624 686
            (Alu,jobs,1,ParaStyle)
625 687
    elif ParaStyle=='Hybrid':
626 688
      threads=BestThreadsNumber(jobs)
627 689
      # In OpenCL, the global_id must be set identical to the local_id
628
      if Dense:
690
      if Density=='Dense':
629 691
        CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,),
630 692
                                                  SplutterCL,
631 693
                                                  numpy.uint32(len(Splutter)),
632 694
                                                  numpy.uint64(iterationsCL),
633 695
                                                  numpy.uint32(nprnd(2**30/jobs)),
634 696
                                                  numpy.uint32(nprnd(2**30/jobs)))
635
      else:
697
      elif Density=='Sparse':
636 698
        CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,),
637 699
                                                   SplutterCL,
638 700
                                                   numpy.uint32(len(Splutter)),
......
643 705
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
644 706
            (Alu,jobs/threads,threads,ParaStyle)
645 707
    else:
646
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
647
      if Dense:
708
      # In OpenCL, the global_id must be set identical to the local_id
709
      if Density=='Dense':
648 710
        CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,),
649 711
                                                 SplutterCL,
650 712
                                                 numpy.uint32(len(Splutter)),
651 713
                                                 numpy.uint64(iterationsCL),
652 714
                                                 numpy.uint32(nprnd(2**30/jobs)),
653 715
                                                 numpy.uint32(nprnd(2**30/jobs)))
654
      else:
716
      elif Density=='Sparse':
655 717
        CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,),
656 718
                                                  SplutterCL,
657 719
                                                  numpy.uint32(len(Splutter)),
......
659 721
                                                  numpy.uint32(nprnd(2**30/jobs)),
660 722
                                                  numpy.uint32(nprnd(2**30/jobs)))
661 723
        
724
        
662 725
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
663 726

  
664 727
    CLLaunch.wait()
......
765 828
  # Parallel distribution can be on Threads or Blocks
766 829
  ParaStyle='Blocks'
767 830
  # Iterations is integer
768
  Iterations=100000000
831
  Iterations=10000000
769 832
  # JobStart is the first number of Jobs to explore
770 833
  JobStart=1
771 834
  # JobEnd is last number of Jobs to explore
......
782 845
  # Fit is True to print the curves
783 846
  Fit=False
784 847
  # Spluttering density: 'Dense', 'Sparse' or 'All' (default)
785
  Dense=True
848
  Density='All'
786 849

  
787 850
  try:
788
    opts, args = getopt.getopt(sys.argv[1:],"hocfvwa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
851
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:y:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","density="])
789 852
  except getopt.GetoptError:
790
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
853
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
791 854
    sys.exit(2)
792 855
    
793 856
  for opt, arg in opts:
794 857
    if opt == '-h':
795
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law)  -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
858
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law)  -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
796 859

  
797 860
      print "\nInformations about devices detected under OpenCL:"
798 861
      # For PyOpenCL import
......
816 879
      Metrology='OutMetro'
817 880
    elif opt == '-c':
818 881
      Curves=True
819
    elif opt == '-v':
820
      Dense=True
821
    elif opt == '-w':
822
      Dense=False
882
    elif opt in ("-y", "--density"):
883
      Density = arg
823 884
    elif opt == '-f':
824 885
      Fit=True
825 886
    elif opt in ("-a", "--alu"):
......
841 902
    elif opt in ("-r", "--redo"):
842 903
      Redo = int(arg)
843 904

  
905
  print "Toto %s" % Alu
906

  
844 907
  if Alu=='CPU' and GpuStyle=='CUDA':
845 908
    print "Alu can't be CPU for CUDA, set Alu to GPU"
846 909
    Alu='GPU'
847 910

  
848 911
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
849 912
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
850
    ParaStyle='Threads'
913
    ParaStyle='Blocks'
851 914

  
852 915
  print "Compute unit : %s" % Alu
853 916
  print "Device Identification : %s" % Device
854 917
  print "GpuStyle used : %s" % GpuStyle
855 918
  print "Parallel Style used : %s" % ParaStyle
856
  print "Dense (or Sparse) Spluttering : %r" % Dense
919
  print "Density Spluttering : %s" % Density
857 920
  print "Iterations : %s" % Iterations
858 921
  print "Number of threads on start : %s" % JobStart
859 922
  print "Number of threads on end : %s" % JobEnd
......
905 968
        start=time.time()
906 969
        if GpuStyle=='CUDA':
907 970
          try:
908
            print "toto"
909
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Dense)
971
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Density)
910 972
          except:
911 973
            print "Problem with %i // computations on Cuda" % Jobs
912 974
        elif GpuStyle=='OpenCL':
913 975
          try:
914 976
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
915
                                   Alu,Device,Dense)
977
                                   Alu,Device,Density)
916 978
          except:
917 979
            print "Problem with %i // computations on OpenCL" % Jobs            
918 980
        duration=numpy.append(duration,time.time()-start)
......
925 987
    else:
926 988
      if GpuStyle=='CUDA':
927 989
        try:
928
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,Dense)
990
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,
991
                                     Jobs,ParaStyle,Density)
929 992
        except:
930 993
          print "Problem with %i // computations on Cuda" % Jobs
931 994
      elif GpuStyle=='OpenCL':
932 995
        try:
933
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,Dense)
996
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,
997
                                       Jobs,ParaStyle,Alu,Device,Density)
934 998
        except:
935 999
          print "Problem with %i // computations on OpenCL" % Jobs            
936 1000

  

Formats disponibles : Unified diff