/Pi/GPU/Pi-GPU.py-20130207 - Bench4GPU - Forge du Centre Blaise Pascal

root / Pi / GPU / Pi-GPU.py-20130207 @ 50

Historique | Voir | Annoter | Télécharger (18,25 ko)

       #!/usr/bin/env python
+      #
       # Pi-by-MC using PyCUDA
+      #
       # CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
+      #
       # Thanks to Andreas Klockner for PyCUDA:
       # http://mathema.tician.de/software/pycuda
+      #
       # 2013-01-01 : problems with launch timeout
       # nvidia-smi -c 1 : ko
       # nvidia-smi -c 3 : ko
       # nvidia-smi --gom=1 : not supported
       # http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
       # Option "Interactive" "0" in /etc/X11/xorg.conf
       # Common tools
       import numpy
       from numpy.random import randint as nprnd
       import sys
       import getopt
       import time
       import matplotlib.pyplot as plt
       import math
       from scipy.optimize import curve_fit
       from socket import gethostname
       # Predicted Amdahl Law (Reduced with s=1-p)
       def AmdahlR(N, T1, p):
         return (T1*(1-p+p/N))
       # Predicted Amdahl Law
       def Amdahl(N, T1, s, p):
         return (T1*(s+p/N))
       # Predicted Mylq Law with first order
       def Mylq(N, T1,s,c,p):
         return (T1*(s+c*N+p/N))
       # Predicted Mylq Law with second order
       def Mylq2(N, T1,s,c1,c2,p):
         return (T1*(s+c1*N+c2*N*N+p/N))
       KERNEL_CODE_CUDA="""
       // Marsaglia RNG very simple implementation
       #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
       #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
       #define MWC   (znew+wnew)
       #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
       #define CONG  (jcong=69069*jcong+1234567)
       #define KISS  ((MWC^CONG)+SHR3)
       //#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
       //#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
       #define MWCfp MWC * 2.328306435454494e-10f
       #define KISSfp KISS * 2.328306435454494e-10f
       __global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(blockIdx.x+1);
          uint w=seed_w/(blockIdx.x+1);
          // uint jsr=123456789;
          // uint jcong=380116160;
          int total=0;
          for (uint i=0;i<iterations;i++) {
             float x=MWCfp ;
             float y=MWCfp ;
             // Matching test
             int inside=((x*x+y*y) < 1.0f) ? 1:0;
             total+=inside;
+         }
          s[blockIdx.x]=total;
          __syncthreads();
+      }
       __global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(threadIdx.x+1);
          uint w=seed_w/(threadIdx.x+1);
          // uint jsr=123456789;
          // uint jcong=380116160;
          int total=0;
          for (uint i=0;i<iterations;i++) {
             float x=MWCfp ;
             float y=MWCfp ;
             // Matching test
             int inside=((x*x+y*y) < 1.0f) ? 1:0;
             total+=inside;
+         }
          s[threadIdx.x]=total;
          __syncthreads();
+      }
       __global__ void MainLoopHybrid(uint *s,uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
          uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
          // uint jsr=123456789;
          // uint jcong=380116160;
          int total=0;
          for (uint i=0;i<iterations;i++) {
             float x=MWCfp ;
             float y=MWCfp ;
             // Matching test
             int inside=((x*x+y*y) < 1.0f) ? 1:0;
             total+=inside;
+         }
          s[blockDim.x*blockIdx.x+threadIdx.x]=total;
          __syncthreads();
+      }
       """
       KERNEL_CODE_OPENCL="""
       // Marsaglia RNG very simple implementation
       #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
       #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
       #define MWC   (znew+wnew)
       #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
       #define CONG  (jcong=69069*jcong+1234567)
       #define KISS  ((MWC^CONG)+SHR3)
       //#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
       //#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
       #define MWCfp MWC * 2.328306435454494e-10f
       #define KISSfp KISS * 2.328306435454494e-10f
       __kernel void MainLoopGlobal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(get_global_id(0)+1);
          uint w=seed_w/(get_global_id(0)+1);
          // uint jsr=123456789;
          // uint jcong=380116160;
          int total=0;
          for (uint i=0;i<iterations;i++) {
             float x=MWCfp ;
             float y=MWCfp ;
             // Matching test
             int inside=((x*x+y*y) < 1.0f) ? 1:0;
             total+=inside;
+         }
          s[get_global_id(0)]=total;
          barrier(CLK_GLOBAL_MEM_FENCE);
+      }
       __kernel void MainLoopLocal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(get_local_id(0)+1);
          uint w=seed_w/(get_local_id(0)+1);
          // uint jsr=123456789;
          // uint jcong=380116160;
          int total=0;
          for (uint i=0;i<iterations;i++) {
             float x=MWCfp ;
             float y=MWCfp ;
             // Matching test
             int inside=((x*x+y*y) < 1.0f) ? 1:0;
             total+=inside;
+         }
          s[get_local_id(0)]=total;
          barrier(CLK_LOCAL_MEM_FENCE);
+      }
       __kernel void MainLoopHybrid(__global uint *s,uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(get_global_id(0)+1);
          uint w=seed_w/(get_global_id(0)+1);
          // uint jsr=123456789;
          // uint jcong=380116160;
          int total=0;
          for (uint i=0;i<iterations;i++) {
             float x=MWCfp ;
            float y=MWCfp ;
             // Matching test
             int inside=((x*x+y*y) < 1.0f) ? 1:0;
             total+=inside;
+         }
          barrier(CLK_LOCAL_MEM_FENCE);
          s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
+      }
       """
       def MetropolisCuda(circle,iterations,steps,threads,ParaStyle):
         # Avec PyCUDA autoinit, rien a faire !
         circleCU = cuda.InOut(circle)
         mod = SourceModule(KERNEL_CODE_CUDA)
         MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
         MetropolisThreadsCU=mod.get_function("MainLoopThreads")
         MetropolisHybridCU=mod.get_function("MainLoopHybrid")
         start = pycuda.driver.Event()
         stop = pycuda.driver.Event()
         MyPi=numpy.zeros(steps)
         MyDuration=numpy.zeros(steps)
         if iterations%threads==0:
           iterationsCL=numpy.uint32(iterations/threads+1)
           iterationsNew=iterationsCL*threads
         else:
           iterationsCL=numpy.uint32(iterations/threads)
           iterationsNew=iterations
         for i in range(steps):
           start.record()
           start.synchronize()
           if ParaStyle=='Blocks':
             MetropolisBlocksCU(circleCU,
                                numpy.uint32(iterationsCL),
                                numpy.uint32(nprnd(2**32/threads)),
                                numpy.uint32(nprnd(2**32/threads)),
                                grid=(threads,1),
                                block=(1,1,1))
             print "GPU with %i %s done" % (threads,ParaStyle)
           elif ParaStyle=='Hybrid':
             blocks=int(math.sqrt(float(threads)))
             MetropolisHybridCU(circleCU,
                                 numpy.uint32(iterationsCL),
                                 numpy.uint32(nprnd(2**32/threads)),
                                 numpy.uint32(nprnd(2**32/threads)),
                                 grid=(blocks,1),
                                 block=(threads/blocks,1,1))
             print "GPU with (blocks,threads)=(%i,%i) %s done" % (blocks,threads/blocks,ParaStyle)
           else:
             MetropolisThreadsCU(circleCU,
                                 numpy.uint32(iterationsCL),
                                 numpy.uint32(nprnd(2**32/threads)),
                                 numpy.uint32(nprnd(2**32/threads)),
                                 grid=(1,1),
                                 block=(threads,1,1))
             print "GPU with %i %s done" % (threads,ParaStyle)
           stop.record()
           stop.synchronize()
           #elapsed = stop.time_since(start)*1e-3
           elapsed = start.time_till(stop)*1e-3
           #print circle,float(numpy.sum(circle))
           MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
           MyDuration[i]=elapsed
           #print MyPi[i],MyDuration[i]
           #time.sleep(1)
         print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
         return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
       def MetropolisOpenCL(circle,iterations,steps,threads,ParaStyle,Alu):
         # Initialisation des variables en les CASTant correctement
         # Je detecte un peripherique GPU dans la liste des peripheriques
         # for platform in cl.get_platforms():
         # 	for device in platform.get_devices():
         # 		if cl.device_type.to_string(device.type)=='GPU':
         # 			GPU=device
         #print "GPU detected: ",device.name
         HasGPU=False
         # Device selection based on choice (default is GPU)
         for platform in cl.get_platforms():
           for device in platform.get_devices():
             if not HasGPU:
               deviceType=cl.device_type.to_string(device.type)
               if deviceType=="GPU" and Alu=="GPU":
                 GPU=device
                 print "GPU selected: ",device.name
                 HasGPU=True
               if deviceType=="CPU" and Alu=="CPU":
                 GPU=device
                 print "CPU selected: ",device.name
                 HasGPU=True
         # Je cree le contexte et la queue pour son execution
         #ctx = cl.create_some_context()
         ctx = cl.Context([GPU])
         queue = cl.CommandQueue(ctx,
                                 properties=cl.command_queue_properties.PROFILING_ENABLE)
         # Je recupere les flag possibles pour les buffers
         mf = cl.mem_flags
         circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
         MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
           options = "-cl-mad-enable -cl-fast-relaxed-math")
         #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
         i=0
         MyPi=numpy.zeros(steps)
         MyDuration=numpy.zeros(steps)
         if iterations%threads==0:
           iterationsCL=numpy.uint32(iterations/threads+1)
           iterationsNew=iterationsCL*threads
         else:
           iterationsCL=numpy.uint32(iterations/threads)
           iterationsNew=iterations
         for i in range(steps):
           if ParaStyle=='Blocks':
             # Call OpenCL kernel
             # (1,) is Global work size (only 1 work size)
             # (1,) is local work size
             # circleCL is lattice translated in CL format
             # SeedZCL is lattice translated in CL format
             # SeedWCL is lattice translated in CL format
             # step is number of iterations
             CLLaunch=MetropolisCL.MainLoopGlobal(queue,(threads,),None,
                                                  circleCL,
                                                  numpy.uint32(iterationsCL),
                                                  numpy.uint32(nprnd(2**32/threads)),
                                                  numpy.uint32(nprnd(2**32/threads)))
             print "%s with %i %s done" % (Alu,threads,ParaStyle)
           elif ParaStyle=='Hybrid':
             # en OpenCL, necessaire de mettre un Global_id identique au local_id
             blocks=int(math.sqrt(threads))
             CLLaunch=MetropolisCL.MainLoopHybrid(queue,(threads,),(blocks,),
                                                 circleCL,
                                                 numpy.uint32(iterationsCL),
                                                 numpy.uint32(nprnd(2**32/threads)),
                                                 numpy.uint32(nprnd(2**32/threads)))
             print "%s with (blocks,threads)=(%i,%i) %s done" % (Alu,blocks,threads/blocks,ParaStyle)
           else:
             # en OpenCL, necessaire de mettre un Global_id identique au local_id
             CLLaunch=MetropolisCL.MainLoopLocal(queue,(threads,),(threads,),
                                                 circleCL,
                                                 numpy.uint32(iterationsCL),
                                                 numpy.uint32(nprnd(2**32/threads)),
                                                 numpy.uint32(nprnd(2**32/threads)))
             print "%s with %i %s done" % (Alu,threads,ParaStyle)
           CLLaunch.wait()
           cl.enqueue_copy(queue, circle, circleCL).wait()
           elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
           print circle,float(numpy.sum(circle))
           MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
           MyDuration[i]=elapsed
           #print MyPi[i],MyDuration[i]
         circleCL.release()
         #print threads,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
         print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
         return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
       def FitAndPrint(N,D,Curves):
         try:
           coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
           D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
           coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
           coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
           coeffs_Amdahl[0]=D[0]
           print "Amdahl Normalized: T=%.2f(%.5f+%.5f/N)" % \
               (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
         except:
           print "Impossible to fit for Amdahl law : only %i elements" % len(D)
         try:
           coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
           D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
           coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
           coeffs_AmdahlR[0]=D[0]
           print "Amdahl Reduced Normalized: T=%.2f(%.5f+%.5f/N)" % \
               (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
         except:
           print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)
         try:
           coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
           coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
           coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
           coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
           coeffs_Mylq[0]=D[0]
           print "Mylq Normalized : T=%.2f(%.5f+%.5f*N+%.5f/N)" % (coeffs_Mylq[0],
                                                                   coeffs_Mylq[1],
                                                                   coeffs_Mylq[2],
                                                                   coeffs_Mylq[3])
           D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                       coeffs_Mylq[3])
         except:
           print "Impossible to fit for Mylq law : only %i elements" % len(D)
         try:
           coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
           coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
           coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
           coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
           coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
           coeffs_Mylq2[0]=D[0]
           print "Mylq 2nd order Normalized: T=%.2f(%.5f+%.5f*N+%.5f*N^2+%.5f/N)" % \
               (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
                coeffs_Mylq2[4])
         except:
           print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)
         if Curves:
           plt.xlabel("Number of Threads/work Items")
           plt.ylabel("Total Elapsed Time")
           Experience,=plt.plot(N,D,'ro')
           try:
             pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")
             pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
           except:
             print "Fit curves seem not to be available"
           plt.legend()
           plt.show()
       if __name__=='__main__':
         # Set defaults values
         # Alu can be CPU or GPU
         Alu='CPU'
         # GPU style can be Cuda (Nvidia implementation) or OpenCL
         GpuStyle='OpenCL'
         # Parallel distribution can be on Threads or Blocks
         ParaStyle='Threads'
         # Iterations is integer
         Iterations=1000000000
         # ThreadStart in first number of Threads to explore
         ThreadStart=1
         # ThreadEnd is last number of Threads to explore
         ThreadEnd=512
         # Redo is the times to redo the test to improve metrology
         Redo=1
         # OutMetrology is method for duration estimation : False is GPU inside
         OutMetrology=False
         # Curves is True to print the curves
         Curves=False
         try:
           opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:",["alu=","gpustyle=","parastyle=","iterations=","threadstart=","threadend=","redo="])
         except getopt.GetoptError:
           print '%s -o -a <CPU/GPU> -g <CUDA/OpenCL> -p <ParaStyle> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
           sys.exit(2)
         for opt, arg in opts:
           if opt == '-h':
             print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -g <CUDA/OpenCL> -p <Threads/Blocks> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
             sys.exit()
           elif opt == '-o':
             OutMetrology=True
           elif opt == '-c':
             Curves=True
           elif opt in ("-a", "--alu"):
             Alu = arg
           elif opt in ("-g", "--gpustyle"):
             GpuStyle = arg
           elif opt in ("-p", "--parastyle"):
             ParaStyle = arg
           elif opt in ("-i", "--iterations"):
             Iterations = numpy.uint32(arg)
           elif opt in ("-s", "--threadstart"):
             ThreadStart = int(arg)
           elif opt in ("-e", "--threadend"):
             ThreadEnd = int(arg)
           elif opt in ("-r", "--redo"):
             Redo = int(arg)
         if Alu=='CPU' and GpuStyle=='CUDA':
           print "Alu can't be CPU for CUDA, set Alu to GPU"
           Alu='GPU'
         if ParaStyle not in ('Blocks','Threads','Hybrid'):
           print "%s not exists, ParaStyle set as Threads !" % ParaStyle
           ParaStyle='Threads'
         print "Compute unit : %s" % Alu
         print "GpuStyle used : %s" % GpuStyle
         print "Parallel Style used : %s" % ParaStyle
         print "Iterations : %s" % Iterations
         print "Number of threads on start : %s" % ThreadStart
         print "Number of threads on end : %s" % ThreadEnd
         print "Number of redo : %s" % Redo
         print "Metrology done out of CPU/GPU : %r" % OutMetrology
         if GpuStyle=='CUDA':
           # For PyCUDA import
           import pycuda.driver as cuda
           import pycuda.gpuarray as gpuarray
           import pycuda.autoinit
           from pycuda.compiler import SourceModule
         if GpuStyle=='OpenCL':
           # For PyOpenCL import
           import pyopencl as cl
         average=numpy.array([]).astype(numpy.float32)
         median=numpy.array([]).astype(numpy.float32)
         stddev=numpy.array([]).astype(numpy.float32)
         ExploredThreads=numpy.array([]).astype(numpy.uint32)
         Threads=ThreadStart
         while Threads <= ThreadEnd:
           ExploredThreads=numpy.append(ExploredThreads,Threads)
           circle=numpy.zeros(Threads).astype(numpy.uint32)
           if OutMetrology:
             duration=numpy.array([]).astype(numpy.float32)
             for i in range(Redo):
               start=time.time()
               if GpuStyle=='CUDA':
                 try:
                   MetropolisCuda(circle,Iterations,1,Threads,ParaStyle)
                 except:
                   print "Problem with %i // computations on Cuda" % Threads
               elif GpuStyle=='OpenCL':
                 try:
                   MetropolisOpenCL(circle,Iterations,1,Threads,ParaStyle,Alu)
                 except:
                   print "Problem with %i // computations on OpenCL" % Threads
               duration=numpy.append(duration,time.time()-start)
             avg=numpy.mean(duration)
             med=numpy.median(duration)
             std=numpy.std(duration)
           else:
             if GpuStyle=='CUDA':
               avg,med,std=MetropolisCuda(circle,Iterations,Redo,Threads,ParaStyle)
             elif GpuStyle=='OpenCL':
               avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Threads,
                                            ParaStyle,Alu)
           print "avg,med,std",avg,med,std
           average=numpy.append(average,avg)
           median=numpy.append(median,med)
           stddev=numpy.append(stddev,std)
           #THREADS*=2
           numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,ThreadStart,ThreadEnd,Iterations,gethostname()),(ExploredThreads,average,median,stddev))
           Threads+=1
         FitAndPrint(ExploredThreads,median,Curves)

Centre Blaise Pascal » Bench4GPU

root / Pi / GPU / Pi-GPU.py-20130207 @ 50