/Pi/XPU/PiXPU.py - Bench4GPU - Forge du Centre Blaise Pascal

root / Pi / XPU / PiXPU.py

Historique | Voir | Annoter | Télécharger (31,71 ko)

       #!/usr/bin/env python3
+      #
       # Pi-by-MonteCarlo using PyCUDA/PyOpenCL
+      #
       # performs an estimation of Pi using Monte Carlo method
       # a large amount of iterations is divided and distributed to compute units
       # a lot of options are provided to perform scalabilty tests
+      #
       # use -h for complete set of options
+      #
       # CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@ens-lyon.fr>
+      #
       # Part of matrix programs from: https://forge.cbp.ens-lyon.fr/svn/bench4gpu/
       # Thanks to Andreas Klockner for PyCUDA:
       # http://mathema.tician.de/software/pycuda
       # Thanks to Andreas Klockner for PyOpenCL:
       # http://mathema.tician.de/software/pyopencl
+      #
       # 2013-01-01 : problems with launch timeout
       # http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
       # Option "Interactive" "0" in /etc/X11/xorg.conf
       # Common tools
       import numpy
       from numpy.random import randint as nprnd
       import sys
       import getopt
       import time
       import itertools
       from socket import gethostname
       class PenStacle:
           """Pentacle of Statistics from data"""
           Avg=0
           Med=0
           Std=0
           Min=0
           Max=0
           def __init__(self,Data):
               self.Avg=numpy.average(Data)
               self.Med=numpy.median(Data)
               self.Std=numpy.std(Data)
               self.Max=numpy.max(Data)
               self.Min=numpy.min(Data)
           def display(self):
               print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
       class Experience:
           """Metrology for experiences"""
           DeviceStyle=''
           DeviceId=0
           AvgD=0
           MedD=0
           StdD=0
           MinD=0
           MaxD=0
           AvgR=0
           MedR=0
           StdR=0
           MinR=0
           MaxR=0
           def __init__(self,DeviceStyle,DeviceId,Iterations):
               self.DeviceStyle=DeviceStyle
               self.DeviceId=DeviceId
               self.Iterations
           def Metrology(self,Data):
               Duration=PenStacle(Data)
               Rate=PenStacle(Iterations/Data)
               print("Duration %s" % Duration)
               print("Rate %s" % Rate)
       def DictionariesAPI():
           Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
           Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
           Test={True:1,False:0}
           return(Marsaglia,Computing,Test)
       # find prime factors of a number
       # Get for WWW :
       # http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
       def PrimeFactors(x):
           factorlist=numpy.array([]).astype('uint32')
           loop=2
           while loop<=x:
               if x%loop==0:
                   x/=loop
                   factorlist=numpy.append(factorlist,[loop])
               else:
                   loop+=1
           return factorlist
       # Try to find the best thread number in Hybrid approach (Blocks&Threads)
       # output is thread number
       def BestThreadsNumber(jobs):
           factors=PrimeFactors(jobs)
           matrix=numpy.append([factors],[factors[::-1]],axis=0)
           threads=1
           for factor in matrix.transpose().ravel():
               threads=threads*factor
               if threads*threads>jobs or threads>512:
                   break
           return(long(threads))
       # Predicted Amdahl Law (Reduced with s=1-p)
       def AmdahlR(N, T1, p):
           return (T1*(1-p+p/N))
       # Predicted Amdahl Law
       def Amdahl(N, T1, s, p):
           return (T1*(s+p/N))
       # Predicted Mylq Law with first order
       def Mylq(N, T1,s,c,p):
           return (T1*(s+p/N)+c*N)
       # Predicted Mylq Law with second order
       def Mylq2(N, T1,s,c1,c2,p):
           return (T1*(s+p/N)+c1*N+c2*N*N)
       def KernelCodeCuda():
           KERNEL_CODE_CUDA="""
       #define TCONG 0
       #define TSHR3 1
       #define TMWC 2
       #define TKISS 3
       #define TINT32 0
       #define TINT64 1
       #define TFP32 2
       #define TFP64 3
       #define IFTHEN 1
       // Marsaglia RNG very simple implementation
       #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
       #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
       #define MWC   (znew+wnew)
       #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
       #define CONG  (jcong=69069*jcong+1234567)
       #define KISS  ((MWC^CONG)+SHR3)
       #define MWCfp MWC * 2.328306435454494e-10f
       #define KISSfp KISS * 2.328306435454494e-10f
       #define SHR3fp SHR3 * 2.328306435454494e-10f
       #define CONGfp CONG * 2.328306435454494e-10f
       __device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
+      {
       #if TRNG == TCONG
          uint jcong=seed_z+work;
       #elif TRNG == TSHR3
          uint jsr=seed_w+work;
       #elif TRNG == TMWC
          uint z=seed_z+work;
          uint w=seed_w+work;
       #elif TRNG == TKISS
          uint jcong=seed_z+work;
          uint jsr=seed_w+work;
          uint z=seed_z-work;
          uint w=seed_w-work;
       #endif
          ulong total=0;
          for (ulong i=0;i<iterations;i++) {
       #if TYPE == TINT32
           #define THEONE 1073741824
           #if TRNG == TCONG
               uint x=CONG>>17 ;
               uint y=CONG>>17 ;
           #elif TRNG == TSHR3
               uint x=SHR3>>17 ;
               uint y=SHR3>>17 ;
           #elif TRNG == TMWC
               uint x=MWC>>17 ;
               uint y=MWC>>17 ;
           #elif TRNG == TKISS
               uint x=KISS>>17 ;
               uint y=KISS>>17 ;
           #endif
       #elif TYPE == TINT64
           #define THEONE 4611686018427387904
           #if TRNG == TCONG
               ulong x=(ulong)(CONG>>1) ;
               ulong y=(ulong)(CONG>>1) ;
           #elif TRNG == TSHR3
               ulong x=(ulong)(SHR3>>1) ;
               ulong y=(ulong)(SHR3>>1) ;
           #elif TRNG == TMWC
               ulong x=(ulong)(MWC>>1) ;
               ulong y=(ulong)(MWC>>1) ;
           #elif TRNG == TKISS
               ulong x=(ulong)(KISS>>1) ;
               ulong y=(ulong)(KISS>>1) ;
           #endif
       #elif TYPE == TFP32
           #define THEONE 1.0f
           #if TRNG == TCONG
               float x=CONGfp ;
               float y=CONGfp ;
           #elif TRNG == TSHR3
               float x=SHR3fp ;
               float y=SHR3fp ;
           #elif TRNG == TMWC
               float x=MWCfp ;
               float y=MWCfp ;
           #elif TRNG == TKISS
             float x=KISSfp ;
             float y=KISSfp ;
           #endif
       #elif TYPE == TFP64
           #define THEONE 1.0f
           #if TRNG == TCONG
               double x=(double)CONGfp ;
               double y=(double)CONGfp ;
           #elif TRNG == TSHR3
               double x=(double)SHR3fp ;
               double y=(double)SHR3fp ;
           #elif TRNG == TMWC
               double x=(double)MWCfp ;
               double y=(double)MWCfp ;
           #elif TRNG == TKISS
               double x=(double)KISSfp ;
               double y=(double)KISSfp ;
           #endif
       #endif
       #if TEST == IFTHEN
             if ((x*x+y*y) <=THEONE) {
                total+=1;
+            }
       #else
             ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
             total+=inside;
       #endif
+         }
          return(total);
+      }
       __global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
+      {
          ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
          s[blockIdx.x]=total;
          __syncthreads();
+      }
       __global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
+      {
          ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
          s[threadIdx.x]=total;
          __syncthreads();
+      }
       __global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
+      {
          ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
          s[blockDim.x*blockIdx.x+threadIdx.x]=total;
          __syncthreads();
+      }
       """
           return(KERNEL_CODE_CUDA)
       def KernelCodeOpenCL():
           KERNEL_CODE_OPENCL="""
       #define TCONG 0
       #define TSHR3 1
       #define TMWC 2
       #define TKISS 3
       #define TINT32 0
       #define TINT64 1
       #define TFP32 2
       #define TFP64 3
       #define IFTHEN 1
       // Marsaglia RNG very simple implementation
       #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
       #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
       #define MWC   (znew+wnew)
       #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
       #define CONG  (jcong=69069*jcong+1234567)
       #define KISS  ((MWC^CONG)+SHR3)
       #define MWCfp MWC * 2.328306435454494e-10f
       #define KISSfp KISS * 2.328306435454494e-10f
       #define CONGfp CONG * 2.328306435454494e-10f
       #define SHR3fp SHR3 * 2.328306435454494e-10f
       ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
+      {
       #if TRNG == TCONG
          uint jcong=seed_z+work;
       #elif TRNG == TSHR3
          uint jsr=seed_w+work;
       #elif TRNG == TMWC
          uint z=seed_z+work;
          uint w=seed_w+work;
       #elif TRNG == TKISS
          uint jcong=seed_z+work;
          uint jsr=seed_w+work;
          uint z=seed_z-work;
          uint w=seed_w-work;
       #endif
          ulong total=0;
          for (ulong i=0;i<iterations;i++) {
       #if TYPE == TINT32
           #define THEONE 1073741824
           #if TRNG == TCONG
               uint x=CONG>>17 ;
               uint y=CONG>>17 ;
           #elif TRNG == TSHR3
               uint x=SHR3>>17 ;
               uint y=SHR3>>17 ;
           #elif TRNG == TMWC
               uint x=MWC>>17 ;
               uint y=MWC>>17 ;
           #elif TRNG == TKISS
               uint x=KISS>>17 ;
               uint y=KISS>>17 ;
           #endif
       #elif TYPE == TINT64
           #define THEONE 4611686018427387904
           #if TRNG == TCONG
               ulong x=(ulong)(CONG>>1) ;
               ulong y=(ulong)(CONG>>1) ;
           #elif TRNG == TSHR3
               ulong x=(ulong)(SHR3>>1) ;
               ulong y=(ulong)(SHR3>>1) ;
           #elif TRNG == TMWC
               ulong x=(ulong)(MWC>>1) ;
               ulong y=(ulong)(MWC>>1) ;
           #elif TRNG == TKISS
               ulong x=(ulong)(KISS>>1) ;
               ulong y=(ulong)(KISS>>1) ;
           #endif
       #elif TYPE == TFP32
           #define THEONE 1.0f
           #if TRNG == TCONG
               float x=CONGfp ;
               float y=CONGfp ;
           #elif TRNG == TSHR3
               float x=SHR3fp ;
               float y=SHR3fp ;
           #elif TRNG == TMWC
               float x=MWCfp ;
               float y=MWCfp ;
           #elif TRNG == TKISS
             float x=KISSfp ;
             float y=KISSfp ;
           #endif
       #elif TYPE == TFP64
       #pragma OPENCL EXTENSION cl_khr_fp64: enable
           #define THEONE 1.0f
           #if TRNG == TCONG
               double x=(double)CONGfp ;
               double y=(double)CONGfp ;
           #elif TRNG == TSHR3
               double x=(double)SHR3fp ;
               double y=(double)SHR3fp ;
           #elif TRNG == TMWC
               double x=(double)MWCfp ;
               double y=(double)MWCfp ;
           #elif TRNG == TKISS
               double x=(double)KISSfp ;
               double y=(double)KISSfp ;
           #endif
       #endif
       #if TEST == IFTHEN
             if ((x*x+y*y) <= THEONE) {
                total+=1;
+            }
       #else
             ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
             total+=inside;
       #endif
+         }
          return(total);
+      }
       __kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
+      {
          ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
          barrier(CLK_GLOBAL_MEM_FENCE);
          s[get_global_id(0)]=total;
+      }
       __kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
+      {
          ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
          barrier(CLK_LOCAL_MEM_FENCE);
          s[get_local_id(0)]=total;
+      }
       __kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
+      {
          ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
          barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
          s[get_global_id(0)]=total;
+      }
       """
           return(KERNEL_CODE_OPENCL)
       def MetropolisCuda(InputCU):
           print("Inside ",InputCU)
           iterations=InputCU['Iterations']
           steps=InputCU['Steps']
           blocks=InputCU['Blocks']
           threads=InputCU['Threads']
           Device=InputCU['Device']
           RNG=InputCU['RNG']
           ValueType=InputCU['ValueType']
           TestType=InputCU['IfThen']
           Seeds=InputCU['Seeds']
           Marsaglia,Computing,Test=DictionariesAPI()
           try:
               # For PyCUDA import
               import pycuda.driver as cuda
               from pycuda.compiler import SourceModule
               cuda.init()
               for Id in range(cuda.Device.count()):
                   if Id==Device:
                       XPU=cuda.Device(Id)
                       print("GPU selected %s" % XPU.name())
               print
           except ImportError:
               print("Platform does not seem to support CUDA")
           circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
           circleCU = cuda.InOut(circle)
           #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
           #cuda.memcpy_htod(circleCU, circle)
           Context=XPU.make_context()
           try:
               mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
               #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
               # Needed to set the compiler via ccbin for CUDA9 implementation
               #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
           except:
               print("Compilation seems to break")
           MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
           MetropolisThreadsCU=mod.get_function("MainLoopThreads")
           MetropolisHybridCU=mod.get_function("MainLoopHybrid")
           MyDuration=numpy.zeros(steps)
           jobs=blocks*threads;
           iterationsCU=numpy.uint64(iterations/jobs)
           if iterations%jobs!=0:
               iterationsCU+=numpy.uint64(1)
           for i in range(steps):
               start_time=time.time()
               try:
                   MetropolisHybridCU(circleCU,
                                      numpy.uint64(iterationsCU),
                                      numpy.uint32(Seeds[0]),
                                      numpy.uint32(Seeds[1]),
                                      grid=(blocks,1),block=(threads,1,1))
               except:
                   print("Crash during CUDA call")
               elapsed = time.time()-start_time
               print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
               MyDuration[i]=elapsed
           OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
           print(OutputCU)
           Context.pop()
           Context.detach()
           return(OutputCU)
       def MetropolisOpenCL(InputCL):
           import pyopencl as cl
           iterations=InputCL['Iterations']
           steps=InputCL['Steps']
           blocks=InputCL['Blocks']
           threads=InputCL['Threads']
           Device=InputCL['Device']
           RNG=InputCL['RNG']
           ValueType=InputCL['ValueType']
           TestType=InputCL['IfThen']
           Seeds=InputCL['Seeds']
           Marsaglia,Computing,Test=DictionariesAPI()
           # Initialisation des variables en les CASTant correctement
           Id=0
           HasXPU=False
           for platform in cl.get_platforms():
               for device in platform.get_devices():
                   if Id==Device:
                       XPU=device
                       print("CPU/GPU selected: ",device.name.lstrip())
                       HasXPU=True
                   Id+=1
                   # print(Id)
           if HasXPU==False:
               print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
               sys.exit()
           # Je cree le contexte et la queue pour son execution
           try:
               ctx = cl.Context(devices=[XPU])
               queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
           except:
               print("Crash during context creation")
           # Je recupere les flag possibles pour les buffers
           mf = cl.mem_flags
           circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
           #    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
           circleCL = cl.Buffer(ctx, mf.WRITE_ONLY,circle.nbytes)
           MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))
           MyDuration=numpy.zeros(steps)
           jobs=blocks*threads;
           iterationsCL=numpy.uint64(iterations/jobs)
           if iterations%jobs!=0:
               iterationsCL+=1
           for i in range(steps):
               start_time=time.time()
               if threads == 1:
                   CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
                                                        circleCL,
                                                        numpy.uint64(iterationsCL),
                                                        numpy.uint32(Seeds[0]),
                                                        numpy.uint32(Seeds[1]))
               else:
                   CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                                        circleCL,
                                                        numpy.uint64(iterationsCL),
                                                        numpy.uint32(Seeds[0]),
                                                        numpy.uint32(Seeds[1]))
               CLLaunch.wait()
               cl.enqueue_copy(queue, circle, circleCL).wait()
               elapsed = time.time()-start_time
               print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
               # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
               # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
               # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
               MyDuration[i]=elapsed
               # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
               # MyPi[i]=numpy.median(AllPi)
               # print MyPi[i],numpy.std(AllPi),MyDuration[i]
           circleCL.release()
           OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
           # print(OutputCL)
           return(OutputCL)
       def FitAndPrint(N,D,Curves):
           from scipy.optimize import curve_fit
           import matplotlib.pyplot as plt
           try:
               coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
               D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
               coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
               coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
               coeffs_Amdahl[0]=D[0]
               print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
           except:
               print("Impossible to fit for Amdahl law : only %i elements" % len(D))
           try:
               coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
               D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
               coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
               coeffs_AmdahlR[0]=D[0]
               print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
           except:
               print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
           try:
               coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
               coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
               # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
               coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
               coeffs_Mylq[0]=D[0]
               print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                                       coeffs_Mylq[1],
                                                                       coeffs_Mylq[3],
                                                                       coeffs_Mylq[2]))
               D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                           coeffs_Mylq[3])
           except:
               print("Impossible to fit for Mylq law : only %i elements" % len(D))
           try:
               coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
               coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
               # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
               # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
               coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
               coeffs_Mylq2[0]=D[0]
               print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
           except:
               print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
           if Curves:
               plt.xlabel("Number of Threads/work Items")
               plt.ylabel("Total Elapsed Time")
               Experience,=plt.plot(N,D,'ro')
           try:
               pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")
               pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
           except:
               print("Fit curves seem not to be available")
           plt.legend()
           plt.show()
       if __name__=='__main__':
           # Set defaults values
           # GPU style can be Cuda (Nvidia implementation) or OpenCL
           GpuStyle='OpenCL'
           # Iterations is integer
           Iterations=1000000000
           # BlocksBlocks in first number of Blocks to explore
           BlocksBegin=1024
           # BlocksEnd is last number of Blocks to explore
           BlocksEnd=1024
           # BlocksStep is the step of Blocks to explore
           BlocksStep=1
           # ThreadsBlocks in first number of Blocks to explore
           ThreadsBegin=1
           # ThreadsEnd is last number of Blocks to explore
           ThreadsEnd=1
           # ThreadsStep is the step of Blocks to explore
           ThreadsStep=1
           # Redo is the times to redo the test to improve metrology
           Redo=1
           # OutMetrology is method for duration estimation : False is GPU inside
           OutMetrology=False
           Metrology='InMetro'
           # Curves is True to print the curves
           Curves=False
           # Fit is True to print the curves
           Fit=False
           # Inside based on If
           IfThen=False
           # Marsaglia RNG
           RNG='MWC'
           # Value type : INT32, INT64, FP32, FP64
           ValueType='FP32'
           # Seeds for RNG
           Seeds=110271,101008
           HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
           try:
               opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
           except getopt.GetoptError:
               print(HowToUse % sys.argv[0])
               sys.exit(2)
           # List of Devices
           Devices=[]
           Alu={}
           for opt, arg in opts:
               if opt == '-h':
                   print(HowToUse % sys.argv[0])
                   print("\nInformations about devices detected under OpenCL API:")
                   # For PyOpenCL import
                   try:
                       import pyopencl as cl
                       Id=0
                       for platform in cl.get_platforms():
                           for device in platform.get_devices():
                               #deviceType=cl.device_type.to_string(device.type)
                               deviceType="xPU"
                               print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                               Id=Id+1
                   except:
                       print("Your platform does not seem to support OpenCL")
                   print("\nInformations about devices detected under CUDA API:")
                   # For PyCUDA import
                   try:
                       import pycuda.driver as cuda
                       cuda.init()
                       for Id in range(cuda.Device.count()):
                           device=cuda.Device(Id)
                           print("Device #%i of type GPU : %s" % (Id,device.name()))
                       print
                   except:
                       print("Your platform does not seem to support CUDA")
                   sys.exit()
               elif opt == '-o':
                   OutMetrology=True
                   Metrology='OutMetro'
               elif opt == '-c':
                   Curves=True
               elif opt == '-k':
                   IfThen=True
               elif opt in ("-d", "--device"):
                   Devices.append(int(arg))
               elif opt in ("-g", "--gpustyle"):
                   GpuStyle = arg
               elif opt in ("-m", "--marsaglia"):
                   RNG = arg
               elif opt in ("-v", "--valuetype"):
                   ValueType = arg
               elif opt in ("-i", "--iterations"):
                   Iterations = numpy.uint64(arg)
               elif opt in ("-b", "--blocksbegin"):
                   BlocksBegin = int(arg)
                   BlocksEnd= BlocksBegin
               elif opt in ("-e", "--blocksend"):
                   BlocksEnd = int(arg)
               elif opt in ("-s", "--blocksstep"):
                   BlocksStep = int(arg)
               elif opt in ("-f", "--threadsfirst"):
                   ThreadsBegin = int(arg)
                   ThreadsEnd = ThreadsBegin
               elif opt in ("-l", "--threadslast"):
                   ThreadsEnd = int(arg)
               elif opt in ("-t", "--threadsstep"):
                   ThreadsStep = int(arg)
               elif opt in ("-r", "--redo"):
                   Redo = int(arg)
           # If no device has been specified, take the first one!
           if len(Devices)==0:
               Devices.append(0)
           print("Devices Identification : %s" % Devices)
           print("GpuStyle used : %s" % GpuStyle)
           print("Iterations : %s" % Iterations)
           print("Number of Blocks on begin : %s" % BlocksBegin)
           print("Number of Blocks on end : %s" % BlocksEnd)
           print("Step on Blocks : %s" % BlocksStep)
           print("Number of Threads on begin : %s" % ThreadsBegin)
           print("Number of Threads on end : %s" % ThreadsEnd)
           print("Step on Threads : %s" % ThreadsStep)
           print("Number of redo : %s" % Redo)
           print("Metrology done out of XPU : %r" % OutMetrology)
           print("Type of Marsaglia RNG used : %s" % RNG)
           print("Type of variable : %s" % ValueType)
           if GpuStyle=='CUDA':
               try:
                   # For PyCUDA import
                   import pycuda.driver as cuda
                   cuda.init()
                   for Id in range(cuda.Device.count()):
                       device=cuda.Device(Id)
                       print("Device #%i of type GPU : %s" % (Id,device.name()))
                       if Id in Devices:
                           Alu[Id]='GPU'
               except ImportError:
                   print("Platform does not seem to support CUDA")
           if GpuStyle=='OpenCL':
               try:
                   # For PyOpenCL import
                   import pyopencl as cl
                   Id=0
                   for platform in cl.get_platforms():
                       for device in platform.get_devices():
                           #deviceType=cl.device_type.to_string(device.type)
                           deviceType="xPU"
                           print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
                           if Id in Devices:
                           # Set the Alu as detected Device Type
                               Alu[Id]=deviceType
                           Id=Id+1
               except ImportError:
                   print("Platform does not seem to support OpenCL")
           #print(Devices,Alu)
           BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
           ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
           ExploredJobs=numpy.array([]).astype(numpy.uint32)
           ExploredBlocks=numpy.array([]).astype(numpy.uint32)
           ExploredThreads=numpy.array([]).astype(numpy.uint32)
           avgD=numpy.array([]).astype(numpy.float32)
           medD=numpy.array([]).astype(numpy.float32)
           stdD=numpy.array([]).astype(numpy.float32)
           minD=numpy.array([]).astype(numpy.float32)
           maxD=numpy.array([]).astype(numpy.float32)
           avgR=numpy.array([]).astype(numpy.float32)
           medR=numpy.array([]).astype(numpy.float32)
           stdR=numpy.array([]).astype(numpy.float32)
           minR=numpy.array([]).astype(numpy.float32)
           maxR=numpy.array([]).astype(numpy.float32)
           for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
               # print Blocks,Threads
               circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
               ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
               ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
               ExploredThreads=numpy.append(ExploredThreads,Threads)
               if OutMetrology:
                   DurationItem=numpy.array([]).astype(numpy.float32)
                   Duration=numpy.array([]).astype(numpy.float32)
                   Rate=numpy.array([]).astype(numpy.float32)
                   for i in range(Redo):
                       start=time.time()
                       if GpuStyle=='CUDA':
                           try:
                               InputCU={}
                               InputCU['Iterations']=Iterations
                               InputCU['Steps']=1
                               InputCU['Blocks']=Blocks
                               InputCU['Threads']=Threads
                               InputCU['Device']=Devices[0]
                               InputCU['RNG']=RNG
                               InputCU['Seeds']=Seeds
                               InputCU['ValueType']=ValueType
                               InputCU['IfThen']=IfThen
                               OutputCU=MetropolisCuda(InputCU)
                               Inside=OutputCU['Circle']
                               NewIterations=OutputCU['NewIterations']
                               Duration=OutputCU['Duration']
                           except:
                               print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                       elif GpuStyle=='OpenCL':
                           try:
                               InputCL={}
                               InputCL['Iterations']=Iterations
                               InputCL['Steps']=1
                               InputCL['Blocks']=Blocks
                               InputCL['Threads']=Threads
                               InputCL['Device']=Devices[0]
                               InputCL['RNG']=RNG
                               InputCL['Seeds']=Seeds
                               InputCL['ValueType']=ValueType
                               InputCL['IfThen']=IfThen
                               OutputCL=MetropolisOpenCL(InputCL)
                               Inside=OutputCL['Circle']
                               NewIterations=OutputCL['NewIterations']
                               Duration=OutputCL['Duration']
                           except:
                               print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                       Duration=numpy.append(Duration,time.time()-start)
                       Rate=numpy.append(Rate,NewIterations/Duration[-1])
               else:
                   if GpuStyle=='CUDA':
                       try:
                           InputCU={}
                           InputCU['Iterations']=Iterations
                           InputCU['Steps']=Redo
                           InputCU['Blocks']=Blocks
                           InputCU['Threads']=Threads
                           InputCU['Device']=Devices[0]
                           InputCU['RNG']=RNG
                           InputCU['Seeds']=Seeds
                           InputCU['ValueType']=ValueType
                           InputCU['IfThen']=IfThen
                           OutputCU=MetropolisCuda(InputCU)
                           Inside=OutputCU['Inside']
                           NewIterations=OutputCU['NewIterations']
                           Duration=OutputCU['Duration']
                           pycuda.context.pop()
                       except:
                           print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                   elif GpuStyle=='OpenCL':
                       try:
                           InputCL={}
                           InputCL['Iterations']=Iterations
                           InputCL['Steps']=Redo
                           InputCL['Blocks']=Blocks
                           InputCL['Threads']=Threads
                           InputCL['Device']=Devices[0]
                           InputCL['RNG']=RNG
                           InputCL['Seeds']=Seeds
                           InputCL['ValueType']=ValueType
                           InputCL['IfThen']=IfThen
                           OutputCL=MetropolisOpenCL(InputCL)
                           Inside=OutputCL['Inside']
                           NewIterations=OutputCL['NewIterations']
                           Duration=OutputCL['Duration']
                       except:
                           print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                   Rate=NewIterations/Duration[-1]
                   print("Itops %i\nLogItops %.2f " % (int(Rate),numpy.log(Rate)/numpy.log(10)))
                   print("Pi estimation %.8f" % (4./NewIterations*Inside))
               avgD=numpy.append(avgD,numpy.average(Duration))
               medD=numpy.append(medD,numpy.median(Duration))
               stdD=numpy.append(stdD,numpy.std(Duration))
               minD=numpy.append(minD,numpy.min(Duration))
               maxD=numpy.append(maxD,numpy.max(Duration))
               avgR=numpy.append(avgR,numpy.average(Rate))
               medR=numpy.append(medR,numpy.median(Rate))
               stdR=numpy.append(stdR,numpy.std(Rate))
               minR=numpy.append(minR,numpy.min(Rate))
               maxR=numpy.append(maxR,numpy.max(Rate))
               print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
               numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
               ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
               numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
           if Fit:
               FitAndPrint(ExploredJobs,median,Curves)

Centre Blaise Pascal » Bench4GPU

root / Pi / XPU / PiXPU.py