#!/usr/bin/env python3

import numpy as np
import pyopencl as cl

# Piling up 16 arithmetical functions: the chain composes to (almost) the
# identity on [0,1), so each call only adds floating point work.
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
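# Optional sanity check (not executed here): for float32 inputs in [0,1) the
# pile should give back its argument up to rounding error, e.g.
#   x = np.random.rand(8).astype(np.float32)
#   assert np.allclose(MySillyFunction(x), x, atol=1e-3)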

# Native operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np,Calls):
    a=a_np.copy()
    b=b_np.copy()
    for i in range(Calls):
        a=MySillyFunction(a)
        b=MySillyFunction(b)

    return(a+b)

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b)
    {
      // const int i = threadIdx.x;
      const int i = blockIdx.x;
      dest[i] = a[i] + b[i];
    }
    """)

    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    # drv.In/drv.Out copy the host arrays to/from the device around the launch
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)

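# Note: CUDAAddition launches one single-thread block per element
# (block=(1,1,1), grid=(N,1)), which works for any N but leaves most of the
# GPU idle; CUDASillyAddition below adds a Threads parameter to pack several
# threads per block when the vector size is a multiple of it.
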
# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np,Calls,Threads):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""
    __device__ float MySillyFunction(float x)
    {
      return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
    }

    __global__ void sillysum(float *dest, float *a, float *b, int Calls)
    {
      const int i = blockDim.x*blockIdx.x+threadIdx.x;
      float ai=a[i];
      float bi=b[i];

      for (int c=0;c<Calls;c++)
      {
        ai=MySillyFunction(ai);
        bi=MySillyFunction(bi);
      }

      dest[i] = ai + bi;
    }
    """)
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    sillysum = mod.get_function("sillysum")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    res_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        print("Warning : %i is not a multiple of %i, falling back to 1 thread per block..." % (Size,Threads))
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    return(res_np)

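# Both OpenCL variants below follow the same sequence: create a context and a
# command queue, copy the operands into device buffers, build the kernel
# source, launch one work-item per element, copy the result back, then release
# the buffers. Each step is timed separately with time.time().
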
# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
    __kernel void sum(
        __global const float *a_g, __global const float *b_g, __global float *res_g)
    {
      int gid = get_global_id(0);
      res_g[gid] = a_g[gid] + b_g[gid];
    }
    """).build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined (one work-item per element)
    # Note: this only enqueues the kernel, so the timing below measures the
    # enqueue rather than the execution itself.
    knl(queue, a_np.shape, None, a_g, b_g, res_g)
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)

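# OpenCLSillyAddition differs from OpenCLAddition in two ways: the device is
# picked explicitly from the flat platform/device enumeration (using the
# module-level Device index set in the __main__ section below) instead of
# cl.create_some_context(), and profiling is enabled on the command queue.
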
# OpenCL Silly complete operation
def OpenCLSillyAddition(a_np,b_np,Calls):

    # Select the OpenCL device whose flat index matches the global Device
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
    # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernels under OpenCL
    prg = cl.Program(ctx, """

    float MySillyFunction(float x)
    {
      return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
    }

    __kernel void sillysum(
        __global const float *a_g, __global const float *b_g, __global float *res_g, int Calls)
    {
      int gid = get_global_id(0);
      float ai=a_g[gid];
      float bi=b_g[gid];

      for (int c=0;c<Calls;c++)
      {
        ai=MySillyFunction(ai);
        bi=MySillyFunction(bi);
      }

      res_g[gid] = ai + bi;
    }

    __kernel void sum(
        __global const float *a_g, __global const float *b_g, __global float *res_g)
    {
      int gid = get_global_id(0);
      res_g[gid] = a_g[gid] + b_g[gid];
    }
    """).build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined, then wait for completion
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g, np.uint32(Calls))
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)

# sys and time are imported at module level here (before the __main__ block),
# so the functions above can use them whether the script is run or imported.
import sys
import time

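# Example invocations (assuming PyOpenCL and/or PyCUDA are installed):
#   python3 MySteps_6.py -h                               (list detected devices)
#   python3 MySteps_6.py -g OpenCL -s 1048576 -d 0 -c 100
#   python3 MySteps_6.py -g CUDA -s 1048576 -c 100 -t 256
# Add -n to skip the (slow) serial Numpy reference computation.
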
if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0
    Calls=1
    Threads=1
    Serial=True

    import getopt

    HowToUse='%s -n -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId> -c <SillyCalls> -t <Threads>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hng:s:d:c:t:",["gpustyle=","size=","device=","calls=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}
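    # Devices stays empty here, so the Alu entries set below are never filled;
    # both are kept as placeholders for multi-device selection.
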
    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-t", "--threads"):
            Threads=int(arg)
        elif opt in ("-c", "--calls"):
            Calls=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt == '-n':
            Serial = False

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of vector : %i" % SIZE)
    print("Number of silly calls : %i" % Calls)
    print("Number of Threads : %i" % Threads)
    print("Serial compute : %i" % Serial)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    if Serial:
        TimeIn=time.time()
        res_np=NativeSillyAddition(a_np,b_np,Calls)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)

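    # Rates are expressed in processed elements per second; each GPU rate is
    # later divided by NativeRate to get a speed-up ratio over the Numpy run.
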
    # OpenCL Implementation
    if GpuStyle=='OpenCL' or GpuStyle=='all':
        TimeIn=time.time()
        # res_cl=OpenCLAddition(a_np,b_np)
        res_cl=OpenCLSillyAddition(a_np,b_np,Calls)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        # Check on OpenCL with Numpy:
        if Serial:
            print(res_cl - res_np)
            print(np.linalg.norm(res_cl - res_np))
            try:
                assert np.allclose(res_np, res_cl)
            except:
                print("Results between Native & OpenCL seem to be too different!")

            print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))

    # CUDA Implementation
    if GpuStyle=='CUDA' or GpuStyle=='all':
        TimeIn=time.time()
        res_cuda=CUDASillyAddition(a_np,b_np,Calls,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        # Check on CUDA with Numpy:
        if Serial:
            print(res_cuda - res_np)
            print(np.linalg.norm(res_cuda - res_np))
            try:
                assert np.allclose(res_np, res_cuda)
            except:
                print("Results between Native & CUDA seem to be too different!")

            print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))