/Ising/GPU/Ising2D-GPU.py-130527 - Bench4GPU - Forge du Centre Blaise Pascal

root / Ising / GPU / Ising2D-GPU.py-130527 @ 96

Historique | Voir | Annoter | Télécharger (16,78 ko)

       #!/usr/bin/env python
+      #
       # Ising2D model in serial mode
+      #
       # CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
       import sys
       import numpy
       import math
       from PIL import Image
       from math import exp
       from random import random
       import time
       import getopt
       import matplotlib.pyplot as plt
       from numpy.random import randint as nprnd
       KERNEL_CODE_OPENCL="""
       // Marsaglia RNG very simple implementation
       #define znew  ((z=36969*(z&65535)+(z>>16))<<16)
       #define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
       #define MWC   (znew+wnew)
       #define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
       #define CONG  (jcong=69069*jcong+1234567)
       #define KISS  ((MWC^CONG)+SHR3)
       #define MWCfp MWC * 2.328306435454494e-10f
       #define KISSfp KISS * 2.328306435454494e-10f
       __kernel void MainLoopOne(__global char *s,float T,float J,
                                 uint sizex,uint sizey,
                                 uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z;
          uint w=seed_w;
          for (uint i=0;i<iterations;i++) {
             uint x=(uint)(MWC%sizex) ;
             uint y=(uint)(MWC%sizey) ;
             int p=s[x+sizex*y];
             int d=s[x+sizex*((y+1)%sizey)];
             int u=s[x+sizex*((y-1)%sizey)];
             int l=s[((x-1)%sizex)+sizex*y];
             int r=s[((x+1)%sizex)+sizex*y];
             float DeltaE=2.0f*J*p*(u+d+l+r);
             int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
             s[x%sizex+sizex*(y%sizey)] = (char)factor*p;
+         }
          barrier(CLK_GLOBAL_MEM_FENCE);
+      }
       __kernel void MainLoopGlobal(__global char *s,__global float *T,float J,
                                    uint sizex,uint sizey,
                                    uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(get_global_id(0)+1);
          uint w=seed_w/(get_global_id(0)+1);
          float t;
          uint ind=get_global_id(0);
          t=T[get_global_id(0)];
          for (uint i=0;i<iterations;i++) {
             uint x=(uint)(MWC%sizex) ;
             uint y=(uint)(MWC%sizey) ;
             int p=s[x+sizex*(y+sizey*ind)];
             int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
             int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
             int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
             int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
             float DeltaE=2.0f*J*p*(u+d+l+r);
             int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
             s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
+         }
          barrier(CLK_GLOBAL_MEM_FENCE);
+      }
       __kernel void MainLoopLocal(__global char *s,__global float *T,float J,
                                   uint sizex,uint sizey,
                                   uint iterations,uint seed_w,uint seed_z)
+      {
          uint z=seed_z/(get_local_id(0)+1);
          uint w=seed_w/(get_local_id(0)+1);
          //float t=T[get_local_id(0)+get_global_id(0)*get_local_size(0)];
          //uint ind=get_local_id(0)+get_global_id(0)*get_local_size(0);
          float t=T[get_local_id(0)];
          uint ind=get_local_id(0);
          for (uint i=0;i<iterations;i++) {
             uint x=(uint)(MWC%sizex) ;
             uint y=(uint)(MWC%sizey) ;
             int p=s[x+sizex*(y+sizey*ind)];
             int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
             int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
             int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
             int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
             float DeltaE=2.0f*J*p*(u+d+l+r);
             int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
             s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
+         }
          barrier(CLK_LOCAL_MEM_FENCE);
          barrier(CLK_GLOBAL_MEM_FENCE);
+      }
       """
       def ImageOutput(sigma,prefix):
           Max=sigma.max()
           Min=sigma.min()
           # Normalize value as 8bits Integer
           SigmaInt=(255*(sigma-Min)/(Max-Min)).astype('uint8')
           image = Image.fromarray(SigmaInt)
           image.save("%s.jpg" % prefix)
       def Metropolis(sigma,J,B,T,iterations):
           start=time.time()
           SizeX,SizeY=sigma.shape
           for p in xrange(0,iterations):
               # Random access coordonate
               X,Y=numpy.random.randint(SizeX),numpy.random.randint(SizeY)
               DeltaE=J*sigma[X,Y]*(2*(sigma[X,(Y+1)%SizeY]+
                                       sigma[X,(Y-1)%SizeY]+
                                       sigma[(X-1)%SizeX,Y]+
                                       sigma[(X+1)%SizeX,Y])+B)
               if DeltaE < 0. or random() < exp(-DeltaE/T):
                   sigma[X,Y]=-sigma[X,Y]
           duration=time.time()-start
           return(duration)
       def MetropolisOpenCL(sigma,J,B,T,iterations,jobs,ParaStyle,Alu,Device):
           # Initialisation des variables en les CASTant correctement
           # Je detecte un peripherique GPU dans la liste des peripheriques
           # for platform in cl.get_platforms():
           #     for device in platform.get_devices():
           #             if cl.device_type.to_string(device.type)=='GPU':
           #                     GPU=device
           #print "GPU detected: ",device.name
           HasGPU=False
           Id=1
           # Device selection based on choice (default is GPU)
           for platform in cl.get_platforms():
               for device in platform.get_devices():
                   if not HasGPU:
                       deviceType=cl.device_type.to_string(device.type)
                       if deviceType=="GPU" and Alu=="GPU" and Id==Device:
                           GPU=device
                           print "GPU selected: ",device.name
                           HasGPU=True
                       if deviceType=="CPU" and Alu=="CPU":
                           GPU=device
                           print "CPU selected: ",device.name
                           HasGPU=True
                   Id=Id+1
           # Je cree le contexte et la queue pour son execution
           # ctx = cl.create_some_context()
           ctx = cl.Context([GPU])
           queue = cl.CommandQueue(ctx,
                                   properties=cl.command_queue_properties.PROFILING_ENABLE)
           # Je recupere les flag possibles pour les buffers
           mf = cl.mem_flags
           print sigma,sigma.shape
           # Attention au CAST ! C'est un int8 soit un char en OpenCL !
           sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=sigma)
           MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
               options = "-cl-mad-enable -cl-fast-relaxed-math")
           SizeX,SizeY=sigma.shape
           if ParaStyle=='Blocks':
               # Call OpenCL kernel
               # (1,) is Global work size (only 1 work size)
               # (1,) is local work size
               CLLaunch=MetropolisCL.MainLoopOne(queue,(jobs,),None,
                                                 sigmaCL,
                                                 numpy.float32(T),
                                                 numpy.float32(J),
                                                 numpy.uint32(SizeX),
                                                 numpy.uint32(SizeY),
                                                 numpy.uint32(iterations),
                                                 numpy.uint32(nprnd(2**31-1)),
                                                 numpy.uint32(nprnd(2**31-1)))
               print "%s with %i %s done" % (Alu,jobs,ParaStyle)
           else:
               # en OpenCL, necessaire de mettre un Global_id identique au local_id
               CLLaunch=MetropolisCL.MainLoopOne(queue,(jobs,),(jobs,),
                                                 sigmaCL,
                                                 numpy.float32(T),
                                                 numpy.float32(J),
                                                 numpy.uint32(SizeX),
                                                 numpy.uint32(SizeY),
                                                 numpy.uint32(iterations),
                                                 numpy.uint32(nprnd(2**31-1)),
                                                 numpy.uint32(nprnd(2**31-1)))
               print "%s with %i %s done" % (Alu,jobs,ParaStyle)
           CLLaunch.wait()
           cl.enqueue_copy(queue, sigma, sigmaCL).wait()
           elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
           sigmaCL.release()
           return(elapsed)
       def MetropolisAllOpenCL(sigmaDict,J,B,TList,iterations,jobs,ParaStyle,Alu,Device):
           # sigmaDict & Tlist are NOT respectively array & float
           # sigmaDict : dict of array for each temperatoire
           # TList : list of temperatures
           # Initialisation des variables en les CASTant correctement
           # Je detecte un peripherique GPU dans la liste des peripheriques
           # for platform in cl.get_platforms():
           #     for device in platform.get_devices():
           #             if cl.device_type.to_string(device.type)=='GPU':
           #                     GPU=device
           #print "GPU detected: ",device.name
           HasGPU=False
           Id=1
           # Device selection based on choice (default is GPU)
           for platform in cl.get_platforms():
               for device in platform.get_devices():
                   if not HasGPU:
                       deviceType=cl.device_type.to_string(device.type)
                       if deviceType=="GPU" and Alu=="GPU" and Id==Device:
                           GPU=device
                           print "GPU selected: ",device.name
                           HasGPU=True
                       if deviceType=="CPU" and Alu=="CPU":
                           GPU=device
                           print "CPU selected: ",device.name
                           HasGPU=True
                   Id=Id+1
           # Je cree le contexte et la queue pour son execution
           # ctx = cl.create_some_context()
           ctx = cl.Context([GPU])
           queue = cl.CommandQueue(ctx,
                                   properties=cl.command_queue_properties.PROFILING_ENABLE)
           # Je recupere les flag possibles pour les buffers
           mf = cl.mem_flags
           # Concatenate all sigma in single array
           sigma=numpy.copy(sigmaDict[TList[0]])
           for T in TList[1:]:
               sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=1)
           print sigma,sigma.shape
           sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=sigma)
           TCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=TList)
           MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
               options = "-cl-mad-enable -cl-fast-relaxed-math")
           SizeX,SizeY=sigmaDict[TList[0]].shape
           if ParaStyle=='Blocks':
               # Call OpenCL kernel
               # (1,) is Global work size (only 1 work size)
               # (1,) is local work size
               # SeedZCL is lattice translated in CL format
               # SeedWCL is lattice translated in CL format
               # step is number of iterations
               CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                                    sigmaCL,
                                                    TCL,
                                                    numpy.float32(J),
                                                    numpy.uint32(SizeX),
                                                    numpy.uint32(SizeY),
                                                    numpy.uint32(iterations),
                                                    numpy.uint32(nprnd(2**31-1)),
                                                    numpy.uint32(nprnd(2**31-1)))
               print "%s with %i %s done" % (Alu,jobs,ParaStyle)
           else:
               blocks=int(math.sqrt(jobs))
               # en OpenCL, necessaire de mettre un Global_id identique au local_id
               CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(2,),
                                                   sigmaCL,
                                                   TCL,
                                                   numpy.float32(J),
                                                   numpy.uint32(SizeX),
                                                   numpy.uint32(SizeY),
                                                   numpy.uint32(iterations),
                                                   numpy.uint32(nprnd(2**31-1)),
                                                   numpy.uint32(nprnd(2**31-1)))
               print "%s with %i %s done" % (Alu,jobs,ParaStyle)
           CLLaunch.wait()
           cl.enqueue_copy(queue, sigma, sigmaCL).wait()
           elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
           sigmaCL.release()
           results=numpy.split(sigma,len(TList),axis=1)
           for T in TList:
               sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])
           return(elapsed)
       def Magnetization(sigma,M):
           return(numpy.sum(sigma)/(sigma.shape[0]*sigma.shape[1]*1.0))
       def Energy(sigma,J):
           # Copier et caster
           E=numpy.copy(sigma).astype(numpy.float32)
           # Appel par slice
           E[1:-1,1:-1]=-J*E[1:-1,1:-1]*(E[:-2,1:-1]+E[2:,1:-1]+
                                         E[1:-1,:-2]+E[1:-1,2:])
           # Bien nettoyer la peripherie
           E[:,0]=0
           E[:,-1]=0
           E[0,:]=0
           E[-1,:]=0
           Energy=numpy.sum(E)
           return(Energy/(E.shape[0]*E.shape[1]*1.0))
       def DisplayCurves(T,E,M,J,B):
           plt.xlabel("Temperature")
           plt.ylabel("Energy")
           Experience,=plt.plot(T,E,label="Energy")
           plt.legend()
           plt.show()
       if __name__=='__main__':
           # Set defaults values
           # Alu can be CPU or GPU
           Alu='CPU'
           # Id of GPU
           Device=1
           # GPU style can be Cuda (Nvidia implementation) or OpenCL
           GpuStyle='OpenCL'
           # Parallel distribution can be on Threads or Blocks
           ParaStyle='Blocks'
           # Coupling factor
           J=1.
           # Magnetic Field
           B=0.
           # Size of Lattice
           Size=256
           # Default Temperatures (start, end, step)
           Tmin=0.1
           Tmax=5
           Tstep=0.1
           # Default Number of Iterations
           Iterations=Size*Size
           # Curves is True to print the curves
           Curves=False
           try:
               opts, args = getopt.getopt(sys.argv[1:],"hcj:b:z:i:s:e:p:a:d:g:t:",["coupling=","magneticfield=","size=","iterations=","tempstart=","tempend=","tempstep=","alu=","gpustyle=","parastyle="])
           except getopt.GetoptError:
               print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Blocks> -t <ParaStyle>' % sys.argv[0]
               sys.exit(2)
           for opt, arg in opts:
               if opt == '-h':
                   print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Blocks> -t <ParaStyle>' % sys.argv[0]
                   sys.exit()
               elif opt == '-c':
                   Curves=True
               elif opt in ("-j", "--coupling"):
                   J = float(arg)
               elif opt in ("-b", "--magneticfield"):
                   B = float(arg)
               elif opt in ("-s", "--tempmin"):
                   Tmin = float(arg)
               elif opt in ("-e", "--tempmax"):
                   Tmax = float(arg)
               elif opt in ("-p", "--tempstep"):
                   Tstep = float(arg)
               elif opt in ("-i", "--iterations"):
                   Iterations = int(arg)
               elif opt in ("-z", "--size"):
                   Size = int(arg)
               elif opt in ("-a", "--alu"):
                   Alu = arg
               elif opt in ("-d", "--device"):
                   Device = int(arg)
               elif opt in ("-g", "--gpustyle"):
                   GpuStyle = arg
               elif opt in ("-t", "--parastyle"):
                   ParaStyle = arg
           if Alu=='CPU' and GpuStyle=='CUDA':
               print "Alu can't be CPU for CUDA, set Alu to GPU"
               Alu='GPU'
           if ParaStyle not in ('Blocks','Threads','Hybrid'):
               print "%s not exists, ParaStyle set as Threads !" % ParaStyle
               ParaStyle='Blocks'
           print "Compute unit : %s" % Alu
           print "Device Identification : %s" % Device
           print "GpuStyle used : %s" % GpuStyle
           print "Parallel Style used : %s" % ParaStyle
           print "Coupling Factor : %s" % J
           print "Magnetic Field :  %s" % B
           print "Size of lattice : %s" % Size
           print "Iterations : %s" % Iterations
           print "Temperature on start : %s" % Tmin
           print "Temperature on end : %s" % Tmax
           print "Temperature step : %s" % Tstep
           if GpuStyle=='CUDA':
               # For PyCUDA import
               import pycuda.driver as cuda
               import pycuda.gpuarray as gpuarray
               import pycuda.autoinit
               from pycuda.compiler import SourceModule
           if GpuStyle=='OpenCL':
               # For PyOpenCL import
               import pyopencl as cl
               Id=1
               for platform in cl.get_platforms():
                   for device in platform.get_devices():
                       deviceType=cl.device_type.to_string(device.type)
                       print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
                       Id=Id+1
           LAPIMAGE=False
           sigmaIn=numpy.where(numpy.random.randn(Size,Size)>0,1,-1).astype(numpy.int8)
           ImageOutput(sigmaIn,"Ising2D_Serial_%i_Initial" % (Size))
           # La temperature est passee comme parametre, attention au CAST !
           Trange=numpy.arange(Tmin,Tmax+Tstep,Tstep).astype(numpy.float32)
           E=[]
           M=[]
           print Trange,Trange.shape
           sigma={}
           for T in Trange:
               sigma[T]=numpy.copy(sigmaIn)
           # For GPU, all process are launched
           #MetropolisAllOpenCL(sigma,J,B,Trange,Iterations,len(Trange),
           #                    ParaStyle,Alu,Device)
           MetropolisAllOpenCL(sigma,J,B,Trange,Iterations,len(Trange),
                               ParaStyle,Alu,Device)
           for T in Trange:
               ImageOutput(sigma[T],"Ising2D_Serial_%i_%1.1f_Final" % (Size,T))
           # if Curves:
           #     DisplayCurves(Trange,E,M,J,B)
           # # Save output
           # numpy.savez("Ising2D_Serial_%i_%.8i" % (Size,Iterations),(Trange,E,M))

Centre Blaise Pascal » Bench4GPU

root / Ising / GPU / Ising2D-GPU.py-130527 @ 96