#!/usr/bin/env python3
#
# Ising2D model in serial mode (a single work-item on the GPU)
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>

import sys
import numpy
from PIL import Image
from math import exp
from random import random
import time
import getopt
import matplotlib.pyplot as plt
from numpy.random import randint as nprnd

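# The kernels below run the whole Metropolis loop inside one single
# work-item/thread: each spin flip depends on the previous lattice
# state, so this "One" version simply uses one GPU compute element as
# a serial processor. Random numbers come from Marsaglia's KISS family
# of generators (MWC, SHR3, CONG); the constant 2.328306435454494e-10f
# is roughly 1/2^32 and maps a 32-bit unsigned integer onto [0,1).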
KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC (znew+wnew)
#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG (jcong=69069*jcong+1234567)
#define KISS ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__kernel void MainLoopOne(__global char *s,float T,float J,float B,
                          uint sizex,uint sizey,
                          uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      // Draw a random site
      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*y];

      // Periodic boundaries: add the lattice size before subtracting 1
      // so the unsigned arithmetic cannot wrap around
      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y+sizey-1)%sizey)];
      int l=s[((x+sizex-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      // Metropolis: accept the flip if it lowers the energy, or with
      // probability exp(-DeltaE/T) otherwise
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
      s[x+sizex*y]=(char)(factor*p);
   }
   barrier(CLK_GLOBAL_MEM_FENCE);
}
"""

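# The CUDA kernel implements the same algorithm as the OpenCL one;
# only the qualifiers differ (__global__ entry point, no __global on
# the pointer, __syncthreads() instead of barrier()). Both sources are
# compiled at run time, by pycuda.compiler.SourceModule and
# cl.Program().build() respectively.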
KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation
#define znew ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC (znew+wnew)
#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG (jcong=69069*jcong+1234567)
#define KISS ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__global__ void MainLoopOne(char *s,float T,float J,float B,
                            uint sizex,uint sizey,
                            uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      // Draw a random site
      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*y];

      // Periodic boundaries, protected against unsigned wrap-around
      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y+sizey-1)%sizey)];
      int l=s[((x+sizex-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      // Metropolis acceptance test
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
      s[x+sizex*y]=(char)(factor*p);
   }
   __syncthreads();
}
"""

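# Spins are stored as int8 (-1/+1). ImageOutput linearly rescales the
# lattice to 0-255 and dumps it as a grayscale JPEG; the scaling
# assumes the lattice holds at least two distinct values.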
def ImageOutput(sigma,prefix):
    Max=sigma.max()
    Min=sigma.min()

    # Normalize values to 8-bit integers
    SigmaInt=(255*(sigma-Min)/(Max-Min)).astype('uint8')
    image=Image.fromarray(SigmaInt)
    image.save("%s.jpg" % prefix)

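# Reference CPU implementation. For the Ising Hamiltonian
# E = -J*sum_<ij>(s_i*s_j) - B*sum_i(s_i), flipping one spin s changes
# the energy by DeltaE = 2*s*(J*sum(neighbours)+B); this is the
# expression used below and in both GPU kernels.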
def Metropolis(sigma,T,J,B,iterations):
    start=time.time()

    SizeX,SizeY=sigma.shape

    for p in range(0,iterations):
        # Random site coordinates
        X,Y=numpy.random.randint(SizeX),numpy.random.randint(SizeY)

        DeltaE=2.*sigma[X,Y]*(J*(sigma[X,(Y+1)%SizeY]+
                                 sigma[X,(Y-1)%SizeY]+
                                 sigma[(X-1)%SizeX,Y]+
                                 sigma[(X+1)%SizeX,Y])+B)

        if DeltaE < 0. or random() < exp(-DeltaE/T):
            sigma[X,Y]=-sigma[X,Y]
    duration=time.time()-start
    return(duration)

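# CUDA path. cuda.InOut() copies sigma to the device before the launch
# and back afterwards; timing uses CUDA events, which report elapsed
# time in milliseconds, hence the 1e-3 factor to return seconds.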
def MetropolisCuda(sigma,T,J,B,iterations,ParaStyle,Alu,Device):

    # With PyCUDA autoinit, nothing else to initialize!

    sigmaCU=cuda.InOut(sigma)

    mod=SourceModule(KERNEL_CODE_CUDA)

    MetropolisCU=mod.get_function("MainLoopOne")

    start=cuda.Event()
    stop=cuda.Event()

    SizeX,SizeY=sigma.shape

    start.record()
    start.synchronize()
    MetropolisCU(sigmaCU,
                 numpy.float32(T),
                 numpy.float32(J),
                 numpy.float32(B),
                 numpy.uint32(SizeX),
                 numpy.uint32(SizeY),
                 numpy.uint32(iterations),
                 numpy.uint32(nprnd(2**31-1)),
                 numpy.uint32(nprnd(2**31-1)),
                 grid=(1,1),
                 block=(1,1,1))

    print("%s with %i %s done" % (Alu,1,ParaStyle))

    stop.record()
    stop.synchronize()

    #elapsed = stop.time_since(start)*1e-3
    elapsed=start.time_till(stop)*1e-3

    return(elapsed)

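# OpenCL path. The device is selected either by its Id (as listed at
# start-up) or, failing that, by matching the requested ALU type
# (CPU/GPU). The command queue is created with profiling enabled so
# the kernel execution time can be read back from the launch event.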
def MetropolisOpenCL(sigma,T,J,B,iterations,ParaStyle,Alu,Device):

    # Initialize the variables with the proper casts

    # Old manual way: detect a GPU device in the device list
    # for platform in cl.get_platforms():
    #     for device in platform.get_devices():
    #         if cl.device_type.to_string(device.type)=='GPU':
    #             GPU=device
    #             print("GPU detected: %s" % device.name)

    HasGPU=False
    Id=1
    # Primary device selection based on device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            if Id==Device and not HasGPU:
                GPU=device
                print("CPU/GPU selected: %s" % device.name)
                HasGPU=True
            Id=Id+1
    # Default device selection based on ALU type
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            if deviceType=="GPU" and Alu=="GPU" and not HasGPU:
                GPU=device
                print("GPU selected: %s" % device.name)
                HasGPU=True
            if deviceType=="CPU" and Alu=="CPU" and not HasGPU:
                GPU=device
                print("CPU selected: %s" % device.name)
                HasGPU=True

    # Create the context and the queue for execution
    # ctx = cl.create_some_context()
    ctx=cl.Context([GPU])
    queue=cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf=cl.mem_flags

    # Mind the cast! sigma is an int8, i.e. a char in OpenCL.
    # The kernel both reads and writes the lattice, so READ_WRITE is required.
    sigmaCL=cl.Buffer(ctx,mf.READ_WRITE|mf.COPY_HOST_PTR,hostbuf=sigma)

    MetropolisCL=cl.Program(ctx,KERNEL_CODE_OPENCL).build(
        options="-cl-mad-enable -cl-fast-relaxed-math")

    SizeX,SizeY=sigma.shape

    if ParaStyle=='Blocks':
        # Call the OpenCL kernel:
        # (1,) is the global work size (a single work-item)
        # None lets the implementation choose the local work size
        CLLaunch=MetropolisCL.MainLoopOne(queue,(1,),None,
                                          sigmaCL,
                                          numpy.float32(T),
                                          numpy.float32(J),
                                          numpy.float32(B),
                                          numpy.uint32(SizeX),
                                          numpy.uint32(SizeY),
                                          numpy.uint32(iterations),
                                          numpy.uint32(nprnd(2**31-1)),
                                          numpy.uint32(nprnd(2**31-1)))
        print("%s with %i %s done" % (Alu,1,ParaStyle))
    else:
        # In OpenCL, the global size must here match the local size
        CLLaunch=MetropolisCL.MainLoopOne(queue,(1,),(1,),
                                          sigmaCL,
                                          numpy.float32(T),
                                          numpy.float32(J),
                                          numpy.float32(B),
                                          numpy.uint32(SizeX),
                                          numpy.uint32(SizeY),
                                          numpy.uint32(iterations),
                                          numpy.uint32(nprnd(2**31-1)),
                                          numpy.uint32(nprnd(2**31-1)))
        print("%s with %i %s done" % (Alu,1,ParaStyle))

    CLLaunch.wait()
    cl.enqueue_copy(queue,sigma,sigmaCL).wait()
    # Profiling counters are in nanoseconds
    elapsed=1e-9*(CLLaunch.profile.end-CLLaunch.profile.start)
    sigmaCL.release()

    return(elapsed)

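# Observables. Magnetization is the mean spin. Energy is evaluated by
# slicing on the interior of the lattice only (the border is zeroed),
# and each interior bond is counted twice since no 1/2 factor is
# applied; the result is a rescaled per-site value, suitable for
# plotting against temperature.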
def Magnetization(sigma):
    return(numpy.sum(sigma)/(sigma.shape[0]*sigma.shape[1]*1.0))

def Energy(sigma,J):
    # Copy and cast
    E=numpy.copy(sigma).astype(numpy.float32)

    # Slice-based evaluation
    E[1:-1,1:-1]=-J*E[1:-1,1:-1]*(E[:-2,1:-1]+E[2:,1:-1]+
                                  E[1:-1,:-2]+E[1:-1,2:])

    # Clean up the border
    E[:,0]=0
    E[:,-1]=0
    E[0,:]=0
    E[-1,:]=0

    Energy=numpy.sum(E)

    return(Energy/(E.shape[0]*E.shape[1]*1.0))

def DisplayCurves(T,E,M,J,B):

    plt.xlabel("Temperature")
    plt.ylabel("Energy & Magnetization")

    plt.plot(T,E,label="Energy")
    plt.plot(T,M,label="Magnetization")

    plt.legend()
    plt.show()

if __name__=='__main__':

    # Set default values
    # Alu can be CPU or GPU
    Alu='CPU'
    # Id of the device: 0 will use the first one found!
    Device=0
    # GPU style can be CUDA (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Parallel distribution can be on Threads or Blocks
    ParaStyle='Blocks'
    # Coupling factor
    J=1.
    # Magnetic field
    B=0.
    # Size of the lattice
    Size=256
    # Default temperatures (start, end, step)
    Tmin=0.1
    Tmax=5
    Tstep=0.1
    # Default number of iterations
    Iterations=Size*Size
    # Curves is True to plot the curves
    Curves=False

    HelpMessage='%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks>' % sys.argv[0]

    try:
        opts,args=getopt.getopt(sys.argv[1:],"hcj:b:z:i:s:e:p:a:d:g:t:",["coupling=","magneticfield=","size=","iterations=","tempmin=","tempmax=","tempstep=","alu=","device=","gpustyle=","parastyle="])
    except getopt.GetoptError:
        print(HelpMessage)
        sys.exit(2)

    for opt,arg in opts:
        if opt == '-h':
            print(HelpMessage)
            sys.exit()
        elif opt == '-c':
            Curves=True
        elif opt in ("-j","--coupling"):
            J=float(arg)
        elif opt in ("-b","--magneticfield"):
            B=float(arg)
        elif opt in ("-s","--tempmin"):
            Tmin=float(arg)
        elif opt in ("-e","--tempmax"):
            Tmax=float(arg)
        elif opt in ("-p","--tempstep"):
            Tstep=float(arg)
        elif opt in ("-i","--iterations"):
            Iterations=int(arg)
        elif opt in ("-z","--size"):
            Size=int(arg)
        elif opt in ("-a","--alu"):
            Alu=arg
        elif opt in ("-d","--device"):
            Device=int(arg)
        elif opt in ("-g","--gpustyle"):
            GpuStyle=arg
        elif opt in ("-t","--parastyle"):
            ParaStyle=arg

    if Alu=='CPU' and GpuStyle=='CUDA':
        print("Alu can't be CPU for CUDA, set Alu to GPU")
        Alu='GPU'

    if ParaStyle not in ('Blocks','Threads','Hybrid'):
        print("%s does not exist, ParaStyle set to Blocks!" % ParaStyle)
        ParaStyle='Blocks'

    print("Compute unit : %s" % Alu)
    print("Device Identification : %s" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Parallel Style used : %s" % ParaStyle)
    print("Coupling Factor : %s" % J)
    print("Magnetic Field : %s" % B)
    print("Size of lattice : %s" % Size)
    print("Iterations : %s" % Iterations)
    print("Temperature on start : %s" % Tmin)
    print("Temperature on end : %s" % Tmax)
    print("Temperature step : %s" % Tstep)

    if GpuStyle=='CUDA':
        # PyCUDA imports
        import pycuda.driver as cuda
        import pycuda.gpuarray as gpuarray
        import pycuda.autoinit
        from pycuda.compiler import SourceModule

    if GpuStyle=='OpenCL':
        # PyOpenCL import
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                deviceType=cl.device_type.to_string(device.type)
                print("Device #%i of type %s : %s" % (Id,deviceType,device.name))
                Id=Id+1

    LAPIMAGE=False

    # Random initial lattice of -1/+1 spins
    sigmaIn=numpy.where(numpy.random.randn(Size,Size)>0,1,-1).astype(numpy.int8)

    ImageOutput(sigmaIn,"Ising2D_Serial_%i_Initial" % (Size))

    Trange=numpy.arange(Tmin,Tmax+Tstep,Tstep)

    E=[]
    M=[]

    for T in Trange:
        sigma=numpy.copy(sigmaIn)
        if GpuStyle=='CUDA':
            duration=MetropolisCuda(sigma,T,J,B,Iterations,ParaStyle,Alu,Device)
        else:
            duration=MetropolisOpenCL(sigma,T,J,B,Iterations,ParaStyle,Alu,Device)

        E=numpy.append(E,Energy(sigma,J))
        M=numpy.append(M,Magnetization(sigma))
        ImageOutput(sigma,"Ising2D_Serial_%i_%1.1f_Final" % (Size,T))

        print("Compute Time : %f" % (duration))
        print("Total Energy at Temperature %f : %f" % (T,E[-1]))
        print("Total Magnetization at Temperature %f : %f" % (T,M[-1]))

    if Curves:
        DisplayCurves(Trange,E,M,J,B)

    # Save output
    numpy.savez("Ising2D_Serial_%i_%.8i" % (Size,Iterations),(Trange,E,M))
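# Illustrative invocation: scan T from 0.1 to 5.0 in steps of 0.1 on
# the first OpenCL GPU, with one Metropolis trial per site of a
# 256x256 lattice, and plot the curves at the end:
#
#   python Ising2D-GPU-One.py -g OpenCL -a GPU -z 256 -i 65536 \
#          -s 0.1 -e 5.0 -p 0.1 -c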