#!/usr/bin/env python
#
# Ising2D model on GPU (OpenCL & CUDA)
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>

import sys
import numpy
import math
from PIL import Image
from math import exp
from random import random
import time
import getopt
import matplotlib.pyplot as plt
from numpy.random import randint as nprnd

KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

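// Note: 2.328306435454494e-10f is roughly 1/2^32, so MWCfp (and KISSfp) turn
// the 32-bit unsigned output of the generator into a float in [0,1), e.g.
//
//    float u = MWCfp;   // one (approximately uniform) pseudo-random draw in [0,1)
//
// Each expansion of MWC advances the per-work-item state (z,w), so successive
// uses of MWCfp in the loops below give fresh draws.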

__kernel void MainLoopOne(__global char *s,float T,float J,float B,
                          uint sizex,uint sizey,
                          uint iterations,uint seed_w,uint seed_z)

{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*y];

      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y-1)%sizey)];
      int l=s[((x-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
      s[x%sizex+sizex*(y%sizey)] = (char)factor*p;
   }
   barrier(CLK_GLOBAL_MEM_FENCE);
}
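// In the three kernels below every work-item owns one complete lattice:
// ind selects the lattice (one per temperature) and a spin is addressed as
// s[x + sizex*(y + sizey*ind)], i.e. the buffer is treated as consecutive
// blocks of sizex*sizey spins.  For example, with sizex=sizey=256 the spin
// (x=3,y=2) of lattice ind=1 sits at offset 3 + 256*(2 + 256*1) = 66051.
// The kernels differ only in how ind is derived from the work-item ids.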

__kernel void MainLoopGlobal(__global char *s,__global float *T,float J,float B,
                             uint sizex,uint sizey,
                             uint iterations,uint seed_w,uint seed_z)

{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);
   float t=T[get_global_id(0)];
   uint ind=get_global_id(0);

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;

   }

   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopHybrid(__global char *s,__global float *T,float J,float B,
                             uint sizex,uint sizey,
                             uint iterations,uint seed_w,uint seed_z)

{
   // Linear work-item index across all work-groups (equivalent to get_global_id(0))
   uint z=seed_z/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);
   uint w=seed_w/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);
   float t=T[get_group_id(0)*get_local_size(0)+get_local_id(0)];
   uint ind=get_group_id(0)*get_local_size(0)+get_local_id(0);

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;

   }

   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global char *s,__global float *T,float J,float B,
                            uint sizex,uint sizey,
                            uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);
   float t=T[get_local_id(0)];
   uint ind=get_local_id(0);

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
   }

   barrier(CLK_LOCAL_MEM_FENCE);
   barrier(CLK_GLOBAL_MEM_FENCE);

}
"""

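# The host code below picks one of the kernels according to ParaStyle:
#   'Blocks'  -> MainLoopGlobal (ind = get_global_id(0) / blockIdx.x)
#   'Threads' -> MainLoopLocal  (ind = get_local_id(0)  / threadIdx.x)
#   'Hybrid'  -> MainLoopHybrid (ind = work-group/block offset + local/thread id)
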
KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

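// Same Marsaglia macros as in the OpenCL source above; the scale factor is
// again roughly 1/2^32, mapping the 32-bit draws onto floats in [0,1).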

__global__ void MainLoopOne(char *s,float T,float J,float B,
                            uint sizex,uint sizey,
                            uint iterations,uint seed_w,uint seed_z)

{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*y];

      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y-1)%sizey)];
      int l=s[((x-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
      s[x%sizex+sizex*(y%sizey)] = (char)factor*p;
   }
   __syncthreads();

}

__global__ void MainLoopGlobal(char *s,float *T,float J,float B,
                               uint sizex,uint sizey,
                               uint iterations,uint seed_w,uint seed_z)

{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);
   float t=T[blockIdx.x];
   uint ind=blockIdx.x;

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;

   }
   __syncthreads();

}

__global__ void MainLoopHybrid(char *s,float *T,float J,float B,
                               uint sizex,uint sizey,
                               uint iterations,uint seed_w,uint seed_z)

{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
   float t=T[blockDim.x*blockIdx.x+threadIdx.x];
   uint ind=blockDim.x*blockIdx.x+threadIdx.x;

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;

   }
   __syncthreads();

}

__global__ void MainLoopLocal(char *s,float *T,float J,float B,
                              uint sizex,uint sizey,
                              uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);
   float t=T[threadIdx.x];
   uint ind=threadIdx.x;

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex) ;
      uint y=(uint)(MWC%sizey) ;

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
   }
   __syncthreads();

}
"""

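# Note: MainLoopOne (single lattice, scalar temperature) is compiled with the
# rest of each kernel source but is never launched by the host code below;
# only the Global/Local/Hybrid variants are used.
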
# Find prime factors of a number
# Got from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            x/=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist

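# For example, PrimeFactors(12) returns the factors [2, 2, 3].
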
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
    factors=PrimeFactors(jobs)
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs:
            break
    return(long(threads))

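# For example, with jobs=50 the interleaved factor list is [2,5,5,5,5,2];
# threads grows 1 -> 2 -> 10, and since 10*10 > 50 the loop stops, so
# BestThreadsNumber(50) returns 10 (i.e. 5 blocks of 10 threads in Hybrid mode).
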
def ImageOutput(sigma,prefix):
    Max=sigma.max()
    Min=sigma.min()

    # Normalize values to 8-bit integers
    SigmaInt=(255*(sigma-Min)/(Max-Min)).astype('uint8')
    image = Image.fromarray(SigmaInt)
    image.save("%s.jpg" % prefix)

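# For the -1/+1 spin lattices used here, the normalization above maps the two
# spin values to 0 and 255 before saving the JPEG snapshot.
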
def Metropolis(sigma,T,J,B,iterations):
    start=time.time()

    SizeX,SizeY=sigma.shape

    for p in xrange(0,iterations):
        # Randomly chosen coordinates
        X,Y=numpy.random.randint(SizeX),numpy.random.randint(SizeY)

        # Energy change for flipping the spin (same form as in the GPU kernels)
        DeltaE=2.*sigma[X,Y]*(J*(sigma[X,(Y+1)%SizeY]+
                                 sigma[X,(Y-1)%SizeY]+
                                 sigma[(X-1)%SizeX,Y]+
                                 sigma[(X+1)%SizeX,Y])+B)

        if DeltaE < 0. or random() < exp(-DeltaE/T):
            sigma[X,Y]=-sigma[X,Y]
    duration=time.time()-start
    return(duration)

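# Worked example of the acceptance rule above: with J=1, B=0 and a +1 spin
# whose four neighbours are all +1, DeltaE = 2*1*(1*4+0) = 8, so the flip is
# only accepted with probability exp(-8/T) (about 0.03 at T=2.27).
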
def MetropolisAllOpenCL(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):

    # sigmaDict & TList are NOT respectively a single array & a float:
    # sigmaDict : dict of lattices, one per temperature
    # TList : list of temperatures

    # Initialize the variables with the proper CASTs

    # Detect a GPU device in the list of devices

    HasGPU=False
    Id=1
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            #deviceType=cl.device_type.to_string(device.type)
            deviceType="xPU"
            if Id==Device and not HasGPU:
                GPU=device
                print "CPU/GPU selected: ",device.name
                HasGPU=True
            Id=Id+1

    # Create the context and the command queue for execution
    # ctx = cl.create_some_context()
    ctx = cl.Context([GPU])
    queue = cl.CommandQueue(ctx,
                            properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf = cl.mem_flags

    # Concatenate all sigma in single array
    sigma=numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=1)

    # The lattice buffer is read and written by the kernels,
    # the temperature list is only read
    sigmaCL = cl.Buffer(ctx, mf.READ_WRITE|mf.COPY_HOST_PTR,hostbuf=sigma)
    TCL = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR,hostbuf=TList)

    MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
        options = "-cl-mad-enable -cl-fast-relaxed-math")

    SizeX,SizeY=sigmaDict[TList[0]].shape

    if ParaStyle=='Blocks':
        # Call the OpenCL kernel
        # (jobs,) is the global work size : one work-item per temperature
        # None lets the OpenCL runtime choose the local work size
        # The two last arguments are the 32-bit seeds of the Marsaglia RNG
        # iterations is the number of Metropolis steps per work-item
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                             sigmaCL,
                                             TCL,
                                             numpy.float32(J),
                                             numpy.float32(B),
                                             numpy.uint32(SizeX),
                                             numpy.uint32(SizeY),
                                             numpy.uint32(iterations),
                                             numpy.uint32(nprnd(2**31-1)),
                                             numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Threads':
        # Here the local work size is set equal to the global work size
        # Jobs are to be considered as the global number of jobs to do
        # And to be distributed to entities
        # For example :
        # G_ID=10 & L_ID=10 : 10 Threads on 1 UC
        # G_ID=10 & L_ID=1 : 10 Threads on 1 UC

        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                            sigmaCL,
                                            TCL,
                                            numpy.float32(J),
                                            numpy.float32(B),
                                            numpy.uint32(SizeX),
                                            numpy.uint32(SizeY),
                                            numpy.uint32(iterations),
                                            numpy.uint32(nprnd(2**31-1)),
                                            numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads=BestThreadsNumber(jobs)
        # In OpenCL, the global work size must be consistent with the local work size
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                             sigmaCL,
                                             TCL,
                                             numpy.float32(J),
                                             numpy.float32(B),
                                             numpy.uint32(SizeX),
                                             numpy.uint32(SizeY),
                                             numpy.uint32(iterations),
                                             numpy.uint32(nprnd(2**31-1)),
                                             numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
    sigmaCL.release()

    results=numpy.split(sigma,len(TList),axis=1)
    for T in TList:
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return(elapsed)

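# Hypothetical standalone call (sketch only, mirroring the call in __main__
# below, with a dict of int8 lattices keyed by the float32 temperatures):
#   elapsed=MetropolisAllOpenCL(sigma,Trange,1.,0.,256*256,len(Trange),
#                               'Hybrid','GPU',1)
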
def MetropolisAllCuda(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):

    # sigmaDict & TList are NOT respectively a single array & a float:
    # sigmaDict : dict of lattices, one per temperature
    # TList : list of temperatures

    # With PyCUDA autoinit, nothing to do!

    mod = SourceModule(KERNEL_CODE_CUDA)

    MetropolisBlocksCuda=mod.get_function("MainLoopGlobal")
    MetropolisThreadsCuda=mod.get_function("MainLoopLocal")
    MetropolisHybridCuda=mod.get_function("MainLoopHybrid")

    # Concatenate all sigma in single array
    sigma=numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=1)

    sigmaCU=cuda.InOut(sigma)
    TCU=cuda.InOut(TList)

    SizeX,SizeY=sigmaDict[TList[0]].shape

    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
        # Call the CUDA kernel
        # grid=(jobs,1) : one single-thread block per temperature
        # The two last arguments are the 32-bit seeds of the Marsaglia RNG
        # iterations is the number of Metropolis steps per thread
        MetropolisBlocksCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31-1)),
                             numpy.uint32(nprnd(2**31-1)),
                             grid=(jobs,1),block=(1,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Threads':
        MetropolisThreadsCuda(sigmaCU,
                              TCU,
                              numpy.float32(J),
                              numpy.float32(B),
                              numpy.uint32(SizeX),
                              numpy.uint32(SizeY),
                              numpy.uint32(iterations),
                              numpy.uint32(nprnd(2**31-1)),
                              numpy.uint32(nprnd(2**31-1)),
                              grid=(1,1),block=(jobs,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads=BestThreadsNumber(jobs)
        MetropolisHybridCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31-1)),
                             numpy.uint32(nprnd(2**31-1)),
                             grid=(jobs/threads,1),block=(threads,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    stop.record()
    stop.synchronize()
    elapsed = start.time_till(stop)*1e-3

    results=numpy.split(sigma,len(TList),axis=1)
    for T in TList:
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return(elapsed)

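# Launch geometry used above, for reference:
#   'Blocks'  : grid=(jobs,1),         block=(1,1,1)       -> MainLoopGlobal
#   'Threads' : grid=(1,1),            block=(jobs,1,1)    -> MainLoopLocal
#   'Hybrid'  : grid=(jobs/threads,1), block=(threads,1,1) -> MainLoopHybrid
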
def Magnetization(sigma,M):
    # Mean spin per site (the second argument is unused)
    return(numpy.sum(sigma)/(sigma.shape[0]*sigma.shape[1]*1.0))

def Energy(sigma,J):
    # Copy and cast
    E=numpy.copy(sigma).astype(numpy.float32)

    # Slice-based evaluation
    E[1:-1,1:-1]=-J*E[1:-1,1:-1]*(E[:-2,1:-1]+E[2:,1:-1]+
                                  E[1:-1,:-2]+E[1:-1,2:])

    # Clear the boundary properly
    E[:,0]=0
    E[:,-1]=0
    E[0,:]=0
    E[-1,:]=0

    Energy=numpy.sum(E)

    return(Energy/(E.shape[0]*E.shape[1]*1.0))

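# Note on Energy(): each interior site contributes -J*s*(sum of its 4
# neighbours), so every interior bond is counted twice, and the border rows
# and columns are zeroed instead of being wrapped periodically; the result is
# then averaged over all lattice sites.
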
def DisplayCurves(T,E,M,J,B):

    plt.xlabel("Temperature")
    plt.ylabel("Energy")

    Experience,=plt.plot(T,E,label="Energy")

    plt.legend()
    plt.show()


if __name__=='__main__':

    # Set default values
    # Alu can be CPU or GPU
    Alu='CPU'
    # Id of the device: devices are numbered from 1 in the listing below
    Device=1
    # GPU style can be CUDA (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Parallel distribution can be on Threads or Blocks
    ParaStyle='Blocks'
    # Coupling factor
    J=1.
    # Magnetic Field
    B=0.
    # Size of Lattice
    Size=256
    # Default Temperatures (start, end, step)
    Tmin=0.1
    Tmax=5
    Tstep=0.1
    # Default Number of Iterations
    Iterations=Size*Size
    # Curves is True to plot the curves
    Curves=False

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hcj:b:z:i:s:e:p:a:d:g:t:",["coupling=","magneticfield=","size=","iterations=","tempstart=","tempend=","tempstep=","alu=","device=","gpustyle=","parastyle="])
    except getopt.GetoptError:
        print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks/Hybrid>' % sys.argv[0]
        sys.exit(2)


    for opt, arg in opts:
        if opt == '-h':
            print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks/Hybrid>' % sys.argv[0]
            sys.exit()
        elif opt == '-c':
            Curves=True
        elif opt in ("-j", "--coupling"):
            J = float(arg)
        elif opt in ("-b", "--magneticfield"):
            B = float(arg)
        elif opt in ("-s", "--tempstart"):
            Tmin = float(arg)
        elif opt in ("-e", "--tempend"):
            Tmax = float(arg)
        elif opt in ("-p", "--tempstep"):
            Tstep = float(arg)
        elif opt in ("-i", "--iterations"):
            Iterations = int(arg)
        elif opt in ("-z", "--size"):
            Size = int(arg)
        elif opt in ("-a", "--alu"):
            Alu = arg
        elif opt in ("-d", "--device"):
            Device = int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-t", "--parastyle"):
            ParaStyle = arg

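    # Example invocation (illustrative values only):
    #   python Ising2D-GPU.py -z 128 -i 65536 -s 0.5 -e 4.0 -p 0.5 -g OpenCL -t Hybrid -d 1 -c
    # sweeps a 128x128 lattice from T=0.5 to T=4.0 in steps of 0.5 on OpenCL
    # device #1, using the Hybrid distribution, and plots the curves at the end.
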
    if Alu=='CPU' and GpuStyle=='CUDA':
        print "Alu can't be CPU for CUDA, setting Alu to GPU"
        Alu='GPU'

    if ParaStyle not in ('Blocks','Threads','Hybrid'):
        print "%s does not exist, ParaStyle set to Blocks!" % ParaStyle
        ParaStyle='Blocks'

    print "Compute unit : %s" % Alu
    print "Device Identification : %s" % Device
    print "GpuStyle used : %s" % GpuStyle
    print "Parallel Style used : %s" % ParaStyle
    print "Coupling Factor : %s" % J
    print "Magnetic Field : %s" % B
    print "Size of lattice : %s" % Size
    print "Iterations : %s" % Iterations
    print "Temperature on start : %s" % Tmin
    print "Temperature on end : %s" % Tmax
    print "Temperature step : %s" % Tstep

    if GpuStyle=='CUDA':
        # For PyCUDA import
        import pycuda.driver as cuda
        import pycuda.gpuarray as gpuarray
        import pycuda.autoinit
        from pycuda.compiler import SourceModule

    if GpuStyle=='OpenCL':
        # For PyOpenCL import
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                #deviceType=cl.device_type.to_string(device.type)
                deviceType="xPU"
                print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
                Id=Id+1

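    # The device numbers printed above start at 1; this is the numbering the
    # -d option (and the Device default) refer to on the OpenCL path.
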
    LAPIMAGE=False

    sigmaIn=numpy.where(numpy.random.randn(Size,Size)>0,1,-1).astype(numpy.int8)

    ImageOutput(sigmaIn,"Ising2D_Serial_%i_Initial" % (Size))

    # The temperature is passed as a parameter, beware of the CAST!
    Trange=numpy.arange(Tmin,Tmax+Tstep,Tstep).astype(numpy.float32)

    E=[]
    M=[]

    sigma={}
    for T in Trange:
        sigma[T]=numpy.copy(sigmaIn)

    jobs=len(Trange)

    # On the GPU, all temperatures are processed in a single launch
    if GpuStyle=='CUDA':
        duration=MetropolisAllCuda(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)
    else:
        duration=MetropolisAllOpenCL(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)

    print BestThreadsNumber(len(Trange))

    for T in Trange:
        E=numpy.append(E,Energy(sigma[T],J))
        M=numpy.append(M,Magnetization(sigma[T],B))
        print "Time for each temperature : %f" % (duration/len(Trange))
        print "Total Energy at Temperature %f : %f" % (T,E[-1])
        print "Total Magnetization at Temperature %f : %f" % (T,M[-1])
        ImageOutput(sigma[T],"Ising2D_Serial_%i_%1.1f_Final" % (Size,T))

    if Curves:
        DisplayCurves(Trange,E,M,J,B)