Statistiques
| Révision :

root / Ising / GPU / Ising2D-GPU.py @ 89

Historique | Voir | Annoter | Télécharger (23,63 ko)

1 18 equemene
#!/usr/bin/env python
2 18 equemene
#
3 18 equemene
# Ising2D model in serial mode
4 18 equemene
#
5 18 equemene
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
6 18 equemene
7 18 equemene
import sys
8 18 equemene
import numpy
9 18 equemene
import math
10 18 equemene
from PIL import Image
11 18 equemene
from math import exp
12 18 equemene
from random import random
13 18 equemene
import time
14 18 equemene
import getopt
15 18 equemene
import matplotlib.pyplot as plt
16 18 equemene
from numpy.random import randint as nprnd
17 18 equemene
18 18 equemene
KERNEL_CODE_OPENCL="""
19 18 equemene

20 18 equemene
// Marsaglia RNG very simple implementation
21 18 equemene
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
22 18 equemene
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
23 18 equemene
#define MWC   (znew+wnew)
24 18 equemene
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
25 18 equemene
#define CONG  (jcong=69069*jcong+1234567)
26 18 equemene
#define KISS  ((MWC^CONG)+SHR3)
27 18 equemene

28 18 equemene
#define MWCfp MWC * 2.328306435454494e-10f
29 18 equemene
#define KISSfp KISS * 2.328306435454494e-10f
30 18 equemene

31 18 equemene
__kernel void MainLoopOne(__global char *s,float T,float J,float B,
32 18 equemene
                          uint sizex,uint sizey,
33 18 equemene
                          uint iterations,uint seed_w,uint seed_z)
34 18 equemene

35 18 equemene
{
36 18 equemene
   uint z=seed_z;
37 18 equemene
   uint w=seed_w;
38 18 equemene

39 18 equemene
   for (uint i=0;i<iterations;i++) {
40 18 equemene

41 18 equemene
      uint x=(uint)(MWC%sizex) ;
42 18 equemene
      uint y=(uint)(MWC%sizey) ;
43 18 equemene

44 18 equemene
      int p=s[x+sizex*y];
45 18 equemene

46 18 equemene
      int d=s[x+sizex*((y+1)%sizey)];
47 18 equemene
      int u=s[x+sizex*((y-1)%sizey)];
48 18 equemene
      int l=s[((x-1)%sizex)+sizex*y];
49 18 equemene
      int r=s[((x+1)%sizex)+sizex*y];
50 18 equemene

51 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
52 18 equemene

53 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
54 18 equemene
      s[x%sizex+sizex*(y%sizey)] = (char)factor*p;
55 18 equemene
   }
56 18 equemene
   barrier(CLK_GLOBAL_MEM_FENCE);
57 18 equemene
}
58 18 equemene

59 18 equemene
__kernel void MainLoopGlobal(__global char *s,__global float *T,float J,float B,
60 18 equemene
                             uint sizex,uint sizey,
61 18 equemene
                             uint iterations,uint seed_w,uint seed_z)
62 18 equemene

63 18 equemene
{
64 18 equemene
   uint z=seed_z/(get_global_id(0)+1);
65 18 equemene
   uint w=seed_w/(get_global_id(0)+1);
66 18 equemene
   float t=T[get_global_id(0)];
67 18 equemene
   uint ind=get_global_id(0);
68 18 equemene

69 18 equemene
   for (uint i=0;i<iterations;i++) {
70 18 equemene

71 18 equemene
      uint x=(uint)(MWC%sizex) ;
72 18 equemene
      uint y=(uint)(MWC%sizey) ;
73 18 equemene

74 18 equemene
      int p=s[x+sizex*(y+sizey*ind)];
75 18 equemene

76 18 equemene
      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
77 18 equemene
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
78 18 equemene
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
79 18 equemene
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
80 18 equemene

81 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
82 18 equemene

83 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
84 18 equemene
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
85 18 equemene

86 18 equemene
   }
87 18 equemene

88 18 equemene
   barrier(CLK_GLOBAL_MEM_FENCE);
89 18 equemene

90 18 equemene
}
91 18 equemene

92 18 equemene
__kernel void MainLoopHybrid(__global char *s,__global float *T,float J,float B,
93 18 equemene
                             uint sizex,uint sizey,
94 18 equemene
                             uint iterations,uint seed_w,uint seed_z)
95 18 equemene

96 18 equemene
{
97 18 equemene
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
98 18 equemene
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
99 18 equemene
   float t=T[get_group_id(0)*get_num_groups(0)+get_local_id(0)];
100 18 equemene
   uint ind=get_group_id(0)*get_num_groups(0)+get_local_id(0);
101 18 equemene

102 18 equemene
   for (uint i=0;i<iterations;i++) {
103 18 equemene

104 18 equemene
      uint x=(uint)(MWC%sizex) ;
105 18 equemene
      uint y=(uint)(MWC%sizey) ;
106 18 equemene

107 18 equemene
      int p=s[x+sizex*(y+sizey*ind)];
108 18 equemene

109 18 equemene
      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
110 18 equemene
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
111 18 equemene
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
112 18 equemene
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
113 18 equemene

114 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
115 18 equemene

116 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
117 18 equemene
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
118 18 equemene

119 18 equemene
   }
120 18 equemene

121 18 equemene
   barrier(CLK_GLOBAL_MEM_FENCE);
122 18 equemene

123 18 equemene
}
124 18 equemene

125 18 equemene
__kernel void MainLoopLocal(__global char *s,__global float *T,float J,float B,
126 18 equemene
                            uint sizex,uint sizey,
127 18 equemene
                            uint iterations,uint seed_w,uint seed_z)
128 18 equemene
{
129 18 equemene
   uint z=seed_z/(get_local_id(0)+1);
130 18 equemene
   uint w=seed_w/(get_local_id(0)+1);
131 18 equemene
   float t=T[get_local_id(0)];
132 18 equemene
   uint ind=get_local_id(0);
133 18 equemene

134 18 equemene
   for (uint i=0;i<iterations;i++) {
135 18 equemene

136 18 equemene
      uint x=(uint)(MWC%sizex) ;
137 18 equemene
      uint y=(uint)(MWC%sizey) ;
138 18 equemene

139 18 equemene
      int p=s[x+sizex*(y+sizey*ind)];
140 18 equemene

141 18 equemene
      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
142 18 equemene
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
143 18 equemene
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
144 18 equemene
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
145 18 equemene

146 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
147 18 equemene

148 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
149 18 equemene
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
150 18 equemene
   }
151 18 equemene

152 18 equemene
   barrier(CLK_LOCAL_MEM_FENCE);
153 18 equemene
   barrier(CLK_GLOBAL_MEM_FENCE);
154 18 equemene

155 18 equemene
}
156 18 equemene
"""
157 18 equemene
158 18 equemene
KERNEL_CODE_CUDA="""
159 18 equemene

160 18 equemene
// Marsaglia RNG very simple implementation
161 18 equemene
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
162 18 equemene
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
163 18 equemene
#define MWC   (znew+wnew)
164 18 equemene
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
165 18 equemene
#define CONG  (jcong=69069*jcong+1234567)
166 18 equemene
#define KISS  ((MWC^CONG)+SHR3)
167 18 equemene

168 18 equemene
#define MWCfp MWC * 2.328306435454494e-10f
169 18 equemene
#define KISSfp KISS * 2.328306435454494e-10f
170 18 equemene

171 18 equemene
__global__ void MainLoopOne(char *s,float T,float J,float B,
172 18 equemene
                            uint sizex,uint sizey,
173 18 equemene
                            uint iterations,uint seed_w,uint seed_z)
174 18 equemene

175 18 equemene
{
176 18 equemene
   uint z=seed_z;
177 18 equemene
   uint w=seed_w;
178 18 equemene

179 18 equemene
   for (uint i=0;i<iterations;i++) {
180 18 equemene

181 18 equemene
      uint x=(uint)(MWC%sizex) ;
182 18 equemene
      uint y=(uint)(MWC%sizey) ;
183 18 equemene

184 18 equemene
      int p=s[x+sizex*y];
185 18 equemene

186 18 equemene
      int d=s[x+sizex*((y+1)%sizey)];
187 18 equemene
      int u=s[x+sizex*((y-1)%sizey)];
188 18 equemene
      int l=s[((x-1)%sizex)+sizex*y];
189 18 equemene
      int r=s[((x+1)%sizex)+sizex*y];
190 18 equemene

191 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
192 18 equemene

193 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
194 18 equemene
      s[x%sizex+sizex*(y%sizey)] = (char)factor*p;
195 18 equemene
   }
196 18 equemene
   __syncthreads();
197 18 equemene

198 18 equemene
}
199 18 equemene

200 18 equemene
__global__ void MainLoopGlobal(char *s,float *T,float J,float B,
201 18 equemene
                               uint sizex,uint sizey,
202 18 equemene
                               uint iterations,uint seed_w,uint seed_z)
203 18 equemene

204 18 equemene
{
205 18 equemene
   uint z=seed_z/(blockIdx.x+1);
206 18 equemene
   uint w=seed_w/(blockIdx.x+1);
207 18 equemene
   float t=T[blockIdx.x];
208 18 equemene
   uint ind=blockIdx.x;
209 18 equemene

210 18 equemene
   for (uint i=0;i<iterations;i++) {
211 18 equemene

212 18 equemene
      uint x=(uint)(MWC%sizex) ;
213 18 equemene
      uint y=(uint)(MWC%sizey) ;
214 18 equemene

215 18 equemene
      int p=s[x+sizex*(y+sizey*ind)];
216 18 equemene

217 18 equemene
      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
218 18 equemene
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
219 18 equemene
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
220 18 equemene
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
221 18 equemene

222 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
223 18 equemene

224 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
225 18 equemene
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
226 18 equemene

227 18 equemene
   }
228 18 equemene
   __syncthreads();
229 18 equemene

230 18 equemene
}
231 18 equemene

232 18 equemene
__global__ void MainLoopHybrid(char *s,float *T,float J,float B,
233 18 equemene
                               uint sizex,uint sizey,
234 18 equemene
                               uint iterations,uint seed_w,uint seed_z)
235 18 equemene

236 18 equemene
{
237 18 equemene
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
238 18 equemene
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
239 18 equemene
   float t=T[blockDim.x*blockIdx.x+threadIdx.x];
240 18 equemene
   uint ind=blockDim.x*blockIdx.x+threadIdx.x;
241 18 equemene

242 18 equemene
   for (uint i=0;i<iterations;i++) {
243 18 equemene

244 18 equemene
      uint x=(uint)(MWC%sizex) ;
245 18 equemene
      uint y=(uint)(MWC%sizey) ;
246 18 equemene

247 18 equemene
      int p=s[x+sizex*(y+sizey*ind)];
248 18 equemene

249 18 equemene
      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
250 18 equemene
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
251 18 equemene
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
252 18 equemene
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
253 18 equemene

254 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
255 18 equemene

256 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
257 18 equemene
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
258 18 equemene

259 18 equemene
   }
260 18 equemene
   __syncthreads();
261 18 equemene

262 18 equemene
}
263 18 equemene

264 18 equemene
__global__ void MainLoopLocal(char *s,float *T,float J,float B,
265 18 equemene
                              uint sizex,uint sizey,
266 18 equemene
                              uint iterations,uint seed_w,uint seed_z)
267 18 equemene
{
268 18 equemene
   uint z=seed_z/(threadIdx.x+1);
269 18 equemene
   uint w=seed_w/(threadIdx.x+1);
270 18 equemene
   float t=T[threadIdx.x];
271 18 equemene
   uint ind=threadIdx.x;
272 18 equemene

273 18 equemene
   for (uint i=0;i<iterations;i++) {
274 18 equemene

275 18 equemene
      uint x=(uint)(MWC%sizex) ;
276 18 equemene
      uint y=(uint)(MWC%sizey) ;
277 18 equemene

278 18 equemene
      int p=s[x+sizex*(y+sizey*ind)];
279 18 equemene

280 18 equemene
      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
281 18 equemene
      int u=s[x+sizex*((y-1)%sizey+sizey*ind)];
282 18 equemene
      int l=s[((x-1)%sizex)+sizex*(y+sizey*ind)];
283 18 equemene
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];
284 18 equemene

285 18 equemene
      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);
286 18 equemene

287 18 equemene
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
288 18 equemene
      s[x%sizex+sizex*(y%sizey+sizey*ind)] = (char)factor*p;
289 18 equemene
   }
290 18 equemene
   __syncthreads();
291 18 equemene

292 18 equemene
}
293 18 equemene
"""
294 18 equemene
295 18 equemene
# find prime factors of a number
296 18 equemene
# Get for WWW :
297 18 equemene
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
298 18 equemene
def PrimeFactors(x):
299 18 equemene
    factorlist=numpy.array([]).astype('uint32')
300 18 equemene
    loop=2
301 18 equemene
    while loop<=x:
302 18 equemene
        if x%loop==0:
303 18 equemene
            x/=loop
304 18 equemene
            factorlist=numpy.append(factorlist,[loop])
305 18 equemene
        else:
306 18 equemene
            loop+=1
307 18 equemene
    return factorlist
308 18 equemene
309 18 equemene
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
310 18 equemene
# output is thread number
311 18 equemene
def BestThreadsNumber(jobs):
312 18 equemene
    factors=PrimeFactors(jobs)
313 18 equemene
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
314 18 equemene
    threads=1
315 18 equemene
    for factor in matrix.transpose().ravel():
316 18 equemene
        threads=threads*factor
317 18 equemene
        if threads*threads>jobs:
318 18 equemene
            break
319 18 equemene
    return(long(threads))
320 18 equemene
321 18 equemene
def ImageOutput(sigma,prefix):
322 18 equemene
    Max=sigma.max()
323 18 equemene
    Min=sigma.min()
324 18 equemene
325 18 equemene
    # Normalize value as 8bits Integer
326 18 equemene
    SigmaInt=(255*(sigma-Min)/(Max-Min)).astype('uint8')
327 18 equemene
    image = Image.fromarray(SigmaInt)
328 18 equemene
    image.save("%s.jpg" % prefix)
329 18 equemene
330 18 equemene
def Metropolis(sigma,T,J,B,iterations):
331 18 equemene
    start=time.time()
332 18 equemene
333 18 equemene
    SizeX,SizeY=sigma.shape
334 18 equemene
335 18 equemene
    for p in xrange(0,iterations):
336 18 equemene
        # Random access coordonate
337 18 equemene
        X,Y=numpy.random.randint(SizeX),numpy.random.randint(SizeY)
338 18 equemene
339 18 equemene
        DeltaE=J*sigma[X,Y]*(2*(sigma[X,(Y+1)%SizeY]+
340 18 equemene
                                sigma[X,(Y-1)%SizeY]+
341 18 equemene
                                sigma[(X-1)%SizeX,Y]+
342 18 equemene
                                sigma[(X+1)%SizeX,Y])+B)
343 18 equemene
344 18 equemene
        if DeltaE < 0. or random() < exp(-DeltaE/T):
345 18 equemene
            sigma[X,Y]=-sigma[X,Y]
346 18 equemene
    duration=time.time()-start
347 18 equemene
    return(duration)
348 18 equemene
349 18 equemene
def MetropolisAllOpenCL(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):
350 18 equemene
351 18 equemene
    # sigmaDict & Tlist are NOT respectively array & float
352 18 equemene
    # sigmaDict : dict of array for each temperatoire
353 18 equemene
    # TList : list of temperatures
354 18 equemene
355 18 equemene
    # Initialisation des variables en les CASTant correctement
356 18 equemene
357 18 equemene
    # Je detecte un peripherique GPU dans la liste des peripheriques
358 18 equemene
359 18 equemene
    HasGPU=False
360 18 equemene
    Id=1
361 18 equemene
    # Primary Device selection based on Device Id
362 18 equemene
    for platform in cl.get_platforms():
363 18 equemene
        for device in platform.get_devices():
364 18 equemene
            deviceType=cl.device_type.to_string(device.type)
365 18 equemene
            if Id==Device and not HasGPU:
366 18 equemene
                GPU=device
367 18 equemene
                print "CPU/GPU selected: ",device.name
368 18 equemene
                HasGPU=True
369 18 equemene
            Id=Id+1
370 18 equemene
    # Default Device selection based on ALU Type
371 18 equemene
    for platform in cl.get_platforms():
372 18 equemene
        for device in platform.get_devices():
373 18 equemene
            deviceType=cl.device_type.to_string(device.type)
374 18 equemene
            if deviceType=="GPU" and Alu=="GPU" and not HasGPU:
375 18 equemene
                GPU=device
376 18 equemene
                print "GPU selected: ",device.name
377 18 equemene
                HasGPU=True
378 18 equemene
            if deviceType=="CPU" and Alu=="CPU" and not HasGPU:
379 18 equemene
                GPU=device
380 18 equemene
                print "CPU selected: ",device.name
381 18 equemene
                HasGPU=True
382 18 equemene
383 18 equemene
    # Je cree le contexte et la queue pour son execution
384 18 equemene
    # ctx = cl.create_some_context()
385 18 equemene
    ctx = cl.Context([GPU])
386 18 equemene
    queue = cl.CommandQueue(ctx,
387 18 equemene
                            properties=cl.command_queue_properties.PROFILING_ENABLE)
388 18 equemene
389 18 equemene
    # Je recupere les flag possibles pour les buffers
390 18 equemene
    mf = cl.mem_flags
391 18 equemene
392 18 equemene
    # Concatenate all sigma in single array
393 18 equemene
    sigma=numpy.copy(sigmaDict[TList[0]])
394 18 equemene
    for T in TList[1:]:
395 18 equemene
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=1)
396 18 equemene
397 18 equemene
    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=sigma)
398 18 equemene
    TCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=TList)
399 18 equemene
400 18 equemene
    MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
401 18 equemene
        options = "-cl-mad-enable -cl-fast-relaxed-math")
402 18 equemene
403 18 equemene
    SizeX,SizeY=sigmaDict[TList[0]].shape
404 18 equemene
405 18 equemene
    if ParaStyle=='Blocks':
406 18 equemene
        # Call OpenCL kernel
407 18 equemene
        # (1,) is Global work size (only 1 work size)
408 18 equemene
        # (1,) is local work size
409 18 equemene
        # SeedZCL is lattice translated in CL format
410 18 equemene
        # SeedWCL is lattice translated in CL format
411 18 equemene
        # step is number of iterations
412 18 equemene
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
413 18 equemene
                                             sigmaCL,
414 18 equemene
                                             TCL,
415 18 equemene
                                             numpy.float32(J),
416 18 equemene
                                             numpy.float32(B),
417 18 equemene
                                             numpy.uint32(SizeX),
418 18 equemene
                                             numpy.uint32(SizeY),
419 18 equemene
                                             numpy.uint32(iterations),
420 18 equemene
                                             numpy.uint32(nprnd(2**31-1)),
421 18 equemene
                                             numpy.uint32(nprnd(2**31-1)))
422 18 equemene
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
423 18 equemene
              (Alu,jobs,1,ParaStyle)
424 18 equemene
    elif ParaStyle=='Threads':
425 18 equemene
        # It's necessary to put a Local_ID equal to a Global_ID
426 18 equemene
        # Jobs are to be considerated as global number of jobs to do
427 18 equemene
        # And to be distributed to entities
428 18 equemene
        # For example :
429 18 equemene
        # G_ID=10 & L_ID=10 : 10 Threads on 1 UC
430 18 equemene
        # G_ID=10 & L_ID=1 : 10 Threads on 1 UC
431 18 equemene
432 18 equemene
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
433 18 equemene
                                            sigmaCL,
434 18 equemene
                                            TCL,
435 18 equemene
                                            numpy.float32(J),
436 18 equemene
                                            numpy.float32(B),
437 18 equemene
                                            numpy.uint32(SizeX),
438 18 equemene
                                            numpy.uint32(SizeY),
439 18 equemene
                                            numpy.uint32(iterations),
440 18 equemene
                                            numpy.uint32(nprnd(2**31-1)),
441 18 equemene
                                            numpy.uint32(nprnd(2**31-1)))
442 18 equemene
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
443 18 equemene
              (Alu,1,jobs,ParaStyle)
444 18 equemene
    else:
445 18 equemene
        threads=BestThreadsNumber(jobs)
446 18 equemene
        # en OpenCL, necessaire de mettre un Global_id identique au local_id
447 18 equemene
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
448 18 equemene
                                             sigmaCL,
449 18 equemene
                                             TCL,
450 18 equemene
                                             numpy.float32(J),
451 18 equemene
                                             numpy.float32(B),
452 18 equemene
                                             numpy.uint32(SizeX),
453 18 equemene
                                             numpy.uint32(SizeY),
454 18 equemene
                                             numpy.uint32(iterations),
455 18 equemene
                                             numpy.uint32(nprnd(2**31-1)),
456 18 equemene
                                             numpy.uint32(nprnd(2**31-1)))
457 18 equemene
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
458 18 equemene
              (Alu,jobs/threads,threads,ParaStyle)
459 18 equemene
460 18 equemene
    CLLaunch.wait()
461 18 equemene
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
462 18 equemene
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
463 18 equemene
    sigmaCL.release()
464 18 equemene
465 18 equemene
    results=numpy.split(sigma,len(TList),axis=1)
466 18 equemene
    for T in TList:
467 18 equemene
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])
468 18 equemene
469 18 equemene
    return(elapsed)
470 18 equemene
471 18 equemene
def MetropolisAllCuda(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):
472 18 equemene
473 18 equemene
    # sigmaDict & Tlist are NOT respectively array & float
474 18 equemene
    # sigmaDict : dict of array for each temperatoire
475 18 equemene
    # TList : list of temperatures
476 18 equemene
477 18 equemene
    # Avec PyCUDA autoinit, rien a faire !
478 18 equemene
479 18 equemene
    mod = SourceModule(KERNEL_CODE_CUDA)
480 18 equemene
481 18 equemene
    MetropolisBlocksCuda=mod.get_function("MainLoopGlobal")
482 18 equemene
    MetropolisThreadsCuda=mod.get_function("MainLoopLocal")
483 18 equemene
    MetropolisHybridCuda=mod.get_function("MainLoopHybrid")
484 18 equemene
485 18 equemene
    # Concatenate all sigma in single array
486 18 equemene
    sigma=numpy.copy(sigmaDict[TList[0]])
487 18 equemene
    for T in TList[1:]:
488 18 equemene
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=1)
489 18 equemene
490 18 equemene
    sigmaCU=cuda.InOut(sigma)
491 18 equemene
    TCU=cuda.InOut(TList)
492 18 equemene
493 18 equemene
    SizeX,SizeY=sigmaDict[TList[0]].shape
494 18 equemene
495 18 equemene
    start = pycuda.driver.Event()
496 18 equemene
    stop = pycuda.driver.Event()
497 18 equemene
498 18 equemene
    start.record()
499 18 equemene
    start.synchronize()
500 18 equemene
    if ParaStyle=='Blocks':
501 18 equemene
        # Call CUDA kernel
502 18 equemene
        # (1,) is Global work size (only 1 work size)
503 18 equemene
        # (1,) is local work size
504 18 equemene
        # SeedZCL is lattice translated in CL format
505 18 equemene
        # SeedWCL is lattice translated in CL format
506 18 equemene
        # step is number of iterations
507 18 equemene
        MetropolisBlocksCuda(sigmaCU,
508 18 equemene
                             TCU,
509 18 equemene
                             numpy.float32(J),
510 18 equemene
                             numpy.float32(B),
511 18 equemene
                             numpy.uint32(SizeX),
512 18 equemene
                             numpy.uint32(SizeY),
513 18 equemene
                             numpy.uint32(iterations),
514 18 equemene
                             numpy.uint32(nprnd(2**31-1)),
515 18 equemene
                             numpy.uint32(nprnd(2**31-1)),
516 18 equemene
                             grid=(jobs,1),block=(1,1,1))
517 18 equemene
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
518 18 equemene
              (Alu,jobs,1,ParaStyle)
519 18 equemene
    elif ParaStyle=='Threads':
520 18 equemene
        MetropolisThreadsCuda(sigmaCU,
521 18 equemene
                              TCU,
522 18 equemene
                              numpy.float32(J),
523 18 equemene
                              numpy.float32(B),
524 18 equemene
                              numpy.uint32(SizeX),
525 18 equemene
                              numpy.uint32(SizeY),
526 18 equemene
                              numpy.uint32(iterations),
527 18 equemene
                              numpy.uint32(nprnd(2**31-1)),
528 18 equemene
                              numpy.uint32(nprnd(2**31-1)),
529 18 equemene
                              grid=(1,1),block=(jobs,1,1))
530 18 equemene
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
531 18 equemene
              (Alu,1,jobs,ParaStyle)
532 18 equemene
    else:
533 18 equemene
        threads=BestThreadsNumber(jobs)
534 18 equemene
        MetropolisHybridCuda(sigmaCU,
535 18 equemene
                              TCU,
536 18 equemene
                              numpy.float32(J),
537 18 equemene
                              numpy.float32(B),
538 18 equemene
                              numpy.uint32(SizeX),
539 18 equemene
                              numpy.uint32(SizeY),
540 18 equemene
                              numpy.uint32(iterations),
541 18 equemene
                              numpy.uint32(nprnd(2**31-1)),
542 18 equemene
                              numpy.uint32(nprnd(2**31-1)),
543 18 equemene
                              grid=(jobs/threads,1),block=(threads,1,1))
544 18 equemene
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
545 18 equemene
              (Alu,jobs/threads,threads,ParaStyle)
546 18 equemene
547 18 equemene
    stop.record()
548 18 equemene
    stop.synchronize()
549 18 equemene
    elapsed = start.time_till(stop)*1e-3
550 18 equemene
551 18 equemene
    results=numpy.split(sigma,len(TList),axis=1)
552 18 equemene
    for T in TList:
553 18 equemene
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])
554 18 equemene
555 18 equemene
    return(elapsed)
556 18 equemene
557 18 equemene
558 18 equemene
def Magnetization(sigma,M):
559 18 equemene
    return(numpy.sum(sigma)/(sigma.shape[0]*sigma.shape[1]*1.0))
560 18 equemene
561 18 equemene
def Energy(sigma,J):
562 18 equemene
    # Copier et caster
563 18 equemene
    E=numpy.copy(sigma).astype(numpy.float32)
564 18 equemene
565 18 equemene
    # Appel par slice
566 18 equemene
    E[1:-1,1:-1]=-J*E[1:-1,1:-1]*(E[:-2,1:-1]+E[2:,1:-1]+
567 18 equemene
                                  E[1:-1,:-2]+E[1:-1,2:])
568 18 equemene
569 18 equemene
    # Bien nettoyer la peripherie
570 18 equemene
    E[:,0]=0
571 18 equemene
    E[:,-1]=0
572 18 equemene
    E[0,:]=0
573 18 equemene
    E[-1,:]=0
574 18 equemene
575 18 equemene
    Energy=numpy.sum(E)
576 18 equemene
577 18 equemene
    return(Energy/(E.shape[0]*E.shape[1]*1.0))
578 18 equemene
579 18 equemene
def DisplayCurves(T,E,M,J,B):
580 18 equemene
581 18 equemene
    plt.xlabel("Temperature")
582 18 equemene
    plt.ylabel("Energy")
583 18 equemene
584 18 equemene
    Experience,=plt.plot(T,E,label="Energy")
585 18 equemene
586 18 equemene
    plt.legend()
587 18 equemene
    plt.show()
588 18 equemene
589 18 equemene
590 18 equemene
if __name__=='__main__':
591 18 equemene
592 18 equemene
    # Set defaults values
593 18 equemene
    # Alu can be CPU or GPU
594 18 equemene
    Alu='CPU'
595 18 equemene
    # Id of GPU : 0
596 18 equemene
    Device=0
597 18 equemene
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
598 18 equemene
    GpuStyle='OpenCL'
599 18 equemene
    # Parallel distribution can be on Threads or Blocks
600 18 equemene
    ParaStyle='Blocks'
601 18 equemene
    # Coupling factor
602 18 equemene
    J=1.
603 18 equemene
    # Magnetic Field
604 18 equemene
    B=0.
605 18 equemene
    # Size of Lattice
606 18 equemene
    Size=256
607 18 equemene
    # Default Temperatures (start, end, step)
608 18 equemene
    Tmin=0.1
609 18 equemene
    Tmax=5
610 18 equemene
    Tstep=0.1
611 18 equemene
    # Default Number of Iterations
612 18 equemene
    Iterations=Size*Size
613 18 equemene
    # Curves is True to print the curves
614 18 equemene
    Curves=False
615 18 equemene
616 18 equemene
    try:
617 18 equemene
        opts, args = getopt.getopt(sys.argv[1:],"hcj:b:z:i:s:e:p:a:d:g:t:",["coupling=","magneticfield=","size=","iterations=","tempstart=","tempend=","tempstep=","alu=","gpustyle=","parastyle="])
618 18 equemene
    except getopt.GetoptError:
619 18 equemene
        print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks>' % sys.argv[0]
620 18 equemene
        sys.exit(2)
621 18 equemene
622 18 equemene
623 18 equemene
    for opt, arg in opts:
624 18 equemene
        if opt == '-h':
625 18 equemene
            print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks/Hybrid>' % sys.argv[0]
626 18 equemene
            sys.exit()
627 18 equemene
        elif opt == '-c':
628 18 equemene
            Curves=True
629 18 equemene
        elif opt in ("-j", "--coupling"):
630 18 equemene
            J = float(arg)
631 18 equemene
        elif opt in ("-b", "--magneticfield"):
632 18 equemene
            B = float(arg)
633 18 equemene
        elif opt in ("-s", "--tempmin"):
634 18 equemene
            Tmin = float(arg)
635 18 equemene
        elif opt in ("-e", "--tempmax"):
636 18 equemene
            Tmax = float(arg)
637 18 equemene
        elif opt in ("-p", "--tempstep"):
638 18 equemene
            Tstep = float(arg)
639 18 equemene
        elif opt in ("-i", "--iterations"):
640 18 equemene
            Iterations = int(arg)
641 18 equemene
        elif opt in ("-z", "--size"):
642 18 equemene
            Size = int(arg)
643 18 equemene
        elif opt in ("-a", "--alu"):
644 18 equemene
            Alu = arg
645 18 equemene
        elif opt in ("-d", "--device"):
646 18 equemene
            Device = int(arg)
647 18 equemene
        elif opt in ("-g", "--gpustyle"):
648 18 equemene
            GpuStyle = arg
649 18 equemene
        elif opt in ("-t", "--parastyle"):
650 18 equemene
            ParaStyle = arg
651 18 equemene
652 18 equemene
    if Alu=='CPU' and GpuStyle=='CUDA':
653 18 equemene
        print "Alu can't be CPU for CUDA, set Alu to GPU"
654 18 equemene
        Alu='GPU'
655 18 equemene
656 18 equemene
    if ParaStyle not in ('Blocks','Threads','Hybrid'):
657 18 equemene
        print "%s not exists, ParaStyle set as Threads !" % ParaStyle
658 18 equemene
        ParaStyle='Blocks'
659 18 equemene
660 18 equemene
    print "Compute unit : %s" % Alu
661 18 equemene
    print "Device Identification : %s" % Device
662 18 equemene
    print "GpuStyle used : %s" % GpuStyle
663 18 equemene
    print "Parallel Style used : %s" % ParaStyle
664 18 equemene
    print "Coupling Factor : %s" % J
665 18 equemene
    print "Magnetic Field :  %s" % B
666 18 equemene
    print "Size of lattice : %s" % Size
667 18 equemene
    print "Iterations : %s" % Iterations
668 18 equemene
    print "Temperature on start : %s" % Tmin
669 18 equemene
    print "Temperature on end : %s" % Tmax
670 18 equemene
    print "Temperature step : %s" % Tstep
671 18 equemene
672 18 equemene
    if GpuStyle=='CUDA':
673 18 equemene
        # For PyCUDA import
674 18 equemene
        import pycuda.driver as cuda
675 18 equemene
        import pycuda.gpuarray as gpuarray
676 18 equemene
        import pycuda.autoinit
677 18 equemene
        from pycuda.compiler import SourceModule
678 18 equemene
679 18 equemene
    if GpuStyle=='OpenCL':
680 18 equemene
        # For PyOpenCL import
681 18 equemene
        import pyopencl as cl
682 18 equemene
        Id=1
683 18 equemene
        for platform in cl.get_platforms():
684 18 equemene
            for device in platform.get_devices():
685 18 equemene
                deviceType=cl.device_type.to_string(device.type)
686 18 equemene
                print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
687 18 equemene
                Id=Id+1
688 18 equemene
689 18 equemene
    LAPIMAGE=False
690 18 equemene
691 18 equemene
    sigmaIn=numpy.where(numpy.random.randn(Size,Size)>0,1,-1).astype(numpy.int8)
692 18 equemene
693 18 equemene
    ImageOutput(sigmaIn,"Ising2D_Serial_%i_Initial" % (Size))
694 18 equemene
695 18 equemene
    # La temperature est passee comme parametre, attention au CAST !
696 18 equemene
    Trange=numpy.arange(Tmin,Tmax+Tstep,Tstep).astype(numpy.float32)
697 18 equemene
698 18 equemene
    E=[]
699 18 equemene
    M=[]
700 18 equemene
701 18 equemene
    sigma={}
702 18 equemene
    for T in Trange:
703 18 equemene
        sigma[T]=numpy.copy(sigmaIn)
704 18 equemene
705 18 equemene
    jobs=len(Trange)
706 18 equemene
707 18 equemene
    # For GPU, all process are launched
708 18 equemene
    if GpuStyle=='CUDA':
709 18 equemene
        duration=MetropolisAllCuda(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)
710 18 equemene
    else:
711 18 equemene
        duration=MetropolisAllOpenCL(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)
712 18 equemene
713 18 equemene
    print BestThreadsNumber(len(Trange))
714 18 equemene
715 18 equemene
    for T in Trange:
716 18 equemene
        E=numpy.append(E,Energy(sigma[T],J))
717 18 equemene
        M=numpy.append(M,Magnetization(sigma[T],B))
718 18 equemene
        print "CPU Time for each : %f" % (duration/len(Trange))
719 18 equemene
        print "Total Energy at Temperature %f : %f" % (T,E[-1])
720 18 equemene
        print "Total Magnetization at Temperature %f : %f" % (T,M[-1])
721 18 equemene
        ImageOutput(sigma[T],"Ising2D_Serial_%i_%1.1f_Final" % (Size,T))
722 18 equemene
723 18 equemene
    if Curves:
724 18 equemene
        DisplayCurves(Trange,E,M,J,B)
725 18 equemene