#!/usr/bin/env python
#
# Ising2D model in GPU mode (OpenCL or CUDA)
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>

import sys
import numpy
import math
from PIL import Image
from math import exp
from random import random
import time
import getopt
import matplotlib.pyplot as plt
from numpy.random import randint as nprnd

KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
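
// The scale factor is about 2^-32: MWCfp and KISSfp map the 32-bit
// integer streams onto uniform floats in [0,1).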

__kernel void MainLoopOne(__global char *s,float T,float J,float B,
                          uint sizex,uint sizey,
                          uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      // Pick one lattice site at random
      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*y];

      // Periodic neighbours; sizex/sizey are added before subtracting 1
      // to avoid unsigned underflow when x or y is 0
      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y+sizey-1)%sizey)];
      int l=s[((x+sizex-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      // Metropolis rule: flip if DeltaE<0, else with probability exp(-DeltaE/T)
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
      s[x+sizex*y] = (char)(factor*p);
   }
   barrier(CLK_GLOBAL_MEM_FENCE);
}
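
// Energy balance used by all kernels: with the Hamiltonian
// H = -J*Sum_<ij> s_i*s_j - B*Sum_i s_i, flipping spin p costs
// DeltaE = 2*p*(J*(u+d+l+r)+B).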

__kernel void MainLoopGlobal(__global char *s,__global float *T,float J,float B,
                             uint sizex,uint sizey,
                             uint iterations,uint seed_w,uint seed_z)
{
   // One work-item per temperature: each one owns a full sizex*sizey lattice
   uint ind=get_global_id(0);
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)] = (char)(factor*p);
   }

   barrier(CLK_GLOBAL_MEM_FENCE);
}

__kernel void MainLoopHybrid(__global char *s,__global float *T,float J,float B,
                             uint sizex,uint sizey,
                             uint iterations,uint seed_w,uint seed_z)
{
   // Global index of the work-item:
   // get_global_id(0) == get_group_id(0)*get_local_size(0)+get_local_id(0)
   uint ind=get_global_id(0);
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)] = (char)(factor*p);
   }

   barrier(CLK_GLOBAL_MEM_FENCE);
}

__kernel void MainLoopLocal(__global char *s,__global float *T,float J,float B,
                            uint sizex,uint sizey,
                            uint iterations,uint seed_w,uint seed_z)
{
   // One work-item per temperature inside a single work-group
   uint ind=get_local_id(0);
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)] = (char)(factor*p);
   }

   barrier(CLK_LOCAL_MEM_FENCE);
   barrier(CLK_GLOBAL_MEM_FENCE);
}
"""

KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__global__ void MainLoopOne(char *s,float T,float J,float B,
                            uint sizex,uint sizey,
                            uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      // Pick one lattice site at random
      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*y];

      // Periodic neighbours; sizex/sizey are added before subtracting 1
      // to avoid unsigned underflow when x or y is 0
      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y+sizey-1)%sizey)];
      int l=s[((x+sizex-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      // Metropolis rule: flip if DeltaE<0, else with probability expf(-DeltaE/T)
      int factor=((DeltaE < 0.0f) || (MWCfp < expf(-DeltaE/T))) ? -1:1;
      s[x+sizex*y] = (char)(factor*p);
   }
   __syncthreads();
}

__global__ void MainLoopGlobal(char *s,float *T,float J,float B,
                               uint sizex,uint sizey,
                               uint iterations,uint seed_w,uint seed_z)
{
   // One block per temperature, a single thread per block
   uint ind=blockIdx.x;
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < expf(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)] = (char)(factor*p);
   }
   __syncthreads();
}

__global__ void MainLoopHybrid(char *s,float *T,float J,float B,
                               uint sizex,uint sizey,
                               uint iterations,uint seed_w,uint seed_z)
{
   // Global index of the thread across the whole grid
   uint ind=blockDim.x*blockIdx.x+threadIdx.x;
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < expf(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)] = (char)(factor*p);
   }
   __syncthreads();
}

__global__ void MainLoopLocal(char *s,float *T,float J,float B,
                              uint sizex,uint sizey,
                              uint iterations,uint seed_w,uint seed_z)
{
   // A single block: one thread per temperature
   uint ind=threadIdx.x;
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < expf(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)] = (char)(factor*p);
   }
   __syncthreads();
}
"""

# Find the prime factors of a number
# From the Web:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            x/=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist

# Try to find the best thread number for the Hybrid approach (Blocks&Threads):
# multiply prime factors of the job count until the square of the running
# product exceeds the number of jobs. Output is the thread number.
def BestThreadsNumber(jobs):
    factors=PrimeFactors(jobs)
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs:
            break
    return(long(threads))
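
# For example, PrimeFactors(50) gives [2,5,5]: BestThreadsNumber(50)
# accumulates 2*5=10 and stops because 10*10 > 50, so 50 jobs would run
# as 5 groups of 10 threads.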

def ImageOutput(sigma,prefix):
    Max=sigma.max()
    Min=sigma.min()

    # Normalize values to 8-bit integers
    # (guard against a uniform lattice, where Max==Min)
    if Max==Min:
        SigmaInt=numpy.zeros(sigma.shape).astype('uint8')
    else:
        SigmaInt=(255*(sigma-Min)/(Max-Min)).astype('uint8')
    image = Image.fromarray(SigmaInt)
    image.save("%s.jpg" % prefix)

def Metropolis(sigma,T,J,B,iterations):
    start=time.time()

    SizeX,SizeY=sigma.shape

    for p in xrange(0,iterations):
        # Randomly chosen site
        X,Y=numpy.random.randint(SizeX),numpy.random.randint(SizeY)

        # Same energy balance as in the kernels: DeltaE=2*s*(J*neighbours+B)
        DeltaE=2.*sigma[X,Y]*(J*(sigma[X,(Y+1)%SizeY]+
                                 sigma[X,(Y-1)%SizeY]+
                                 sigma[(X-1)%SizeX,Y]+
                                 sigma[(X+1)%SizeX,Y])+B)

        if DeltaE < 0. or random() < exp(-DeltaE/T):
            sigma[X,Y]=-sigma[X,Y]
    duration=time.time()-start
    return(duration)
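
# Hypothetical standalone use (the main program below only calls the GPU
# paths): Metropolis(lattice,2.27,1.,0.,lattice.size) performs lattice.size
# single-spin updates near the critical temperature of the 2D model.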

def MetropolisAllOpenCL(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):

    # sigmaDict & TList are NOT respectively an array & a float:
    # sigmaDict : dict holding one lattice array per temperature
    # TList : list of temperatures

    # Initialize the variables with the proper CASTs

    # Detect a suitable device in the list of peripherals

    HasGPU=False
    Id=1
    # Primary device selection, based on Device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            if Id==Device and not HasGPU:
                GPU=device
                print "CPU/GPU selected: ",device.name
                HasGPU=True
            Id=Id+1
    # Default device selection, based on ALU type
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            if deviceType=="GPU" and Alu=="GPU" and not HasGPU:
                GPU=device
                print "GPU selected: ",device.name
                HasGPU=True
            if deviceType=="CPU" and Alu=="CPU" and not HasGPU:
                GPU=device
                print "CPU selected: ",device.name
                HasGPU=True

    # Create the context and the queue for its execution
    # ctx = cl.create_some_context()
    ctx = cl.Context([GPU])
    queue = cl.CommandQueue(ctx,
                            properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the possible flags for the buffers
    mf = cl.mem_flags

    # Concatenate all lattices in a single array, one slab per temperature
    sigma=numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=1)

    # The kernels read and write the lattice but only read the temperatures
    sigmaCL = cl.Buffer(ctx, mf.READ_WRITE|mf.COPY_HOST_PTR,hostbuf=sigma)
    TCL = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR,hostbuf=TList)

    MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
        options = "-cl-mad-enable -cl-fast-relaxed-math")

    SizeX,SizeY=sigmaDict[TList[0]].shape

    if ParaStyle=='Blocks':
        # Call the OpenCL kernel:
        # (jobs,) is the global work size, one work-item per temperature
        # None leaves the work-group size to the implementation
        # The two last uint32 seed the Marsaglia RNG
        # iterations is the number of Metropolis steps
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                             sigmaCL,
                                             TCL,
                                             numpy.float32(J),
                                             numpy.float32(B),
                                             numpy.uint32(SizeX),
                                             numpy.uint32(SizeY),
                                             numpy.uint32(iterations),
                                             numpy.uint32(nprnd(2**31-1)),
                                             numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Threads':
        # Here the local size is set equal to the global size:
        # all jobs land in one single work-group.
        # For example:
        # Global=10 & Local=10 : one group of 10 work-items
        # Global=10 & Local=1  : ten groups of 1 work-item
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                            sigmaCL,
                                            TCL,
                                            numpy.float32(J),
                                            numpy.float32(B),
                                            numpy.uint32(SizeX),
                                            numpy.uint32(SizeY),
                                            numpy.uint32(iterations),
                                            numpy.uint32(nprnd(2**31-1)),
                                            numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads=BestThreadsNumber(jobs)
        # Hybrid: the jobs work-items are split into work-groups of
        # `threads` items each (the global size must stay a multiple
        # of the local size)
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                             sigmaCL,
                                             TCL,
                                             numpy.float32(J),
                                             numpy.float32(B),
                                             numpy.uint32(SizeX),
                                             numpy.uint32(SizeY),
                                             numpy.uint32(iterations),
                                             numpy.uint32(nprnd(2**31-1)),
                                             numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
    sigmaCL.release()

    # Split the concatenated lattice back, one array per temperature
    results=numpy.split(sigma,len(TList),axis=1)
    for T in TList:
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return(elapsed)

def MetropolisAllCuda(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):

    # sigmaDict & TList are NOT respectively an array & a float:
    # sigmaDict : dict holding one lattice array per temperature
    # TList : list of temperatures

    # With PyCUDA autoinit, nothing else to set up!

    mod = SourceModule(KERNEL_CODE_CUDA)

    MetropolisBlocksCuda=mod.get_function("MainLoopGlobal")
    MetropolisThreadsCuda=mod.get_function("MainLoopLocal")
    MetropolisHybridCuda=mod.get_function("MainLoopHybrid")

    # Concatenate all lattices in a single array, one slab per temperature
    sigma=numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=1)

    # InOut copies the arrays to the device and back after the call
    sigmaCU=cuda.InOut(sigma)
    TCU=cuda.InOut(TList)

    SizeX,SizeY=sigmaDict[TList[0]].shape

    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
        # Call the CUDA kernel:
        # one block per temperature, a single thread per block
        # The two last uint32 seed the Marsaglia RNG
        # iterations is the number of Metropolis steps
        MetropolisBlocksCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31-1)),
                             numpy.uint32(nprnd(2**31-1)),
                             grid=(jobs,1),block=(1,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Threads':
        # A single block holding one thread per temperature
        MetropolisThreadsCuda(sigmaCU,
                              TCU,
                              numpy.float32(J),
                              numpy.float32(B),
                              numpy.uint32(SizeX),
                              numpy.uint32(SizeY),
                              numpy.uint32(iterations),
                              numpy.uint32(nprnd(2**31-1)),
                              numpy.uint32(nprnd(2**31-1)),
                              grid=(1,1),block=(jobs,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads=BestThreadsNumber(jobs)
        # Hybrid: jobs/threads blocks of `threads` threads each
        MetropolisHybridCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31-1)),
                             numpy.uint32(nprnd(2**31-1)),
                             grid=(int(jobs/threads),1),block=(int(threads),1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    stop.record()
    stop.synchronize()
    elapsed = start.time_till(stop)*1e-3

    # Split the concatenated lattice back, one array per temperature
    results=numpy.split(sigma,len(TList),axis=1)
    for T in TList:
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return(elapsed)

def Magnetization(sigma,M):
    # Mean spin per site (the second argument is unused)
    return(numpy.sum(sigma)/(sigma.shape[0]*sigma.shape[1]*1.0))

def Energy(sigma,J):
    # Copy and cast
    E=numpy.copy(sigma).astype(numpy.float32)

    # Sliced evaluation over the interior sites
    E[1:-1,1:-1]=-J*E[1:-1,1:-1]*(E[:-2,1:-1]+E[2:,1:-1]+
                                  E[1:-1,:-2]+E[1:-1,2:])

    # Clear the border properly
    E[:,0]=0
    E[:,-1]=0
    E[0,:]=0
    E[-1,:]=0

    Energy=numpy.sum(E)

    return(Energy/(E.shape[0]*E.shape[1]*1.0))
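
# The slice above evaluates -J*s_i*s_j from both ends of each interior
# bond, so the returned per-site value is roughly twice the bond energy.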

    
def DisplayCurves(T,E,M,J,B):

    plt.xlabel("Temperature")
    plt.ylabel("Energy & Magnetization")

    plt.plot(T,E,label="Energy")
    plt.plot(T,M,label="Magnetization")

    plt.legend()
    plt.show()

if __name__=='__main__':

    # Set default values
    # Alu can be CPU or GPU
    Alu='CPU'
    # Id of the device : 0
    Device=0
    # GPU style can be CUDA (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Parallel distribution can be on Threads, Blocks or Hybrid
    ParaStyle='Blocks'
    # Coupling factor
    J=1.
    # Magnetic Field
    B=0.
    # Size of Lattice
    Size=256
    # Default Temperatures (start, end, step)
    Tmin=0.1
    Tmax=5
    Tstep=0.1
    # Default Number of Iterations
    Iterations=Size*Size
    # Curves is True to print the curves
    Curves=False

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hcj:b:z:i:s:e:p:a:d:g:t:",["coupling=","magneticfield=","size=","iterations=","tempmin=","tempmax=","tempstep=","alu=","device=","gpustyle=","parastyle="])
    except getopt.GetoptError:
        print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks/Hybrid>' % sys.argv[0]
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks/Hybrid>' % sys.argv[0]
            sys.exit()
        elif opt == '-c':
            Curves=True
        elif opt in ("-j", "--coupling"):
            J = float(arg)
        elif opt in ("-b", "--magneticfield"):
            B = float(arg)
        elif opt in ("-s", "--tempmin"):
            Tmin = float(arg)
        elif opt in ("-e", "--tempmax"):
            Tmax = float(arg)
        elif opt in ("-p", "--tempstep"):
            Tstep = float(arg)
        elif opt in ("-i", "--iterations"):
            Iterations = int(arg)
        elif opt in ("-z", "--size"):
            Size = int(arg)
        elif opt in ("-a", "--alu"):
            Alu = arg
        elif opt in ("-d", "--device"):
            Device = int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-t", "--parastyle"):
            ParaStyle = arg
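
    # Example invocation (illustrative values):
    #   python Ising2D-GPU.py -z 128 -i 100000 -s 0.5 -e 4.0 -p 0.5 \
    #                         -g OpenCL -t Blocks -a GPU -c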

    if Alu=='CPU' and GpuStyle=='CUDA':
        print "Alu can't be CPU for CUDA, set Alu to GPU"
        Alu='GPU'

    if ParaStyle not in ('Blocks','Threads','Hybrid'):
        print "%s does not exist, ParaStyle set to Blocks !" % ParaStyle
        ParaStyle='Blocks'

    print "Compute unit : %s" % Alu
    print "Device Identification : %s" % Device
    print "GpuStyle used : %s" % GpuStyle
    print "Parallel Style used : %s" % ParaStyle
    print "Coupling Factor : %s" % J
    print "Magnetic Field : %s" % B
    print "Size of lattice : %s" % Size
    print "Iterations : %s" % Iterations
    print "Temperature on start : %s" % Tmin
    print "Temperature on end : %s" % Tmax
    print "Temperature step : %s" % Tstep

    if GpuStyle=='CUDA':
        # PyCUDA imports
        import pycuda.driver as cuda
        import pycuda.gpuarray as gpuarray
        import pycuda.autoinit
        from pycuda.compiler import SourceModule

    if GpuStyle=='OpenCL':
        # PyOpenCL import and device listing
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                deviceType=cl.device_type.to_string(device.type)
                print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
                Id=Id+1

    LAPIMAGE=False

    # Random initial lattice of +1/-1 spins
    sigmaIn=numpy.where(numpy.random.randn(Size,Size)>0,1,-1).astype(numpy.int8)

    ImageOutput(sigmaIn,"Ising2D_GPU_%i_Initial" % (Size))

    # The temperatures are passed as a parameter, mind the CAST!
    Trange=numpy.arange(Tmin,Tmax+Tstep,Tstep).astype(numpy.float32)

    E=[]
    M=[]

    sigma={}
    for T in Trange:
        sigma[T]=numpy.copy(sigmaIn)

    jobs=len(Trange)

    # On GPU, all processes are launched at once, one per temperature
    if GpuStyle=='CUDA':
        duration=MetropolisAllCuda(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)
    else:
        duration=MetropolisAllOpenCL(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)

    print "Best threads number for %i jobs : %i" % (jobs,BestThreadsNumber(jobs))
    print "Compute time for each job : %f" % (duration/len(Trange))

    for T in Trange:
        E=numpy.append(E,Energy(sigma[T],J))
        M=numpy.append(M,Magnetization(sigma[T],B))
        print "Total Energy at Temperature %f : %f" % (T,E[-1])
        print "Total Magnetization at Temperature %f : %f" % (T,M[-1])
        ImageOutput(sigma[T],"Ising2D_GPU_%i_%1.1f_Final" % (Size,T))

    if Curves:
        DisplayCurves(Trange,E,M,J,B)