#!/usr/bin/env python
#
# Ising2D model on GPU (OpenCL/CUDA), one lattice per temperature
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>

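# The script sweeps a range of temperatures: each temperature gets its own
# copy of the lattice, and all copies are updated in parallel on the device,
# one work-item (or thread/block) per temperature. A typical invocation,
# using the options parsed in __main__ below (values are illustrative only):
#
#   python Ising2D-GPU.py -z 256 -i 65536 -s 0.1 -e 5.0 -p 0.1 -g OpenCL -t Hybrid -c
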
import sys
import numpy
import math
from PIL import Image
from math import exp
from random import random
import time
import getopt
import matplotlib.pyplot as plt
from numpy.random import randint as nprnd

KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

// 2.328306435454494e-10f ~ 1/2^32: map a 32-bit draw to a float in [0,1)
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
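
// These macros expand in place and assume uint RNG state variables named
// z and w (jsr and jcong for SHR3 and CONG) in the caller's scope; each
// work-item keeps its own state and so draws its own stream.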

__kernel void MainLoopOne(__global char *s,float T,float J,float B,
                          uint sizex,uint sizey,
                          uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      // Pick a random site
      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*y];

      // Periodic neighbours: add sizex/sizey before subtracting 1 so the
      // unsigned index never wraps (plain (y-1)%sizey is only correct for
      // power-of-two sizes)
      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y+sizey-1)%sizey)];
      int l=s[((x+sizex-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      // Metropolis rule: flip if DeltaE<0, else with probability exp(-DeltaE/T)
      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
      s[x+sizex*y]=(char)(factor*p);
   }
   barrier(CLK_GLOBAL_MEM_FENCE);
}
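
// The three kernels below run one Metropolis chain per temperature:
// work-item ind owns the sizex*sizey slice of s starting at offset
// sizex*sizey*ind and reads its temperature from T[ind]; they differ
// only in how ind is derived from the NDRange.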

__kernel void MainLoopGlobal(__global char *s,__global float *T,float J,float B,
                             uint sizex,uint sizey,
                             uint iterations,uint seed_w,uint seed_z)
{
   // One work-item per temperature, with a distinct (crude) seed per item
   uint ind=get_global_id(0);
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)]=(char)(factor*p);
   }
   barrier(CLK_GLOBAL_MEM_FENCE);
}

__kernel void MainLoopHybrid(__global char *s,__global float *T,float J,float B,
                             uint sizex,uint sizey,
                             uint iterations,uint seed_w,uint seed_z)
{
   // Global index of the work-item (group id * local size + local id)
   uint ind=get_global_id(0);
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)]=(char)(factor*p);
   }
   barrier(CLK_GLOBAL_MEM_FENCE);
}

__kernel void MainLoopLocal(__global char *s,__global float *T,float J,float B,
                            uint sizex,uint sizey,
                            uint iterations,uint seed_w,uint seed_z)
{
   // One work-item per temperature inside a single work-group
   uint ind=get_local_id(0);
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)]=(char)(factor*p);
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   barrier(CLK_GLOBAL_MEM_FENCE);
}
"""

KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

// 2.328306435454494e-10f ~ 1/2^32: map a 32-bit draw to a float in [0,1)
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__global__ void MainLoopOne(char *s,float T,float J,float B,
                            uint sizex,uint sizey,
                            uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z;
   uint w=seed_w;

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*y];

      // Periodic neighbours, wrap-safe as in the OpenCL kernels
      int d=s[x+sizex*((y+1)%sizey)];
      int u=s[x+sizex*((y+sizey-1)%sizey)];
      int l=s[((x+sizex-1)%sizex)+sizex*y];
      int r=s[((x+1)%sizex)+sizex*y];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/T))) ? -1:1;
      s[x+sizex*y]=(char)(factor*p);
   }
   __syncthreads();
}

__global__ void MainLoopGlobal(char *s,float *T,float J,float B,
                               uint sizex,uint sizey,
                               uint iterations,uint seed_w,uint seed_z)
{
   // One block per temperature, with a distinct (crude) seed per block
   uint ind=blockIdx.x;
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)]=(char)(factor*p);
   }
   __syncthreads();
}

__global__ void MainLoopHybrid(char *s,float *T,float J,float B,
                               uint sizex,uint sizey,
                               uint iterations,uint seed_w,uint seed_z)
{
   // Global thread index (block id * block size + thread id)
   uint ind=blockDim.x*blockIdx.x+threadIdx.x;
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)]=(char)(factor*p);
   }
   __syncthreads();
}

__global__ void MainLoopLocal(char *s,float *T,float J,float B,
                              uint sizex,uint sizey,
                              uint iterations,uint seed_w,uint seed_z)
{
   // One thread per temperature inside a single block
   uint ind=threadIdx.x;
   uint z=seed_z/(ind+1);
   uint w=seed_w/(ind+1);
   float t=T[ind];

   for (uint i=0;i<iterations;i++) {

      uint x=(uint)(MWC%sizex);
      uint y=(uint)(MWC%sizey);

      int p=s[x+sizex*(y+sizey*ind)];

      int d=s[x+sizex*((y+1)%sizey+sizey*ind)];
      int u=s[x+sizex*((y+sizey-1)%sizey+sizey*ind)];
      int l=s[((x+sizex-1)%sizex)+sizex*(y+sizey*ind)];
      int r=s[((x+1)%sizex)+sizex*(y+sizey*ind)];

      float DeltaE=2.0f*p*(J*(u+d+l+r)+B);

      int factor=((DeltaE < 0.0f) || (MWCfp < exp(-DeltaE/t))) ? -1:1;
      s[x+sizex*(y+sizey*ind)]=(char)(factor*p);
   }
   __syncthreads();
}
"""

# Find the prime factors of a number
# From the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            x/=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist
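
# For instance, PrimeFactors(50) yields the factors [2, 5, 5].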

# Try to find the best thread count for the Hybrid approach (Blocks&Threads);
# the output is the number of threads per block
def BestThreadsNumber(jobs):
    factors=PrimeFactors(jobs)
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs:
            break
    return(long(threads))
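
# Worked example: for jobs=50, PrimeFactors gives [2, 5, 5]; the interleaved
# scan multiplies 2, then 5 (threads=10, and 10*10>50 stops it), so
# BestThreadsNumber(50) returns 10 and the Hybrid launch uses 5 blocks of
# 10 threads.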

def ImageOutput(sigma,prefix):
    Max=sigma.max()
    Min=sigma.min()

    # Normalize values to 8-bit integers; guard against a uniform lattice,
    # where Max==Min would divide by zero
    if Max==Min:
        SigmaInt=numpy.zeros(sigma.shape,dtype='uint8')
    else:
        SigmaInt=(255*(sigma-Min)/(Max-Min)).astype('uint8')
    image = Image.fromarray(SigmaInt)
    image.save("%s.jpg" % prefix)

def Metropolis(sigma,T,J,B,iterations):
    start=time.time()

    SizeX,SizeY=sigma.shape

    for p in xrange(0,iterations):
        # Random lattice coordinates
        X,Y=numpy.random.randint(SizeX),numpy.random.randint(SizeY)

        # Same energy difference as in the GPU kernels:
        # DeltaE = 2*s*(J*(sum of the 4 neighbours)+B)
        DeltaE=2.*sigma[X,Y]*(J*(sigma[X,(Y+1)%SizeY]+
                                 sigma[X,(Y-1)%SizeY]+
                                 sigma[(X-1)%SizeX,Y]+
                                 sigma[(X+1)%SizeX,Y])+B)

        if DeltaE < 0. or random() < exp(-DeltaE/T):
            sigma[X,Y]=-sigma[X,Y]
    duration=time.time()-start
    return(duration)
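
# Note: Metropolis() above is the serial reference; the __main__ section
# below only calls the MetropolisAll* GPU implementations.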

def MetropolisAllOpenCL(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):

    # Unlike the serial Metropolis(), sigmaDict & TList are not an array & a float:
    # sigmaDict : dict holding one lattice (array) per temperature
    # TList : list of temperatures

    # Initialise the variables with the proper casts

    # Detect the device whose Id was requested in the device list

    HasGPU=False
    Id=1
    # Primary device selection based on Device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            #deviceType=cl.device_type.to_string(device.type)
            deviceType="xPU"
            if Id==Device and not HasGPU:
                GPU=device
                print "CPU/GPU selected: ",device.name
                HasGPU=True
            Id=Id+1

    # Create the context and the queue for its execution
    # ctx = cl.create_some_context()
    ctx = cl.Context([GPU])
    queue = cl.CommandQueue(ctx,
                            properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf = cl.mem_flags

    # Concatenate all lattices in a single array, along axis 0 so each
    # temperature's lattice stays contiguous in memory, matching the
    # sizex*sizey*ind offsets used by the kernels
    sigma=numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=0)

    # The kernels read & write the lattices but only read the temperatures
    sigmaCL = cl.Buffer(ctx, mf.READ_WRITE|mf.COPY_HOST_PTR,hostbuf=sigma)
    TCL = cl.Buffer(ctx, mf.READ_ONLY|mf.COPY_HOST_PTR,hostbuf=TList)

    MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
        options = "-cl-mad-enable -cl-fast-relaxed-math")

    SizeX,SizeY=sigmaDict[TList[0]].shape

    if ParaStyle=='Blocks':
        # Call the OpenCL kernel:
        # (jobs,) is the global work size, one work-item per temperature
        # None lets the implementation choose the local work size
        # the two last arguments seed the Marsaglia RNG
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                             sigmaCL,
                                             TCL,
                                             numpy.float32(J),
                                             numpy.float32(B),
                                             numpy.uint32(SizeX),
                                             numpy.uint32(SizeY),
                                             numpy.uint32(iterations),
                                             numpy.uint32(nprnd(2**31-1)),
                                             numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Threads':
        # Here the local work size equals the global one, so all jobs land
        # in a single work-group, i.e. on a single compute unit:
        # G_ID=10 & L_ID=10 : 10 work-items on 1 compute unit
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                            sigmaCL,
                                            TCL,
                                            numpy.float32(J),
                                            numpy.float32(B),
                                            numpy.uint32(SizeX),
                                            numpy.uint32(SizeY),
                                            numpy.uint32(iterations),
                                            numpy.uint32(nprnd(2**31-1)),
                                            numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads=BestThreadsNumber(jobs)
        # In OpenCL the global size must be a multiple of the local size
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                             sigmaCL,
                                             TCL,
                                             numpy.float32(J),
                                             numpy.float32(B),
                                             numpy.uint32(SizeX),
                                             numpy.uint32(SizeY),
                                             numpy.uint32(iterations),
                                             numpy.uint32(nprnd(2**31-1)),
                                             numpy.uint32(nprnd(2**31-1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
    sigmaCL.release()
    TCL.release()

    # Split the big array back into one lattice per temperature
    results=numpy.split(sigma,len(TList),axis=0)
    for T in TList:
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return(elapsed)

def MetropolisAllCuda(sigmaDict,TList,J,B,iterations,jobs,ParaStyle,Alu,Device):

    # Unlike the serial Metropolis(), sigmaDict & TList are not an array & a float:
    # sigmaDict : dict holding one lattice (array) per temperature
    # TList : list of temperatures

    # With PyCUDA autoinit, nothing else to do!

    mod = SourceModule(KERNEL_CODE_CUDA)

    MetropolisBlocksCuda=mod.get_function("MainLoopGlobal")
    MetropolisThreadsCuda=mod.get_function("MainLoopLocal")
    MetropolisHybridCuda=mod.get_function("MainLoopHybrid")

    # Concatenate all lattices in a single array, along axis 0 so each
    # temperature's lattice stays contiguous in memory, matching the
    # sizex*sizey*ind offsets used by the kernels
    sigma=numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma=numpy.concatenate((sigma,sigmaDict[T]),axis=0)

    # InOut copies the lattices to the device and back after the launch;
    # the temperatures are only read
    sigmaCU=cuda.InOut(sigma)
    TCU=cuda.In(TList)

    SizeX,SizeY=sigmaDict[TList[0]].shape

    # Time the launch with CUDA events (pycuda.driver is imported as cuda)
    start = cuda.Event()
    stop = cuda.Event()

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
        # One block per temperature, a single thread per block;
        # the two last arguments seed the Marsaglia RNG
        MetropolisBlocksCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31-1)),
                             numpy.uint32(nprnd(2**31-1)),
                             grid=(jobs,1),block=(1,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Threads':
        # A single block holding one thread per temperature
        MetropolisThreadsCuda(sigmaCU,
                              TCU,
                              numpy.float32(J),
                              numpy.float32(B),
                              numpy.uint32(SizeX),
                              numpy.uint32(SizeY),
                              numpy.uint32(iterations),
                              numpy.uint32(nprnd(2**31-1)),
                              numpy.uint32(nprnd(2**31-1)),
                              grid=(1,1),block=(jobs,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads=BestThreadsNumber(jobs)
        MetropolisHybridCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31-1)),
                             numpy.uint32(nprnd(2**31-1)),
                             grid=(jobs/threads,1),block=(threads,1,1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    stop.record()
    stop.synchronize()
    elapsed = start.time_till(stop)*1e-3

    # Split the big array back into one lattice per temperature
    results=numpy.split(sigma,len(TList),axis=0)
    for T in TList:
        sigmaDict[T]=numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return(elapsed)

def Magnetization(sigma,M):
    # Mean spin per site (M is unused, kept for symmetry with Energy)
    return(numpy.sum(sigma)/(sigma.shape[0]*sigma.shape[1]*1.0))

def Energy(sigma,J):
    # Copy and cast
    E=numpy.copy(sigma).astype(numpy.float32)

    # Slice-based evaluation on the interior sites
    E[1:-1,1:-1]=-J*E[1:-1,1:-1]*(E[:-2,1:-1]+E[2:,1:-1]+
                                  E[1:-1,:-2]+E[1:-1,2:])

    # Clear the borders
    E[:,0]=0
    E[:,-1]=0
    E[0,:]=0
    E[-1,:]=0

    Energy=numpy.sum(E)

    return(Energy/(E.shape[0]*E.shape[1]*1.0))
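
# Note: the slice evaluation above counts each interior bond twice and drops
# the boundary sites, so the returned value is a per-site estimate of the
# lattice energy rather than its exact periodic value.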

def DisplayCurves(T,E,M,J,B):

    plt.xlabel("Temperature")
    plt.ylabel("Energy & Magnetization")

    plt.plot(T,E,label="Energy")
    plt.plot(T,M,label="Magnetization")

    plt.legend()
    plt.show()

if __name__=='__main__':

    # Set default values
    # Alu can be CPU or GPU
    Alu='CPU'
    # Id of the device : devices are numbered from 1
    Device=1
    # GPU style can be CUDA (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Parallel distribution can be on Threads, Blocks or Hybrid
    ParaStyle='Blocks'
    # Coupling factor
    J=1.
    # Magnetic Field
    B=0.
    # Size of Lattice
    Size=256
    # Default Temperatures (start, end, step)
    Tmin=0.1
    Tmax=5
    Tstep=0.1
    # Default Number of Iterations
    Iterations=Size*Size
    # Curves is True to print the curves
    Curves=False

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hcj:b:z:i:s:e:p:a:d:g:t:",["coupling=","magneticfield=","size=","iterations=","tempstart=","tempend=","tempstep=","alu=","device=","gpustyle=","parastyle="])
    except getopt.GetoptError:
        print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks/Hybrid>' % sys.argv[0]
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print '%s -j <Coupling Factor> -b <Magnetic Field> -z <Size of Lattice> -i <Iterations> -s <Minimum Temperature> -e <Maximum Temperature> -p <steP Temperature> -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -t <Threads/Blocks/Hybrid>' % sys.argv[0]
            sys.exit()
        elif opt == '-c':
            Curves=True
        elif opt in ("-j", "--coupling"):
            J = float(arg)
        elif opt in ("-b", "--magneticfield"):
            B = float(arg)
        elif opt in ("-s", "--tempstart"):
            Tmin = float(arg)
        elif opt in ("-e", "--tempend"):
            Tmax = float(arg)
        elif opt in ("-p", "--tempstep"):
            Tstep = float(arg)
        elif opt in ("-i", "--iterations"):
            Iterations = int(arg)
        elif opt in ("-z", "--size"):
            Size = int(arg)
        elif opt in ("-a", "--alu"):
            Alu = arg
        elif opt in ("-d", "--device"):
            Device = int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-t", "--parastyle"):
            ParaStyle = arg

    if Alu=='CPU' and GpuStyle=='CUDA':
        print "Alu can't be CPU for CUDA, set Alu to GPU"
        Alu='GPU'

    if ParaStyle not in ('Blocks','Threads','Hybrid'):
        print "%s does not exist, ParaStyle set to Blocks !" % ParaStyle
        ParaStyle='Blocks'

    print "Compute unit : %s" % Alu
    print "Device Identification : %s" % Device
    print "GpuStyle used : %s" % GpuStyle
    print "Parallel Style used : %s" % ParaStyle
    print "Coupling Factor : %s" % J
    print "Magnetic Field : %s" % B
    print "Size of lattice : %s" % Size
    print "Iterations : %s" % Iterations
    print "Temperature on start : %s" % Tmin
    print "Temperature on end : %s" % Tmax
    print "Temperature step : %s" % Tstep

    if GpuStyle=='CUDA':
        # PyCUDA imports (autoinit creates the context on the default device)
        import pycuda.driver as cuda
        import pycuda.gpuarray as gpuarray
        import pycuda.autoinit
        from pycuda.compiler import SourceModule

    if GpuStyle=='OpenCL':
        # PyOpenCL import and device listing (Ids start at 1)
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                #deviceType=cl.device_type.to_string(device.type)
                deviceType="xPU"
                print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
                Id=Id+1

    LAPIMAGE=False

    # Random initial lattice of spins +1/-1
    sigmaIn=numpy.where(numpy.random.randn(Size,Size)>0,1,-1).astype(numpy.int8)

    ImageOutput(sigmaIn,"Ising2D_GPU_%i_Initial" % (Size))

    # The temperatures are passed as a kernel argument, mind the cast!
    Trange=numpy.arange(Tmin,Tmax+Tstep,Tstep).astype(numpy.float32)

    E=[]
    M=[]

    # One copy of the initial lattice per temperature
    sigma={}
    for T in Trange:
        sigma[T]=numpy.copy(sigmaIn)

    jobs=len(Trange)
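
    # With the default sweep (0.1 to 5 by 0.1) this yields 50 temperatures,
    # i.e. 50 independent lattices updated in parallel by a single launch.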

    # For GPU, all jobs are launched at once
    if GpuStyle=='CUDA':
        duration=MetropolisAllCuda(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)
    else:
        duration=MetropolisAllOpenCL(sigma,Trange,J,B,Iterations,jobs,ParaStyle,Alu,Device)

    print "Best thread number for %i jobs : %i" % (jobs,BestThreadsNumber(jobs))

    print "Device time for each lattice : %f" % (duration/len(Trange))
    for T in Trange:
        E=numpy.append(E,Energy(sigma[T],J))
        M=numpy.append(M,Magnetization(sigma[T],B))
        print "Total Energy at Temperature %f : %f" % (T,E[-1])
        print "Total Magnetization at Temperature %f : %f" % (T,M[-1])
        ImageOutput(sigma[T],"Ising2D_GPU_%i_%1.1f_Final" % (Size,T))

    if Curves:
        DisplayCurves(Trange,E,M,J,B)