Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py @ 80

Historique | Voir | Annoter | Télécharger (26,47 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import math
23
from socket import gethostname
24

    
25
Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
26
Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
27

    
28
# find prime factors of a number
29
# Get for WWW :
30
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
31
def PrimeFactors(x):
32
  factorlist=numpy.array([]).astype('uint32')
33
  loop=2
34
  while loop<=x:
35
    if x%loop==0:
36
      x/=loop
37
      factorlist=numpy.append(factorlist,[loop])
38
    else:
39
      loop+=1
40
  return factorlist
41

    
42
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
43
# output is thread number
44
def BestThreadsNumber(jobs):
45
  factors=PrimeFactors(jobs)
46
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
47
  threads=1
48
  for factor in matrix.transpose().ravel():
49
    threads=threads*factor
50
    if threads*threads>jobs or threads>512:
51
      break
52
  return(long(threads))
53

    
54
# Predicted Amdahl Law (Reduced with s=1-p)  
55
def AmdahlR(N, T1, p):
56
  return (T1*(1-p+p/N))
57

    
58
# Predicted Amdahl Law
59
def Amdahl(N, T1, s, p):
60
  return (T1*(s+p/N))
61

    
62
# Predicted Mylq Law with first order
63
def Mylq(N, T1,s,c,p):
64
  return (T1*(s+p/N)+c*N)
65

    
66
# Predicted Mylq Law with second order
67
def Mylq2(N, T1,s,c1,c2,p):
68
  return (T1*(s+p/N)+c1*N+c2*N*N)
69

    
70
KERNEL_CODE_CUDA="""
71
#define TCONG 0
72
#define TSHR3 1
73
#define TMWC 2
74
#define TKISS 3
75

76
#define TINT32 0
77
#define TINT64 1
78
#define TFP32 2
79
#define TFP64 3
80

81
// Marsaglia RNG very simple implementation
82

83
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
84
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
85
#define MWC   (znew+wnew)
86
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
87
#define CONG  (jcong=69069*jcong+1234567)
88
#define KISS  ((MWC^CONG)+SHR3)
89

90
#define MWCfp MWC * 2.328306435454494e-10f
91
#define KISSfp KISS * 2.328306435454494e-10f
92
#define SHR3fp SHR3 * 2.328306435454494e-10f
93
#define CONGfp CONG * 2.328306435454494e-10f
94

95
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
96
{
97

98
#if TRNG == TCONG
99
   uint jcong=seed_z+work;
100
#elif TRNG == TSHR3
101
   uint jsr=seed_w+work;
102
#elif TRNG == TMWC
103
   uint z=seed_z+work;
104
   uint w=seed_w+work;
105
#elif TRNG == TKISS
106
   uint jcong=seed_z+work;
107
   uint jsr=seed_w+work;
108
   uint z=seed_z-work;
109
   uint w=seed_w-work;
110
#endif
111

112
   ulong total=0;
113

114
   for (ulong i=0;i<iterations;i++) {
115

116
#if TYPE == TINT32
117
    #define THEONE 1073741824
118
    #if TRNG == TCONG
119
        uint x=CONG>>17 ;
120
        uint y=CONG>>17 ;
121
    #elif TRNG == TSHR3
122
        uint x=SHR3>>17 ;
123
        uint y=SHR3>>17 ;
124
    #elif TRNG == TMWC
125
        uint x=MWC>>17 ;
126
        uint y=MWC>>17 ;
127
    #elif TRNG == TKISS
128
        uint x=KISS>>17 ;
129
        uint y=KISS>>17 ;
130
    #endif
131
#elif TYPE == TINT64
132
    #define THEONE 4611686018427387904
133
    #if TRNG == TCONG
134
        ulong x=(ulong)(CONG>>1) ;
135
        ulong y=(ulong)(CONG>>1) ;
136
    #elif TRNG == TSHR3
137
        ulong x=(ulong)(SHR3>>1) ;
138
        ulong y=(ulong)(SHR3>>1) ;
139
    #elif TRNG == TMWC
140
        ulong x=(ulong)(MWC>>1) ;
141
        ulong y=(ulong)(MWC>>1) ;
142
    #elif TRNG == TKISS
143
        ulong x=(ulong)(KISS>>1) ;
144
        ulong y=(ulong)(KISS>>1) ;
145
    #endif
146
#elif TYPE == TFP32
147
    #define THEONE 1.0f
148
    #if TRNG == TCONG
149
        float x=CONGfp ;
150
        float y=CONGfp ;
151
    #elif TRNG == TSHR3
152
        float x=SHR3fp ;
153
        float y=SHR3fp ;
154
    #elif TRNG == TMWC
155
        float x=MWCfp ;
156
        float y=MWCfp ;
157
    #elif TRNG == TKISS
158
      float x=KISSfp ;
159
      float y=KISSfp ;
160
    #endif
161
#elif TYPE == TFP64
162
    #define THEONE 1.0f
163
    #if TRNG == TCONG
164
        double x=(double)CONGfp ;
165
        double y=(double)CONGfp ;
166
    #elif TRNG == TSHR3
167
        double x=(double)SHR3fp ;
168
        double y=(double)SHR3fp ;
169
    #elif TRNG == TMWC
170
        double x=(double)MWCfp ;
171
        double y=(double)MWCfp ;
172
    #elif TRNG == TKISS
173
        double x=(double)KISSfp ;
174
        double y=(double)KISSfp ;
175
    #endif
176
#endif
177

178
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
179
      total+=inside;
180
   }
181

182
   return(total);
183
}
184

185
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
186
{
187
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
188
   s[blockIdx.x]=total;
189
   __syncthreads();
190

191
}
192

193
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
194
{
195
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
196
   s[threadIdx.x]=total;
197
   __syncthreads();
198

199
}
200

201
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
202
{
203
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
204
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
205
   __syncthreads();
206
}
207

208
"""
209

    
210
KERNEL_CODE_OPENCL="""
211
#define TCONG 0
212
#define TSHR3 1
213
#define TMWC 2
214
#define TKISS 3
215

216
#define TINT32 0
217
#define TINT64 1
218
#define TFP32 2
219
#define TFP64 3
220

221
// Marsaglia RNG very simple implementation
222
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
223
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
224

225
#define MWC   (znew+wnew)
226
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
227
#define CONG  (jcong=69069*jcong+1234567)
228
#define KISS  ((MWC^CONG)+SHR3)
229

230
#define MWCfp MWC * 2.328306435454494e-10f
231
#define KISSfp KISS * 2.328306435454494e-10f
232
#define CONGfp CONG * 2.328306435454494e-10f
233
#define SHR3fp SHR3 * 2.328306435454494e-10f
234

235
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
236
{
237

238
#if TRNG == TCONG
239
   uint jcong=seed_z+work;
240
#elif TRNG == TSHR3
241
   uint jsr=seed_w+work;
242
#elif TRNG == TMWC
243
   uint z=seed_z+work;
244
   uint w=seed_w+work;
245
#elif TRNG == TKISS
246
   uint jcong=seed_z+work;
247
   uint jsr=seed_w+work;
248
   uint z=seed_z-work;
249
   uint w=seed_w-work;
250
#endif
251

252
   ulong total=0;
253

254
   for (ulong i=0;i<iterations;i++) {
255

256
#if TYPE == TINT32
257
    #define THEONE 1073741824
258
    #if TRNG == TCONG
259
        uint x=CONG>>17 ;
260
        uint y=CONG>>17 ;
261
    #elif TRNG == TSHR3
262
        uint x=SHR3>>17 ;
263
        uint y=SHR3>>17 ;
264
    #elif TRNG == TMWC
265
        uint x=MWC>>17 ;
266
        uint y=MWC>>17 ;
267
    #elif TRNG == TKISS
268
        uint x=KISS>>17 ;
269
        uint y=KISS>>17 ;
270
    #endif
271
#elif TYPE == TINT64
272
    #define THEONE 4611686018427387904
273
    #if TRNG == TCONG
274
        ulong x=(ulong)(CONG>>1) ;
275
        ulong y=(ulong)(CONG>>1) ;
276
    #elif TRNG == TSHR3
277
        ulong x=(ulong)(SHR3>>1) ;
278
        ulong y=(ulong)(SHR3>>1) ;
279
    #elif TRNG == TMWC
280
        ulong x=(ulong)(MWC>>1) ;
281
        ulong y=(ulong)(MWC>>1) ;
282
    #elif TRNG == TKISS
283
        ulong x=(ulong)(KISS>>1) ;
284
        ulong y=(ulong)(KISS>>1) ;
285
    #endif
286
#elif TYPE == TFP32
287
    #define THEONE 1.0f
288
    #if TRNG == TCONG
289
        float x=CONGfp ;
290
        float y=CONGfp ;
291
    #elif TRNG == TSHR3
292
        float x=SHR3fp ;
293
        float y=SHR3fp ;
294
    #elif TRNG == TMWC
295
        float x=MWCfp ;
296
        float y=MWCfp ;
297
    #elif TRNG == TKISS
298
      float x=KISSfp ;
299
      float y=KISSfp ;
300
    #endif
301
#elif TYPE == TFP64
302
#pragma OPENCL EXTENSION cl_khr_fp64: enable
303
    #define THEONE 1.0f
304
    #if TRNG == TCONG
305
        double x=(double)CONGfp ;
306
        double y=(double)CONGfp ;
307
    #elif TRNG == TSHR3
308
        double x=(double)SHR3fp ;
309
        double y=(double)SHR3fp ;
310
    #elif TRNG == TMWC
311
        double x=(double)MWCfp ;
312
        double y=(double)MWCfp ;
313
    #elif TRNG == TKISS
314
        double x=(double)KISSfp ;
315
        double y=(double)KISSfp ;
316
    #endif
317
#endif
318

319
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
320
      total+=inside;
321
   }
322
   
323
   return(total);
324
}
325

326
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
327
{
328
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
329
   barrier(CLK_GLOBAL_MEM_FENCE);
330
   s[get_global_id(0)]=total;     
331
}
332

333

334
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
335
{
336
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
337
   barrier(CLK_LOCAL_MEM_FENCE);
338
   s[get_local_id(0)]=total;
339
}
340

341
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
342
{
343
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
344
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
345
   s[get_global_id(0)]=total;
346
}
347

348
"""
349

    
350
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,RNG,ValueType):
351

    
352
  # Avec PyCUDA autoinit, rien a faire !
353
  
354
  circleCU = cuda.InOut(circle)
355

    
356
  try:
357
    mod = SourceModule(KERNEL_CODE_CUDA,options=['--compiler-options','-Wall -DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
358
  except:
359
    print "Compilation seems to brake"
360
  
361
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
362
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
363
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
364
  
365
  start = pycuda.driver.Event()
366
  stop = pycuda.driver.Event()
367
  
368
  MyPi=numpy.zeros(steps)
369
  MyDuration=numpy.zeros(steps)
370

    
371
  if iterations%jobs==0:
372
    iterationsCL=numpy.uint64(iterations/jobs)
373
    iterationsNew=iterationsCL*jobs
374
  else:
375
    iterationsCL=numpy.uint64(iterations/jobs+1)
376
    iterationsNew=iterations
377

    
378
  for i in range(steps):
379
    start.record()
380
    start.synchronize()
381
    if ParaStyle=='Blocks':
382
      MetropolisBlocksCU(circleCU,
383
                         numpy.uint64(iterationsCL),
384
                         numpy.uint32(nprnd(2**30/jobs)),
385
                         numpy.uint32(nprnd(2**30/jobs)),
386
                         grid=(jobs,1),
387
                         block=(1,1,1))
388
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
389
            (Alu,jobs,1,ParaStyle)      
390
    elif ParaStyle=='Hybrid':
391
      threads=BestThreadsNumber(jobs)
392
      MetropolisHybridCU(circleCU,
393
                         numpy.uint64(iterationsCL),
394
                         numpy.uint32(nprnd(2**30/jobs)),
395
                         numpy.uint32(nprnd(2**30/jobs)),
396
                         grid=(jobs,1),
397
                         block=(threads,1,1))
398
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
399
            (Alu,jobs/threads,threads,ParaStyle)
400
    else:
401
      MetropolisJobsCU(circleCU,
402
                       numpy.uint64(iterationsCL),
403
                       numpy.uint32(nprnd(2**30/jobs)),
404
                       numpy.uint32(nprnd(2**30/jobs)),
405
                       grid=(1,1),
406
                       block=(jobs,1,1))
407
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
408
            (Alu,jobs,1,ParaStyle)
409
    stop.record()
410
    stop.synchronize()
411
                
412
    elapsed = start.time_till(stop)*1e-3
413

    
414
    MyDuration[i]=elapsed
415
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
416
    MyPi[i]=numpy.median(AllPi)
417
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
418

    
419

    
420
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)
421

    
422
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))
423

    
424

    
425
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
426
                     RNG,ValueType):
427
        
428
  # Initialisation des variables en les CASTant correctement
429
    
430
  if Device==0:
431
    print "Enter XPU selector based on ALU type: first selected"
432
    HasXPU=False
433
    # Default Device selection based on ALU Type
434
    for platform in cl.get_platforms():
435
      for device in platform.get_devices():
436
        deviceType=cl.device_type.to_string(device.type)
437
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
438
          XPU=device
439
          print "GPU selected: ",device.name
440
          HasXPU=True
441
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
442
          XPU=device
443
          print "CPU selected: ",device.name
444
          HasXPU=True
445
  else:
446
    print "Enter XPU selector based on device number & ALU type"
447
    Id=1
448
    HasXPU=False
449
    # Primary Device selection based on Device Id
450
    for platform in cl.get_platforms():
451
      for device in platform.get_devices():
452
        deviceType=cl.device_type.to_string(device.type)
453
        if Id==Device and Alu==deviceType and HasXPU==False:
454
          XPU=device
455
          print "CPU/GPU selected: ",device.name.lstrip()
456
          HasXPU=True
457
        Id=Id+1
458
    if HasXPU==False:
459
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
460
          (Device,Alu,Id-1)
461
      return(0,0,0)
462
                                
463
  # Je cree le contexte et la queue pour son execution
464
  ctx = cl.Context([XPU])
465
  queue = cl.CommandQueue(ctx,
466
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
467

    
468
  # Je recupere les flag possibles pour les buffers
469
  mf = cl.mem_flags
470
        
471
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
472

    
473
  
474
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
475
    options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG],Computing[ValueType]))
476

    
477
  i=0
478

    
479
  MyPi=numpy.zeros(steps)
480
  MyDuration=numpy.zeros(steps)
481
  
482
  if iterations%jobs==0:
483
    iterationsCL=numpy.uint64(iterations/jobs)
484
    iterationsNew=numpy.uint64(iterationsCL*jobs)
485
  else:
486
    iterationsCL=numpy.uint64(iterations/jobs+1)
487
    iterationsNew=numpy.uint64(iterations)
488

    
489
  for i in range(steps):
490
                
491
    if ParaStyle=='Blocks':
492
      # Call OpenCL kernel
493
      # (1,) is Global work size (only 1 work size)
494
      # (1,) is local work size
495
      # circleCL is lattice translated in CL format
496
      # SeedZCL is lattice translated in CL format
497
      # SeedWCL is lattice translated in CL format
498
      # step is number of iterations
499
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
500
                                           circleCL,
501
                                           numpy.uint64(iterationsCL),
502
                                           numpy.uint32(nprnd(2**30/jobs)),
503
                                           numpy.uint32(nprnd(2**30/jobs)))
504
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
505
            (Alu,jobs,1,ParaStyle)
506
    elif ParaStyle=='Hybrid':
507
      threads=BestThreadsNumber(jobs)
508
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
509
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
510
                                          circleCL,
511
                                          numpy.uint64(iterationsCL),
512
                                          numpy.uint32(nprnd(2**30/jobs)),
513
                                          numpy.uint32(nprnd(2**30/jobs)))
514
        
515
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
516
            (Alu,jobs/threads,threads,ParaStyle)
517
    else:
518
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
519
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
520
                                          circleCL,
521
                                          numpy.uint64(iterationsCL),
522
                                          numpy.uint32(nprnd(2**30/jobs)),
523
                                          numpy.uint32(nprnd(2**30/jobs)))
524
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
525

    
526
    CLLaunch.wait()
527
    cl.enqueue_copy(queue, circle, circleCL).wait()
528

    
529
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
530

    
531
    print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
532
    MyDuration[i]=elapsed
533
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
534
    MyPi[i]=numpy.median(AllPi)
535
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
536

    
537
  circleCL.release()
538

    
539
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)
540
        
541
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))
542

    
543

    
544
def FitAndPrint(N,D,Curves):
545

    
546
  from scipy.optimize import curve_fit
547
  import matplotlib.pyplot as plt
548

    
549
  try:
550
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
551

    
552
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
553
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
554
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
555
    coeffs_Amdahl[0]=D[0]
556
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
557
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
558
  except:
559
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
560

    
561
  try:
562
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
563

    
564
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
565
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
566
    coeffs_AmdahlR[0]=D[0]
567
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
568
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
569

    
570
  except:
571
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
572

    
573
  try:
574
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
575

    
576
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
577
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
578
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
579
    coeffs_Mylq[0]=D[0]
580
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
581
                                                            coeffs_Mylq[1],
582
                                                            coeffs_Mylq[3],
583
                                                            coeffs_Mylq[2])
584
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
585
                coeffs_Mylq[3])
586
  except:
587
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
588

    
589
  try:
590
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
591

    
592
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
593
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
594
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
595
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
596
    coeffs_Mylq2[0]=D[0]
597
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
598
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
599
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
600

    
601
  except:
602
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
603

    
604
  if Curves:
605
    plt.xlabel("Number of Threads/work Items")
606
    plt.ylabel("Total Elapsed Time")
607

    
608
    Experience,=plt.plot(N,D,'ro') 
609
    try:
610
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
611
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
612
    except:
613
      print "Fit curves seem not to be available"
614

    
615
    plt.legend()
616
    plt.show()
617

    
618
if __name__=='__main__':
619

    
620
  # Set defaults values
621
  
622
  # Alu can be CPU, GPU or ACCELERATOR
623
  Alu='CPU'
624
  # Id of GPU : 1 is for first find !
625
  Device=0
626
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
627
  GpuStyle='OpenCL'
628
  # Parallel distribution can be on Threads or Blocks
629
  ParaStyle='Blocks'
630
  # Iterations is integer
631
  Iterations=100000000
632
  # JobStart in first number of Jobs to explore
633
  JobStart=1
634
  # JobEnd is last number of Jobs to explore
635
  JobEnd=16
636
  # JobStep is the step of Jobs to explore
637
  JobStep=1
638
  # Redo is the times to redo the test to improve metrology
639
  Redo=1
640
  # OutMetrology is method for duration estimation : False is GPU inside
641
  OutMetrology=False
642
  Metrology='InMetro'
643
  # Curves is True to print the curves
644
  Curves=False
645
  # Fit is True to print the curves
646
  Fit=False
647
  # Marsaglia RNG
648
  RNG='KISS'
649
  # Value type : INT32, INT64, FP32, FP64
650
  ValueType='INT32'
651
  
652
  try:
653
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:v:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","marsaglia=","valuetype="])
654
  except getopt.GetoptError:
655
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64> ' % sys.argv[0]
656
    sys.exit(2)
657
    
658
  for opt, arg in opts:
659
    if opt == '-h':
660
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>' % sys.argv[0]
661

    
662
      print "\nInformations about devices detected under OpenCL:"
663
      # For PyOpenCL import
664
      try:
665
        import pyopencl as cl
666
        Id=1
667
        for platform in cl.get_platforms():
668
          for device in platform.get_devices():
669
            deviceType=cl.device_type.to_string(device.type)
670
            print "Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())
671
            Id=Id+1
672

    
673
        print
674
        sys.exit()
675
      except ImportError:
676
        print "Your platform does not seem to support OpenCL"
677
        
678
    elif opt == '-o':
679
      OutMetrology=True
680
      Metrology='OutMetro'
681
    elif opt == '-c':
682
      Curves=True
683
    elif opt == '-f':
684
      Fit=True
685
    elif opt in ("-a", "--alu"):
686
      Alu = arg
687
    elif opt in ("-d", "--device"):
688
      Device = int(arg)
689
    elif opt in ("-g", "--gpustyle"):
690
      GpuStyle = arg
691
    elif opt in ("-p", "--parastyle"):
692
      ParaStyle = arg
693
    elif opt in ("-m", "--marsaglia"):
694
      RNG = arg
695
    elif opt in ("-v", "--valuetype"):
696
      ValueType = arg
697
    elif opt in ("-i", "--iterations"):
698
      Iterations = numpy.uint64(arg)
699
    elif opt in ("-s", "--jobstart"):
700
      JobStart = int(arg)
701
    elif opt in ("-e", "--jobend"):
702
      JobEnd = int(arg)
703
    elif opt in ("-t", "--jobstep"):
704
      JobStep = int(arg)
705
    elif opt in ("-r", "--redo"):
706
      Redo = int(arg)
707

    
708
  if Alu=='CPU' and GpuStyle=='CUDA':
709
    print "Alu can't be CPU for CUDA, set Alu to GPU"
710
    Alu='GPU'
711

    
712
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
713
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
714
    ParaStyle='Threads'
715

    
716
  print "Compute unit : %s" % Alu
717
  print "Device Identification : %s" % Device
718
  print "GpuStyle used : %s" % GpuStyle
719
  print "Parallel Style used : %s" % ParaStyle
720
  print "Iterations : %s" % Iterations
721
  print "Number of threads on start : %s" % JobStart
722
  print "Number of threads on end : %s" % JobEnd
723
  print "Number of redo : %s" % Redo
724
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
725
  print "Type of Marsaglia RNG used : %s" % RNG
726
  print "Type of variable : %s" % ValueType
727

    
728
  if GpuStyle=='CUDA':
729
    try:
730
      # For PyCUDA import
731
      import pycuda.driver as cuda
732
      import pycuda.gpuarray as gpuarray
733
      import pycuda.autoinit
734
      from pycuda.compiler import SourceModule
735
    except ImportError:
736
      print "Platform does not seem to support CUDA"
737

    
738
  if GpuStyle=='OpenCL':
739
    try:
740
      # For PyOpenCL import
741
      import pyopencl as cl
742
      Id=1
743
      for platform in cl.get_platforms():
744
        for device in platform.get_devices():
745
          deviceType=cl.device_type.to_string(device.type)
746
          print "Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())
747

    
748
          if Id == Device:
749
            # Set the Alu as detected Device Type
750
            Alu=deviceType
751
          Id=Id+1
752
    except ImportError:
753
      print "Platform does not seem to support CUDA"
754
      
755
  average=numpy.array([]).astype(numpy.float32)
756
  median=numpy.array([]).astype(numpy.float32)
757
  stddev=numpy.array([]).astype(numpy.float32)
758
  averageRate=numpy.array([]).astype(numpy.float32)
759
  medianRate=numpy.array([]).astype(numpy.float32)
760
  stddevRate=numpy.array([]).astype(numpy.float32)
761

    
762
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
763

    
764
  Jobs=JobStart
765

    
766
  while Jobs <= JobEnd:
767
    avg,med,std=0,0,0
768
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
769
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
770

    
771
    if OutMetrology: 
772
      duration=numpy.array([]).astype(numpy.float32)
773
      rate=numpy.array([]).astype(numpy.float32)
774
      for i in range(Redo):
775
        start=time.time()
776
        if GpuStyle=='CUDA':
777
          try:
778
            a,m,s,aR,mR,sR=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,RNG,ValueType)
779
          except:
780
            print "Problem with %i // computations on Cuda" % Jobs
781
        elif GpuStyle=='OpenCL':
782
          try:
783
            a,m,s,aR,mR,sR=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device,RNG,ValueType)
784
          except:
785
            print "Problem with %i // computations on OpenCL" % Jobs            
786
        duration=numpy.append(duration,time.time()-start)
787
        rate=numpy.append(rate,Iterations/(time.time()-start))
788
      if (a,m,s) != (0,0,0): 
789
        avg=numpy.mean(duration)
790
        med=numpy.median(duration)
791
        std=numpy.std(duration)
792
        avgR=numpy.mean(Iterations/duration)
793
        medR=numpy.median(Iterations/duration)
794
        stdR=numpy.std(Iterations/duration)
795
      else:
796
        print "Values seem to be wrong..."
797
    else:
798
      if GpuStyle=='CUDA':
799
        try:
800
          avg,med,std,avgR,medR,stdR=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,RNG,ValueType)
801
        except:
802
          print "Problem with %i // computations on Cuda" % Jobs
803
      elif GpuStyle=='OpenCL':
804
        try:
805
          avg,med,std,avgR,medR,stdR=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,RNG,ValueType)
806
        except:
807
          print "Problem with %i // computations on OpenCL" % Jobs            
808

    
809
    if (avg,med,std) != (0,0,0):
810
      print "jobs,avg,med,std",Jobs,avg,med,std
811
      average=numpy.append(average,avg)
812
      median=numpy.append(median,med)
813
      stddev=numpy.append(stddev,std)
814
      averageRate=numpy.append(averageRate,avgR)
815
      medianRate=numpy.append(medianRate,medR)
816
      stddevRate=numpy.append(stddevRate,stdR)
817
    else:
818
      print "Values seem to be wrong..."
819
    #THREADS*=2
820
    if len(average)!=0:
821
      averageRate=averageRate.astype(int)
822
      medianRate=medianRate.astype(int)
823
      stddevRate=stddevRate.astype(int)
824
      numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev,averageRate,medianRate,stddevRate))
825
      ToSave=[ ExploredJobs,average,median,stddev,averageRate,medianRate,stddevRate ]
826
      numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %e %e %e %i %i %i')
827
    Jobs+=JobStep
828

    
829
  if Fit:
830
    FitAndPrint(ExploredJobs,median,Curves)