#!/usr/bin/env python

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
from socket import gethostname

Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}

# Find the prime factors of a number
# Found on the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      x/=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
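
# Example: PrimeFactors(12) returns [2 2 3] (12 = 2*2*3), with the factors
# listed in non-decreasing order.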

# Try to find the best thread number for the Hybrid approach (Blocks&Threads)
# Output is the number of threads
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs or threads>512:
      break
  return(long(threads))
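
# Example: BestThreadsNumber(16) returns 8: the factors of 16 are [2 2 2 2],
# and the running product of factors stops as soon as threads*threads
# exceeds jobs (or threads exceeds 512).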

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+p/N)+c1*N+c2*N*N)
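
# In these models, N is the number of parallel workers, T1 the single-worker
# time, s the serial fraction, p the parallel fraction, and c (c1,c2) the
# linear and quadratic overhead coefficients. Worked example: with T1=100,
# s=0.1, p=0.9, Amdahl(10,100,0.1,0.9) = 100*(0.1+0.9/10) = 19.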

KERNEL_CODE_CUDA="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
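
// The *fp variants scale the raw 32-bit integer stream into [0,1):
// 2.328306435454494e-10 is approximately 2^-32.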

__device__ ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
        float x=KISSfp ;
        float y=KISSfp ;
    #endif
#elif TYPE == TFP64
    #define THEONE 1.0
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}
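
// Each MainLoop call draws 'iterations' points (x,y) in the unit square
// (or its fixed-point equivalent) and counts those falling inside the
// quarter disc x*x+y*y <= THEONE; the host estimates Pi as 4*total/iterations.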

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();
}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();
}
204
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}

"""

KERNEL_CODE_OPENCL="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
        float x=KISSfp ;
        float y=KISSfp ;
    #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
    #define THEONE 1.0
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

"""

def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,RNG,ValueType):

  # With PyCUDA autoinit, nothing else to do!

  circleCU = cuda.InOut(circle)

  try:
    mod = SourceModule(KERNEL_CODE_CUDA,options=['--compiler-options','-Wall -DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
  except:
    print "Compilation seems to break"

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=iterations
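  # When iterations does not divide evenly by jobs, each job runs
  # iterations/jobs+1 loops; e.g. iterations=100 and jobs=8 give 13 per job.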

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs/threads,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      MetropolisJobsCU(circleCU,
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()

    elapsed = start.time_till(stop)*1e-3

    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))

def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
                     RNG,ValueType):

  # Initialize the variables, casting them correctly

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default device selection based on ALU type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected: ",device.name
          HasXPU=True
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:
          XPU=device
          print "CPU selected: ",device.name
          HasXPU=True
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary device selection based on device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected: ",device.name.lstrip()
          HasXPU=True
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0,0,0,0)

  # Create the context and the queue for its execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the possible flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG],Computing[ValueType]))

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=numpy.uint64(iterationsCL*jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=numpy.uint64(iterations)

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call the OpenCL kernel
      # (jobs,) is the global work size
      # None lets the runtime choose the local work size
      # circleCL is the lattice translated into CL format
      # iterationsCL is the number of iterations per work item
      # the two last arguments are the RNG seeds
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                           circleCL,
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL, the global work size stays jobs; only the local size changes
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, the local work size must be set identical to the global one
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]

  circleCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))

def FitAndPrint(N,D,Curves):

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set default values

  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Device Id : 0 selects the first device matching the ALU type
  Device=0
  # GPU style can be CUDA (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Blocks'
  # Iterations is an integer
  Iterations=100000000
  # JobStart is the first number of Jobs to explore
  JobStart=1
  # JobEnd is the last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology is the method for duration estimation : False is GPU inside
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to plot the curves
  Curves=False
  # Fit is True to fit the measurements to the laws above
  Fit=False
  # Marsaglia RNG
  RNG='MWC'
  # Value type : INT32, INT64, FP32, FP64
  ValueType='FP32'
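
  # Example invocation (illustrative values only):
  #   python Pi-GPU.py -g OpenCL -a GPU -i 100000000 -s 1 -e 64 -r 3 -m KISS -v FP32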

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:v:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","marsaglia=","valuetype="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>' % sys.argv[0]

      print "\nInformation about devices detected under OpenCL:"
      # For PyOpenCL import
      try:
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
          for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            print "Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())
            Id=Id+1

        print
        sys.exit()
      except ImportError:
        print "Your platform does not seem to support OpenCL"

    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-c':
      Curves=True
    elif opt == '-f':
      Fit=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-m", "--marsaglia"):
      RNG = arg
    elif opt in ("-v", "--valuetype"):
      ValueType = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads!" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
  print "Type of Marsaglia RNG used : %s" % RNG
  print "Type of variable : %s" % ValueType

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())

          if Id == Device:
            # Set the Alu as the detected device type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      print "Platform does not seem to support OpenCL"

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)
  averageRate=numpy.array([]).astype(numpy.float32)
  medianRate=numpy.array([]).astype(numpy.float32)
  stddevRate=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      rate=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s,aR,mR,sR=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,RNG,ValueType)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s,aR,mR,sR=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device,RNG,ValueType)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        elapsed=time.time()-start
        duration=numpy.append(duration,elapsed)
        rate=numpy.append(rate,Iterations/elapsed)
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
        avgR=numpy.mean(Iterations/duration)
        medR=numpy.median(Iterations/duration)
        stdR=numpy.std(Iterations/duration)
      else:
        print "Values seem to be wrong..."
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std,avgR,medR,stdR=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,RNG,ValueType)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std,avgR,medR,stdR=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,RNG,ValueType)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
      averageRate=numpy.append(averageRate,avgR)
      medianRate=numpy.append(medianRate,medR)
      stddevRate=numpy.append(stddevRate,stdR)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    if len(average)!=0:
      averageRate=averageRate.astype(int)
      medianRate=medianRate.astype(int)
      stddevRate=stddevRate.astype(int)
      numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev,averageRate,medianRate,stddevRate))
      ToSave=[ ExploredJobs,average,median,stddev,averageRate,medianRate,stddevRate ]
      numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %e %e %e %i %i %i')
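      # Columns saved by savetxt: Jobs, then duration mean/median/stddev,
      # then rate (iterations per second) mean/median/stddev, per fmt above.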
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)