Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py @ 50

Historique | Voir | Annoter | Télécharger (26,31 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Pi-by-MC using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import matplotlib.pyplot as plt
23
import math
24
from scipy.optimize import curve_fit
25
from socket import gethostname
26

    
27
# find prime factors of a number
28
# Get for WWW :
29
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
30
def PrimeFactors(x):
31
  factorlist=numpy.array([]).astype('uint32')
32
  loop=2
33
  while loop<=x:
34
    if x%loop==0:
35
      x/=loop
36
      factorlist=numpy.append(factorlist,[loop])
37
    else:
38
      loop+=1
39
  return factorlist
40

    
41
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
42
# output is thread number
43
def BestThreadsNumber(jobs):
44
  factors=PrimeFactors(jobs)
45
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
46
  threads=1
47
  for factor in matrix.transpose().ravel():
48
    threads=threads*factor
49
    if threads*threads>jobs:
50
      break
51
  return(long(threads))
52

    
53
# Predicted Amdahl Law (Reduced with s=1-p)  
54
def AmdahlR(N, T1, p):
55
  return (T1*(1-p+p/N))
56

    
57
# Predicted Amdahl Law
58
def Amdahl(N, T1, s, p):
59
  return (T1*(s+p/N))
60

    
61
# Predicted Mylq Law with first order
62
def Mylq(N, T1,s,c,p):
63
  return (T1*(s+p/N)+c*N)
64

    
65
# Predicted Mylq Law with second order
66
def Mylq2(N, T1,s,c1,c2,p):
67
  return (T1*(s+p/N)+c1*N+c2*N*N)
68

    
69
KERNEL_CODE_CUDA="""
70

71
// Marsaglia RNG very simple implementation
72

73
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
74
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
75
#define MWC   (znew+wnew)
76
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
77
#define CONG  (jcong=69069*jcong+1234567)
78
#define KISS  ((MWC^CONG)+SHR3)
79

80
#define MWCfp MWC * 2.328306435454494e-10f
81
#define KISSfp KISS * 2.328306435454494e-10f
82

83
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
84
{
85
   uint z=seed_z/(blockIdx.x+1);
86
   uint w=seed_w/(blockIdx.x+1);
87

88
   ulong total=0;
89

90
   for (ulong i=0;i<iterations;i++) {
91

92
      float x=MWCfp ;
93
      float y=MWCfp ;
94

95
      // Matching test
96
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
97
      total+=inside;
98

99
   }
100

101
   s[blockIdx.x]=total;
102
   __syncthreads();
103

104
}
105

106
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
107
{
108
   uint z=seed_z/(threadIdx.x+1);
109
   uint w=seed_w/(threadIdx.x+1);
110

111
   ulong total=0;
112

113
   for (ulong i=0;i<iterations;i++) {
114

115
      float x=MWCfp ;
116
      float y=MWCfp ;
117

118
      // Matching test
119
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
120
      total+=inside;
121

122
   }
123

124
   s[threadIdx.x]=total;
125
   __syncthreads();
126

127
}
128

129
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
130
{
131
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
132
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
133

134
   ulong total=0;
135

136
   for (ulong i=0;i<iterations;i++) {
137

138
      float x=MWCfp ;
139
      float y=MWCfp ;
140

141
      // Matching test
142
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
143
      total+=inside;
144

145
   }
146

147
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
148
   __syncthreads();
149

150
}
151

152
__global__ void MainLoopBlocks64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
153
{
154
   uint z=seed_z/(blockIdx.x+1);
155
   uint w=seed_w/(blockIdx.x+1);
156

157
   ulong total=0;
158

159
   for (ulong i=0;i<iterations;i++) {
160

161
      double x=(double)MWCfp ;
162
      double y=(double)MWCfp ;
163

164
      // Matching test
165
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
166
      total+=inside;
167

168
   }
169

170
   s[blockIdx.x]=total;
171
   __syncthreads();
172

173
}
174

175
__global__ void MainLoopThreads64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
176
{
177
   uint z=seed_z/(threadIdx.x+1);
178
   uint w=seed_w/(threadIdx.x+1);
179

180
   ulong total=0;
181

182
   for (ulong i=0;i<iterations;i++) {
183

184
      double x=(double)MWCfp ;
185
      double y=(double)MWCfp ;
186

187
      // Matching test
188
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
189
      total+=inside;
190

191
   }
192

193
   s[threadIdx.x]=total;
194
   __syncthreads();
195

196
}
197

198
__global__ void MainLoopHybrid64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
199
{
200
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
201
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
202

203
   ulong total=0;
204

205
   for (ulong i=0;i<iterations;i++) {
206

207
      double x=(double)MWCfp ;
208
      double y=(double)MWCfp ;
209

210
      // Matching test
211
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
212
      total+=inside;
213

214
   }
215

216
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
217
   __syncthreads();
218

219
}
220
"""
221

    
222
KERNEL_CODE_OPENCL="""
223
#pragma OPENCL EXTENSION cl_khr_fp64: enable
224

225
// Marsaglia RNG very simple implementation
226
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
227
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
228
#define MWC   (znew+wnew)
229
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
230
#define CONG  (jcong=69069*jcong+1234567)
231
#define KISS  ((MWC^CONG)+SHR3)
232

233
#define MWCfp MWC * 2.328306435454494e-10f
234
#define KISSfp KISS * 2.328306435454494e-10f
235

236
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
237
{
238
   uint z=seed_z/(get_global_id(0)+1);
239
   uint w=seed_w/(get_global_id(0)+1);
240

241
   ulong total=0;
242

243
   for (ulong i=0;i<iterations;i++) {
244

245
      float x=MWCfp ;
246
      float y=MWCfp ;
247

248
      // Matching test
249
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
250
      total+=inside;
251
   }
252
   s[get_global_id(0)]=total;
253
   barrier(CLK_GLOBAL_MEM_FENCE);
254
      
255
}
256

257
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
258
{
259
   uint z=seed_z/(get_local_id(0)+1);
260
   uint w=seed_w/(get_local_id(0)+1);
261

262
   ulong total=0;
263

264
   for (ulong i=0;i<iterations;i++) {
265

266
      float x=MWCfp ;
267
      float y=MWCfp ;
268

269
      // Matching test
270
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
271
      total+=inside;
272
   }
273
   s[get_local_id(0)]=total;
274
   barrier(CLK_LOCAL_MEM_FENCE);
275
      
276
}
277

278
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
279
{
280
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
281
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
282

283
   ulong total=0;
284

285
   for (uint i=0;i<iterations;i++) {
286

287
      float x=MWCfp ;
288
     float y=MWCfp ;
289

290
      // Matching test
291
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
292
      total+=inside;
293
   }
294
   barrier(CLK_LOCAL_MEM_FENCE);
295
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
296
      
297
}
298

299
__kernel void MainLoopGlobal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
300
{
301
   uint z=seed_z/(get_global_id(0)+1);
302
   uint w=seed_w/(get_global_id(0)+1);
303

304
   ulong total=0;
305

306
   for (ulong i=0;i<iterations;i++) {
307

308
      double x=(double)MWCfp ;
309
      double y=(double)MWCfp ;
310

311
      // Matching test
312
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
313
      total+=inside;
314
   }
315
   s[get_global_id(0)]=total;
316
   barrier(CLK_GLOBAL_MEM_FENCE);
317
      
318
}
319

320
__kernel void MainLoopLocal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
321
{
322
   uint z=seed_z/(get_local_id(0)+1);
323
   uint w=seed_w/(get_local_id(0)+1);
324

325
   ulong total=0;
326

327
   for (ulong i=0;i<iterations;i++) {
328

329
      double x=(double)MWCfp ;
330
      double y=(double)MWCfp ;
331

332
      // Matching test
333
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
334
      total+=inside;
335
   }
336
   s[get_local_id(0)]=total;
337
   barrier(CLK_LOCAL_MEM_FENCE);
338
      
339
}
340

341
__kernel void MainLoopHybrid64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
342
{
343
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
344
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
345

346
   ulong total=0;
347

348
   for (uint i=0;i<iterations;i++) {
349

350
      double x=(double)MWCfp ;
351
      double y=(double)MWCfp ;
352

353
      // Matching test
354
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
355
      total+=inside;
356
   }
357
   barrier(CLK_LOCAL_MEM_FENCE);
358
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
359
      
360
}
361
"""
362

    
363
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,DoublePrecision):
364

    
365
  # Avec PyCUDA autoinit, rien a faire !
366
  
367
  circleCU = cuda.InOut(circle)
368
  
369
  mod = SourceModule(KERNEL_CODE_CUDA)
370

    
371
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
372
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
373
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
374
  MetropolisBlocks64CU=mod.get_function("MainLoopBlocks64")
375
  MetropolisJobs64CU=mod.get_function("MainLoopThreads64")
376
  MetropolisHybrid64CU=mod.get_function("MainLoopHybrid64")
377
  
378
  start = pycuda.driver.Event()
379
  stop = pycuda.driver.Event()
380
  
381
  MyPi=numpy.zeros(steps)
382
  MyDuration=numpy.zeros(steps)
383

    
384
  if iterations%jobs==0:
385
    iterationsCL=numpy.uint64(iterations/jobs)
386
    iterationsNew=iterationsCL*jobs
387
  else:
388
    iterationsCL=numpy.uint64(iterations/jobs+1)
389
    iterationsNew=iterations
390

    
391
  for i in range(steps):
392
    start.record()
393
    start.synchronize()
394
    if ParaStyle=='Blocks':
395
      if DoublePrecision:
396
        MetropolisBlocksCU(circleCU,
397
                           numpy.uint64(iterationsCL),
398
                           numpy.uint32(nprnd(2**30/jobs)),
399
                           numpy.uint32(nprnd(2**30/jobs)),
400
                           grid=(jobs,1),
401
                           block=(1,1,1))
402
      else:
403
        MetropolisBlocks64CU(circleCU,
404
                             numpy.uint64(iterationsCL),
405
                             numpy.uint32(nprnd(2**30/jobs)),
406
                             numpy.uint32(nprnd(2**30/jobs)),
407
                             grid=(jobs,1),
408
                             block=(1,1,1))
409
        
410
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
411
            (Alu,jobs,1,ParaStyle)      
412
    elif ParaStyle=='Hybrid':
413
      threads=BestThreadsNumber(jobs)
414
      if DoublePrecision:
415
        MetropolisHybrid64CU(circleCU,
416
                             numpy.uint64(iterationsCL),
417
                             numpy.uint32(nprnd(2**30/jobs)),
418
                             numpy.uint32(nprnd(2**30/jobs)),
419
                             grid=(jobs,1),
420
                             block=(threads,1,1))
421
      else:
422
        MetropolisHybridCU(circleCU,
423
                           numpy.uint64(iterationsCL),
424
                           numpy.uint32(nprnd(2**30/jobs)),
425
                           numpy.uint32(nprnd(2**30/jobs)),
426
                           grid=(jobs,1),
427
                           block=(threads,1,1))
428
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
429
            (Alu,jobs/threads,threads,ParaStyle)
430
    else:
431
      if DoublePrecision:
432
        MetropolisJobs64CU(circleCU,
433
                           numpy.uint64(iterationsCL),
434
                           numpy.uint32(nprnd(2**30/jobs)),
435
                           numpy.uint32(nprnd(2**30/jobs)),
436
                           grid=(1,1),
437
                           block=(jobs,1,1))
438
      else:
439
        MetropolisJobsCU(circleCU,
440
                         numpy.uint64(iterationsCL),
441
                         numpy.uint32(nprnd(2**30/jobs)),
442
                         numpy.uint32(nprnd(2**30/jobs)),
443
                         grid=(1,1),
444
                         block=(jobs,1,1))
445
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
446
            (Alu,jobs,1,ParaStyle)
447
    stop.record()
448
    stop.synchronize()
449
                
450
    elapsed = start.time_till(stop)*1e-3
451

    
452
    MyDuration[i]=elapsed
453
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
454
    MyPi[i]=numpy.median(AllPi)
455
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
456

    
457

    
458
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
459

    
460
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
461

    
462

    
463
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
464
                     DoublePrecision):
465
        
466
  # Initialisation des variables en les CASTant correctement
467
    
468
  if Device==0:
469
    print "Enter XPU selector based on ALU type: first selected"
470
    HasXPU=False
471
    # Default Device selection based on ALU Type
472
    for platform in cl.get_platforms():
473
      for device in platform.get_devices():
474
        deviceType=cl.device_type.to_string(device.type)
475
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
476
          XPU=device
477
          print "GPU selected: ",device.name
478
          HasXPU=True
479
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
480
          XPU=device
481
          print "CPU selected: ",device.name
482
          HasXPU=True
483
  else:
484
    print "Enter XPU selector based on device number & ALU type"
485
    Id=1
486
    HasXPU=False
487
    # Primary Device selection based on Device Id
488
    for platform in cl.get_platforms():
489
      for device in platform.get_devices():
490
        deviceType=cl.device_type.to_string(device.type)
491
        if Id==Device and Alu==deviceType and HasXPU==False:
492
          XPU=device
493
          print "CPU/GPU selected: ",device.name
494
          HasXPU=True
495
        Id=Id+1
496
    if HasXPU==False:
497
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
498
          (Device,Alu,Id-1)
499
      return(0,0,0)
500
                                
501
  # Je cree le contexte et la queue pour son execution
502
  ctx = cl.Context([XPU])
503
  queue = cl.CommandQueue(ctx,
504
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
505

    
506
  # Je recupere les flag possibles pour les buffers
507
  mf = cl.mem_flags
508
        
509
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
510

    
511
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
512
    options = "-cl-mad-enable -cl-fast-relaxed-math")
513

    
514
  i=0
515

    
516
  MyPi=numpy.zeros(steps)
517
  MyDuration=numpy.zeros(steps)
518
  
519
  if iterations%jobs==0:
520
    iterationsCL=numpy.uint64(iterations/jobs)
521
    iterationsNew=numpy.uint64(iterationsCL*jobs)
522
  else:
523
    iterationsCL=numpy.uint64(iterations/jobs+1)
524
    iterationsNew=numpy.uint64(iterations)
525

    
526
  for i in range(steps):
527
                
528
    if ParaStyle=='Blocks':
529
      # Call OpenCL kernel
530
      # (1,) is Global work size (only 1 work size)
531
      # (1,) is local work size
532
      # circleCL is lattice translated in CL format
533
      # SeedZCL is lattice translated in CL format
534
      # SeedWCL is lattice translated in CL format
535
      # step is number of iterations
536
      if DoublePrecision:
537
        CLLaunch=MetropolisCL.MainLoopGlobal64(queue,(jobs,),None,
538
                                               circleCL,
539
                                               numpy.uint64(iterationsCL),
540
                                               numpy.uint32(nprnd(2**30/jobs)),
541
                                               numpy.uint32(nprnd(2**30/jobs)))
542
      else:
543
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
544
                                             circleCL,
545
                                             numpy.uint64(iterationsCL),
546
                                             numpy.uint32(nprnd(2**30/jobs)),
547
                                             numpy.uint32(nprnd(2**30/jobs)))
548
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
549
            (Alu,jobs,1,ParaStyle)
550
    elif ParaStyle=='Hybrid':
551
      threads=BestThreadsNumber(jobs)
552
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
553
      if DoublePrecision:
554
        CLLaunch=MetropolisCL.MainLoopHybrid64(queue,(jobs,),(threads,),
555
                                               circleCL,
556
                                               numpy.uint64(iterationsCL),
557
                                               numpy.uint32(nprnd(2**30/jobs)),
558
                                               numpy.uint32(nprnd(2**30/jobs)))
559
      else:
560
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
561
                                             circleCL,
562
                                             numpy.uint64(iterationsCL),
563
                                             numpy.uint32(nprnd(2**30/jobs)),
564
                                             numpy.uint32(nprnd(2**30/jobs)))
565
        
566
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
567
            (Alu,jobs/threads,threads,ParaStyle)
568
    else:
569
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
570
      if DoublePrecision:
571
        CLLaunch=MetropolisCL.MainLoopLocal64(queue,(jobs,),(jobs,),
572
                                              circleCL,
573
                                              numpy.uint64(iterationsCL),
574
                                              numpy.uint32(nprnd(2**30/jobs)),
575
                                              numpy.uint32(nprnd(2**30/jobs)))
576
      else:
577
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
578
                                            circleCL,
579
                                            numpy.uint64(iterationsCL),
580
                                            numpy.uint32(nprnd(2**30/jobs)),
581
                                            numpy.uint32(nprnd(2**30/jobs)))
582
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
583

    
584
    CLLaunch.wait()
585
    cl.enqueue_copy(queue, circle, circleCL).wait()
586

    
587
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
588

    
589
    MyDuration[i]=elapsed
590
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
591
    MyPi[i]=numpy.median(AllPi)
592
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
593

    
594
  circleCL.release()
595

    
596
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
597
        
598
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
599

    
600

    
601
def FitAndPrint(N,D,Curves):
602

    
603
  try:
604
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
605

    
606
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
607
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
608
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
609
    coeffs_Amdahl[0]=D[0]
610
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
611
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
612
  except:
613
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
614

    
615
  try:
616
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
617

    
618
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
619
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
620
    coeffs_AmdahlR[0]=D[0]
621
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
622
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
623

    
624
  except:
625
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
626

    
627
  try:
628
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
629

    
630
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
631
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
632
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
633
    coeffs_Mylq[0]=D[0]
634
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
635
                                                            coeffs_Mylq[1],
636
                                                            coeffs_Mylq[3],
637
                                                            coeffs_Mylq[2])
638
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
639
                coeffs_Mylq[3])
640
  except:
641
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
642

    
643
  try:
644
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
645

    
646
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
647
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
648
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
649
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
650
    coeffs_Mylq2[0]=D[0]
651
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
652
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
653
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
654

    
655
  except:
656
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
657

    
658
  if Curves:
659
    plt.xlabel("Number of Threads/work Items")
660
    plt.ylabel("Total Elapsed Time")
661

    
662
    Experience,=plt.plot(N,D,'ro') 
663
    try:
664
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
665
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
666
    except:
667
      print "Fit curves seem not to be available"
668

    
669
    plt.legend()
670
    plt.show()
671

    
672
if __name__=='__main__':
673

    
674
  # Set defaults values
675
  # Alu can be CPU or GPU
676
  Alu='CPU'
677
  # Id of GPU : 1 is for first find !
678
  Device=0
679
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
680
  GpuStyle='OpenCL'
681
  # Parallel distribution can be on Threads or Blocks
682
  ParaStyle='Blocks'
683
  # Iterations is integer
684
  Iterations=100000000
685
  # JobStart in first number of Jobs to explore
686
  JobStart=1
687
  # JobEnd is last number of Jobs to explore
688
  JobEnd=16
689
  # JobStep is the step of Jobs to explore
690
  JobStep=1
691
  # Redo is the times to redo the test to improve metrology
692
  Redo=1
693
  # OutMetrology is method for duration estimation : False is GPU inside
694
  OutMetrology=False
695
  Metrology='InMetro'
696
  # Curves is True to print the curves
697
  Curves=False
698
  # DoublePrecision on FP calculus
699
  DoublePrecision=False
700

    
701
  try:
702
    opts, args = getopt.getopt(sys.argv[1:],"hocla:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
703
  except getopt.GetoptError:
704
    print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
705
    sys.exit(2)
706
    
707
  for opt, arg in opts:
708
    if opt == '-h':
709
      print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
710

    
711
      print "\nInformations about devices detected under OpenCL:"
712
      # For PyOpenCL import
713
      import pyopencl as cl
714
      Id=1
715
      for platform in cl.get_platforms():
716
        for device in platform.get_devices():
717
          deviceType=cl.device_type.to_string(device.type)
718
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
719
          Id=Id+1
720

    
721
      sys.exit()
722
    elif opt == '-o':
723
      OutMetrology=True
724
      Metrology='OutMetro'
725
    elif opt == '-l':
726
      DoublePrecision=True
727
    elif opt == '-c':
728
      Curves=True
729
    elif opt in ("-a", "--alu"):
730
      Alu = arg
731
    elif opt in ("-d", "--device"):
732
      Device = int(arg)
733
    elif opt in ("-g", "--gpustyle"):
734
      GpuStyle = arg
735
    elif opt in ("-p", "--parastyle"):
736
      ParaStyle = arg
737
    elif opt in ("-i", "--iterations"):
738
      Iterations = numpy.uint64(arg)
739
    elif opt in ("-s", "--jobstart"):
740
      JobStart = int(arg)
741
    elif opt in ("-e", "--jobend"):
742
      JobEnd = int(arg)
743
    elif opt in ("-t", "--jobstep"):
744
      JobStep = int(arg)
745
    elif opt in ("-r", "--redo"):
746
      Redo = int(arg)
747

    
748
  if Alu=='CPU' and GpuStyle=='CUDA':
749
    print "Alu can't be CPU for CUDA, set Alu to GPU"
750
    Alu='GPU'
751

    
752
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
753
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
754
    ParaStyle='Threads'
755

    
756
  print "Compute unit : %s" % Alu
757
  print "Device Identification : %s" % Device
758
  print "GpuStyle used : %s" % GpuStyle
759
  print "Parallel Style used : %s" % ParaStyle
760
  print "Iterations : %s" % Iterations
761
  print "Number of threads on start : %s" % JobStart
762
  print "Number of threads on end : %s" % JobEnd
763
  print "Number of redo : %s" % Redo
764
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
765
  print "Double Precision in Kernels : %r" % DoublePrecision
766

    
767
  if GpuStyle=='CUDA':
768
    # For PyCUDA import
769
    import pycuda.driver as cuda
770
    import pycuda.gpuarray as gpuarray
771
    import pycuda.autoinit
772
    from pycuda.compiler import SourceModule
773

    
774
  if GpuStyle=='OpenCL':
775
    # For PyOpenCL import
776
    import pyopencl as cl
777
    Id=1
778
    for platform in cl.get_platforms():
779
      for device in platform.get_devices():
780
        deviceType=cl.device_type.to_string(device.type)
781
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
782
        if Id == Device:
783
          # Set the Alu as detected Device Type
784
          Alu=deviceType
785
        Id=Id+1
786

    
787
  average=numpy.array([]).astype(numpy.float32)
788
  median=numpy.array([]).astype(numpy.float32)
789
  stddev=numpy.array([]).astype(numpy.float32)
790

    
791
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
792

    
793
  Jobs=JobStart
794

    
795
  while Jobs <= JobEnd:
796
    avg,med,std=0,0,0
797
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
798
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
799

    
800
    if OutMetrology:
801
      duration=numpy.array([]).astype(numpy.float32)
802
      for i in range(Redo):
803
        start=time.time()
804
        if GpuStyle=='CUDA':
805
          try:
806
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
807
                                 DoublePrecision)
808
          except:
809
            print "Problem with %i // computations on Cuda" % Jobs
810
        elif GpuStyle=='OpenCL':
811
          try:
812
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
813
                                   Alu,Device,DoublePrecision)
814
          except:
815
            print "Problem with %i // computations on OpenCL" % Jobs            
816
        duration=numpy.append(duration,time.time()-start)
817
      if (a,m,s) != (0,0,0):
818
        avg=numpy.mean(duration)
819
        med=numpy.median(duration)
820
        std=numpy.std(duration)
821
      else:
822
        print "Values seem to be wrong..."
823
    else:
824
      if GpuStyle=='CUDA':
825
        try:
826
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,
827
                                     DoublePrecision)
828
        except:
829
          print "Problem with %i // computations on Cuda" % Jobs
830
      elif GpuStyle=='OpenCL':
831
        # try:
832
        #   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
833
        # except:
834
        #   print "Problem with %i // computations on OpenCL" % Jobs            
835
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,DoublePrecision)
836

    
837
    if (avg,med,std) != (0,0,0):
838
      print "jobs,avg,med,std",Jobs,avg,med,std
839
      average=numpy.append(average,avg)
840
      median=numpy.append(median,med)
841
      stddev=numpy.append(stddev,std)
842
    else:
843
      print "Values seem to be wrong..."
844
    #THREADS*=2
845
    if DoublePrecision:
846
      Precision='DP'
847
    else:
848
      Precision='SP'
849
    if len(average)!=0:
850
      numpy.savez("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
851
      ToSave=[ ExploredJobs,average,median,stddev ]
852
      numpy.savetxt("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
853
    Jobs+=JobStep
854

    
855
  FitAndPrint(ExploredJobs,median,Curves)
856