Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py @ 64

Historique | Voir | Annoter | Télécharger (26,72 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import math
23
from socket import gethostname
24

    
25
# find prime factors of a number
26
# Get for WWW :
27
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
28
def PrimeFactors(x):
29
  factorlist=numpy.array([]).astype('uint32')
30
  loop=2
31
  while loop<=x:
32
    if x%loop==0:
33
      x/=loop
34
      factorlist=numpy.append(factorlist,[loop])
35
    else:
36
      loop+=1
37
  return factorlist
38

    
39
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
40
# output is thread number
41
def BestThreadsNumber(jobs):
42
  factors=PrimeFactors(jobs)
43
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
44
  threads=1
45
  for factor in matrix.transpose().ravel():
46
    threads=threads*factor
47
    if threads*threads>jobs:
48
      break
49
  return(long(threads))
50

    
51
# Predicted Amdahl Law (Reduced with s=1-p)  
52
def AmdahlR(N, T1, p):
53
  return (T1*(1-p+p/N))
54

    
55
# Predicted Amdahl Law
56
def Amdahl(N, T1, s, p):
57
  return (T1*(s+p/N))
58

    
59
# Predicted Mylq Law with first order
60
def Mylq(N, T1,s,c,p):
61
  return (T1*(s+p/N)+c*N)
62

    
63
# Predicted Mylq Law with second order
64
def Mylq2(N, T1,s,c1,c2,p):
65
  return (T1*(s+p/N)+c1*N+c2*N*N)
66

    
67
KERNEL_CODE_CUDA="""
68

69
// Marsaglia RNG very simple implementation
70

71
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
72
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
73
#define MWC   (znew+wnew)
74
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
75
#define CONG  (jcong=69069*jcong+1234567)
76
#define KISS  ((MWC^CONG)+SHR3)
77

78
#define MWCfp MWC * 2.328306435454494e-10f
79
#define KISSfp KISS * 2.328306435454494e-10f
80

81
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
82
{
83
   uint z=seed_z/(blockIdx.x+1);
84
   uint w=seed_w/(blockIdx.x+1);
85

86
   ulong total=0;
87

88
   for (ulong i=0;i<iterations;i++) {
89

90
      float x=MWCfp ;
91
      float y=MWCfp ;
92

93
      // Matching test
94
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
95
      total+=inside;
96

97
   }
98

99
   s[blockIdx.x]=total;
100
   __syncthreads();
101

102
}
103

104
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
105
{
106
   uint z=seed_z/(threadIdx.x+1);
107
   uint w=seed_w/(threadIdx.x+1);
108

109
   ulong total=0;
110

111
   for (ulong i=0;i<iterations;i++) {
112

113
      float x=MWCfp ;
114
      float y=MWCfp ;
115

116
      // Matching test
117
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
118
      total+=inside;
119

120
   }
121

122
   s[threadIdx.x]=total;
123
   __syncthreads();
124

125
}
126

127
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
128
{
129
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
130
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
131

132
   ulong total=0;
133

134
   for (ulong i=0;i<iterations;i++) {
135

136
      float x=MWCfp ;
137
      float y=MWCfp ;
138

139
      // Matching test
140
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
141
      total+=inside;
142

143
   }
144

145
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
146
   __syncthreads();
147

148
}
149

150
__global__ void MainLoopBlocks64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
151
{
152
   uint z=seed_z/(blockIdx.x+1);
153
   uint w=seed_w/(blockIdx.x+1);
154

155
   ulong total=0;
156

157
   for (ulong i=0;i<iterations;i++) {
158

159
      double x=(double)MWCfp ;
160
      double y=(double)MWCfp ;
161

162
      // Matching test
163
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
164
      total+=inside;
165

166
   }
167

168
   s[blockIdx.x]=total;
169
   __syncthreads();
170

171
}
172

173
__global__ void MainLoopThreads64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
174
{
175
   uint z=seed_z/(threadIdx.x+1);
176
   uint w=seed_w/(threadIdx.x+1);
177

178
   ulong total=0;
179

180
   for (ulong i=0;i<iterations;i++) {
181

182
      double x=(double)MWCfp ;
183
      double y=(double)MWCfp ;
184

185
      // Matching test
186
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
187
      total+=inside;
188

189
   }
190

191
   s[threadIdx.x]=total;
192
   __syncthreads();
193

194
}
195

196
__global__ void MainLoopHybrid64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
197
{
198
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
199
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
200

201
   ulong total=0;
202

203
   for (ulong i=0;i<iterations;i++) {
204

205
      double x=(double)MWCfp ;
206
      double y=(double)MWCfp ;
207

208
      // Matching test
209
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
210
      total+=inside;
211

212
   }
213

214
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
215
   __syncthreads();
216

217
}
218
"""
219

    
220
KERNEL_CODE_OPENCL="""
221
#pragma OPENCL EXTENSION cl_khr_fp64: enable
222

223
// Marsaglia RNG very simple implementation
224
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
225
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
226
#define MWC   (znew+wnew)
227
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
228
#define CONG  (jcong=69069*jcong+1234567)
229
#define KISS  ((MWC^CONG)+SHR3)
230

231
#define MWCfp MWC * 2.328306435454494e-10f
232
#define KISSfp KISS * 2.328306435454494e-10f
233

234
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
235
{
236
   uint z=seed_z/(get_global_id(0)+1);
237
   uint w=seed_w/(get_global_id(0)+1);
238

239
   ulong total=0;
240

241
   for (ulong i=0;i<iterations;i++) {
242

243
      float x=MWCfp ;
244
      float y=MWCfp ;
245

246
      // Matching test
247
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
248
      total+=inside;
249
   }
250
   s[get_global_id(0)]=total;
251
   barrier(CLK_GLOBAL_MEM_FENCE);
252
      
253
}
254

255
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
256
{
257
   uint z=seed_z/(get_local_id(0)+1);
258
   uint w=seed_w/(get_local_id(0)+1);
259

260
   ulong total=0;
261

262
   for (ulong i=0;i<iterations;i++) {
263

264
      float x=MWCfp ;
265
      float y=MWCfp ;
266

267
      // Matching test
268
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
269
      total+=inside;
270
   }
271
   s[get_local_id(0)]=total;
272
   barrier(CLK_LOCAL_MEM_FENCE);
273
      
274
}
275

276
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
277
{
278
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
279
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
280

281
   ulong total=0;
282

283
   for (uint i=0;i<iterations;i++) {
284

285
      float x=MWCfp ;
286
     float y=MWCfp ;
287

288
      // Matching test
289
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
290
      total+=inside;
291
   }
292
   barrier(CLK_LOCAL_MEM_FENCE);
293
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
294
      
295
}
296

297
__kernel void MainLoopGlobal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
298
{
299
   uint z=seed_z/(get_global_id(0)+1);
300
   uint w=seed_w/(get_global_id(0)+1);
301

302
   ulong total=0;
303

304
   for (ulong i=0;i<iterations;i++) {
305

306
      double x=(double)MWCfp ;
307
      double y=(double)MWCfp ;
308

309
      // Matching test
310
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
311
      total+=inside;
312
   }
313
   s[get_global_id(0)]=total;
314
   barrier(CLK_GLOBAL_MEM_FENCE);
315
      
316
}
317

318
__kernel void MainLoopLocal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
319
{
320
   uint z=seed_z/(get_local_id(0)+1);
321
   uint w=seed_w/(get_local_id(0)+1);
322

323
   ulong total=0;
324

325
   for (ulong i=0;i<iterations;i++) {
326

327
      double x=(double)MWCfp ;
328
      double y=(double)MWCfp ;
329

330
      // Matching test
331
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
332
      total+=inside;
333
   }
334
   s[get_local_id(0)]=total;
335
   barrier(CLK_LOCAL_MEM_FENCE);
336
      
337
}
338

339
__kernel void MainLoopHybrid64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
340
{
341
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
342
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
343

344
   ulong total=0;
345

346
   for (uint i=0;i<iterations;i++) {
347

348
      double x=(double)MWCfp ;
349
      double y=(double)MWCfp ;
350

351
      // Matching test
352
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
353
      total+=inside;
354
   }
355
   barrier(CLK_LOCAL_MEM_FENCE);
356
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
357
      
358
}
359
"""
360

    
361
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,DoublePrecision):
362

    
363
  # Avec PyCUDA autoinit, rien a faire !
364
  
365
  circleCU = cuda.InOut(circle)
366
  
367
  mod = SourceModule(KERNEL_CODE_CUDA)
368

    
369
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
370
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
371
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
372
  MetropolisBlocks64CU=mod.get_function("MainLoopBlocks64")
373
  MetropolisJobs64CU=mod.get_function("MainLoopThreads64")
374
  MetropolisHybrid64CU=mod.get_function("MainLoopHybrid64")
375
  
376
  start = pycuda.driver.Event()
377
  stop = pycuda.driver.Event()
378
  
379
  MyPi=numpy.zeros(steps)
380
  MyDuration=numpy.zeros(steps)
381

    
382
  if iterations%jobs==0:
383
    iterationsCL=numpy.uint64(iterations/jobs)
384
    iterationsNew=iterationsCL*jobs
385
  else:
386
    iterationsCL=numpy.uint64(iterations/jobs+1)
387
    iterationsNew=iterations
388

    
389
  for i in range(steps):
390
    start.record()
391
    start.synchronize()
392
    if ParaStyle=='Blocks':
393
      if DoublePrecision:
394
        MetropolisBlocksCU(circleCU,
395
                           numpy.uint64(iterationsCL),
396
                           numpy.uint32(nprnd(2**30/jobs)),
397
                           numpy.uint32(nprnd(2**30/jobs)),
398
                           grid=(jobs,1),
399
                           block=(1,1,1))
400
      else:
401
        MetropolisBlocks64CU(circleCU,
402
                             numpy.uint64(iterationsCL),
403
                             numpy.uint32(nprnd(2**30/jobs)),
404
                             numpy.uint32(nprnd(2**30/jobs)),
405
                             grid=(jobs,1),
406
                             block=(1,1,1))
407
        
408
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
409
            (Alu,jobs,1,ParaStyle)      
410
    elif ParaStyle=='Hybrid':
411
      threads=BestThreadsNumber(jobs)
412
      if DoublePrecision:
413
        MetropolisHybrid64CU(circleCU,
414
                             numpy.uint64(iterationsCL),
415
                             numpy.uint32(nprnd(2**30/jobs)),
416
                             numpy.uint32(nprnd(2**30/jobs)),
417
                             grid=(jobs,1),
418
                             block=(threads,1,1))
419
      else:
420
        MetropolisHybridCU(circleCU,
421
                           numpy.uint64(iterationsCL),
422
                           numpy.uint32(nprnd(2**30/jobs)),
423
                           numpy.uint32(nprnd(2**30/jobs)),
424
                           grid=(jobs,1),
425
                           block=(threads,1,1))
426
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
427
            (Alu,jobs/threads,threads,ParaStyle)
428
    else:
429
      if DoublePrecision:
430
        MetropolisJobs64CU(circleCU,
431
                           numpy.uint64(iterationsCL),
432
                           numpy.uint32(nprnd(2**30/jobs)),
433
                           numpy.uint32(nprnd(2**30/jobs)),
434
                           grid=(1,1),
435
                           block=(jobs,1,1))
436
      else:
437
        MetropolisJobsCU(circleCU,
438
                         numpy.uint64(iterationsCL),
439
                         numpy.uint32(nprnd(2**30/jobs)),
440
                         numpy.uint32(nprnd(2**30/jobs)),
441
                         grid=(1,1),
442
                         block=(jobs,1,1))
443
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
444
            (Alu,jobs,1,ParaStyle)
445
    stop.record()
446
    stop.synchronize()
447
                
448
    elapsed = start.time_till(stop)*1e-3
449

    
450
    MyDuration[i]=elapsed
451
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
452
    MyPi[i]=numpy.median(AllPi)
453
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
454

    
455

    
456
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
457

    
458
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
459

    
460

    
461
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
462
                     DoublePrecision):
463
        
464
  # Initialisation des variables en les CASTant correctement
465
    
466
  if Device==0:
467
    print "Enter XPU selector based on ALU type: first selected"
468
    HasXPU=False
469
    # Default Device selection based on ALU Type
470
    for platform in cl.get_platforms():
471
      for device in platform.get_devices():
472
        deviceType=cl.device_type.to_string(device.type)
473
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
474
          XPU=device
475
          print "GPU selected: ",device.name
476
          HasXPU=True
477
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
478
          XPU=device
479
          print "CPU selected: ",device.name
480
          HasXPU=True
481
  else:
482
    print "Enter XPU selector based on device number & ALU type"
483
    Id=1
484
    HasXPU=False
485
    # Primary Device selection based on Device Id
486
    for platform in cl.get_platforms():
487
      for device in platform.get_devices():
488
        deviceType=cl.device_type.to_string(device.type)
489
        if Id==Device and Alu==deviceType and HasXPU==False:
490
          XPU=device
491
          print "CPU/GPU selected: ",device.name
492
          HasXPU=True
493
        Id=Id+1
494
    if HasXPU==False:
495
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
496
          (Device,Alu,Id-1)
497
      return(0,0,0)
498
                                
499
  # Je cree le contexte et la queue pour son execution
500
  ctx = cl.Context([XPU])
501
  queue = cl.CommandQueue(ctx,
502
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
503

    
504
  # Je recupere les flag possibles pour les buffers
505
  mf = cl.mem_flags
506
        
507
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
508

    
509
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
510
    options = "-cl-mad-enable -cl-fast-relaxed-math")
511

    
512
  i=0
513

    
514
  MyPi=numpy.zeros(steps)
515
  MyDuration=numpy.zeros(steps)
516
  
517
  if iterations%jobs==0:
518
    iterationsCL=numpy.uint64(iterations/jobs)
519
    iterationsNew=numpy.uint64(iterationsCL*jobs)
520
  else:
521
    iterationsCL=numpy.uint64(iterations/jobs+1)
522
    iterationsNew=numpy.uint64(iterations)
523

    
524
  for i in range(steps):
525
                
526
    if ParaStyle=='Blocks':
527
      # Call OpenCL kernel
528
      # (1,) is Global work size (only 1 work size)
529
      # (1,) is local work size
530
      # circleCL is lattice translated in CL format
531
      # SeedZCL is lattice translated in CL format
532
      # SeedWCL is lattice translated in CL format
533
      # step is number of iterations
534
      if DoublePrecision:
535
        CLLaunch=MetropolisCL.MainLoopGlobal64(queue,(jobs,),None,
536
                                               circleCL,
537
                                               numpy.uint64(iterationsCL),
538
                                               numpy.uint32(nprnd(2**30/jobs)),
539
                                               numpy.uint32(nprnd(2**30/jobs)))
540
      else:
541
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
542
                                             circleCL,
543
                                             numpy.uint64(iterationsCL),
544
                                             numpy.uint32(nprnd(2**30/jobs)),
545
                                             numpy.uint32(nprnd(2**30/jobs)))
546
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
547
            (Alu,jobs,1,ParaStyle)
548
    elif ParaStyle=='Hybrid':
549
      threads=BestThreadsNumber(jobs)
550
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
551
      if DoublePrecision:
552
        CLLaunch=MetropolisCL.MainLoopHybrid64(queue,(jobs,),(threads,),
553
                                               circleCL,
554
                                               numpy.uint64(iterationsCL),
555
                                               numpy.uint32(nprnd(2**30/jobs)),
556
                                               numpy.uint32(nprnd(2**30/jobs)))
557
      else:
558
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
559
                                             circleCL,
560
                                             numpy.uint64(iterationsCL),
561
                                             numpy.uint32(nprnd(2**30/jobs)),
562
                                             numpy.uint32(nprnd(2**30/jobs)))
563
        
564
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
565
            (Alu,jobs/threads,threads,ParaStyle)
566
    else:
567
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
568
      if DoublePrecision:
569
        CLLaunch=MetropolisCL.MainLoopLocal64(queue,(jobs,),(jobs,),
570
                                              circleCL,
571
                                              numpy.uint64(iterationsCL),
572
                                              numpy.uint32(nprnd(2**30/jobs)),
573
                                              numpy.uint32(nprnd(2**30/jobs)))
574
      else:
575
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
576
                                            circleCL,
577
                                            numpy.uint64(iterationsCL),
578
                                            numpy.uint32(nprnd(2**30/jobs)),
579
                                            numpy.uint32(nprnd(2**30/jobs)))
580
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
581

    
582
    CLLaunch.wait()
583
    cl.enqueue_copy(queue, circle, circleCL).wait()
584

    
585
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
586

    
587
    MyDuration[i]=elapsed
588
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
589
    MyPi[i]=numpy.median(AllPi)
590
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
591

    
592
  circleCL.release()
593

    
594
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
595
        
596
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
597

    
598

    
599
def FitAndPrint(N,D,Curves):
600

    
601
  from scipy.optimize import curve_fit
602
  import matplotlib.pyplot as plt
603

    
604
  try:
605
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
606

    
607
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
608
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
609
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
610
    coeffs_Amdahl[0]=D[0]
611
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
612
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
613
  except:
614
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
615

    
616
  try:
617
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
618

    
619
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
620
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
621
    coeffs_AmdahlR[0]=D[0]
622
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
623
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
624

    
625
  except:
626
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
627

    
628
  try:
629
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
630

    
631
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
632
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
633
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
634
    coeffs_Mylq[0]=D[0]
635
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
636
                                                            coeffs_Mylq[1],
637
                                                            coeffs_Mylq[3],
638
                                                            coeffs_Mylq[2])
639
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
640
                coeffs_Mylq[3])
641
  except:
642
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
643

    
644
  try:
645
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
646

    
647
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
648
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
649
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
650
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
651
    coeffs_Mylq2[0]=D[0]
652
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
653
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
654
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
655

    
656
  except:
657
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
658

    
659
  if Curves:
660
    plt.xlabel("Number of Threads/work Items")
661
    plt.ylabel("Total Elapsed Time")
662

    
663
    Experience,=plt.plot(N,D,'ro') 
664
    try:
665
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
666
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
667
    except:
668
      print "Fit curves seem not to be available"
669

    
670
    plt.legend()
671
    plt.show()
672

    
673
if __name__=='__main__':
674

    
675
  # Set defaults values
676
  
677
  # Alu can be CPU, GPU or ACCELERATOR
678
  Alu='CPU'
679
  # Id of GPU : 1 is for first find !
680
  Device=0
681
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
682
  GpuStyle='OpenCL'
683
  # Parallel distribution can be on Threads or Blocks
684
  ParaStyle='Blocks'
685
  # Iterations is integer
686
  Iterations=100000000
687
  # JobStart in first number of Jobs to explore
688
  JobStart=1
689
  # JobEnd is last number of Jobs to explore
690
  JobEnd=16
691
  # JobStep is the step of Jobs to explore
692
  JobStep=1
693
  # Redo is the times to redo the test to improve metrology
694
  Redo=1
695
  # OutMetrology is method for duration estimation : False is GPU inside
696
  OutMetrology=False
697
  Metrology='InMetro'
698
  # Curves is True to print the curves
699
  Curves=False
700
  # Fit is True to print the curves
701
  Fit=False
702
  # DoublePrecision on FP calculus
703
  DoublePrecision=False
704

    
705
  try:
706
    opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
707
  except getopt.GetoptError:
708
    print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
709
    sys.exit(2)
710
    
711
  for opt, arg in opts:
712
    if opt == '-h':
713
      print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
714

    
715
      print "\nInformations about devices detected under OpenCL:"
716
      # For PyOpenCL import
717
      try:
718
        import pyopencl as cl
719
        Id=1
720
        for platform in cl.get_platforms():
721
          for device in platform.get_devices():
722
            deviceType=cl.device_type.to_string(device.type)
723
            print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
724
            Id=Id+1
725

    
726
        print
727
        sys.exit()
728
      except ImportError:
729
        print "Your platform does not seem to support OpenCL"
730
        
731
    elif opt == '-o':
732
      OutMetrology=True
733
      Metrology='OutMetro'
734
    elif opt == '-l':
735
      DoublePrecision=True
736
    elif opt == '-c':
737
      Curves=True
738
    elif opt == '-f':
739
      Fit=True
740
    elif opt in ("-a", "--alu"):
741
      Alu = arg
742
    elif opt in ("-d", "--device"):
743
      Device = int(arg)
744
    elif opt in ("-g", "--gpustyle"):
745
      GpuStyle = arg
746
    elif opt in ("-p", "--parastyle"):
747
      ParaStyle = arg
748
    elif opt in ("-i", "--iterations"):
749
      Iterations = numpy.uint64(arg)
750
    elif opt in ("-s", "--jobstart"):
751
      JobStart = int(arg)
752
    elif opt in ("-e", "--jobend"):
753
      JobEnd = int(arg)
754
    elif opt in ("-t", "--jobstep"):
755
      JobStep = int(arg)
756
    elif opt in ("-r", "--redo"):
757
      Redo = int(arg)
758

    
759
  if Alu=='CPU' and GpuStyle=='CUDA':
760
    print "Alu can't be CPU for CUDA, set Alu to GPU"
761
    Alu='GPU'
762

    
763
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
764
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
765
    ParaStyle='Threads'
766

    
767
  print "Compute unit : %s" % Alu
768
  print "Device Identification : %s" % Device
769
  print "GpuStyle used : %s" % GpuStyle
770
  print "Parallel Style used : %s" % ParaStyle
771
  print "Iterations : %s" % Iterations
772
  print "Number of threads on start : %s" % JobStart
773
  print "Number of threads on end : %s" % JobEnd
774
  print "Number of redo : %s" % Redo
775
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
776
  print "Double Precision in Kernels : %r" % DoublePrecision
777

    
778
  if GpuStyle=='CUDA':
779
    try:
780
      # For PyCUDA import
781
      import pycuda.driver as cuda
782
      import pycuda.gpuarray as gpuarray
783
      import pycuda.autoinit
784
      from pycuda.compiler import SourceModule
785
    except ImportError:
786
      print "Platform does not seem to support CUDA"
787

    
788
  if GpuStyle=='OpenCL':
789
    try:
790
      # For PyOpenCL import
791
      import pyopencl as cl
792
      Id=1
793
      for platform in cl.get_platforms():
794
        for device in platform.get_devices():
795
          deviceType=cl.device_type.to_string(device.type)
796
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
797
          if Id == Device:
798
            # Set the Alu as detected Device Type
799
            Alu=deviceType
800
          Id=Id+1
801
    except ImportError:
802
      print "Platform does not seem to support CUDA"
803
      
804
  average=numpy.array([]).astype(numpy.float32)
805
  median=numpy.array([]).astype(numpy.float32)
806
  stddev=numpy.array([]).astype(numpy.float32)
807

    
808
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
809

    
810
  Jobs=JobStart
811

    
812
  while Jobs <= JobEnd:
813
    avg,med,std=0,0,0
814
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
815
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
816

    
817
    if OutMetrology:
818
      duration=numpy.array([]).astype(numpy.float32)
819
      for i in range(Redo):
820
        start=time.time()
821
        if GpuStyle=='CUDA':
822
          try:
823
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
824
                                 DoublePrecision)
825
          except:
826
            print "Problem with %i // computations on Cuda" % Jobs
827
        elif GpuStyle=='OpenCL':
828
          try:
829
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
830
                                   Alu,Device,DoublePrecision)
831
          except:
832
            print "Problem with %i // computations on OpenCL" % Jobs            
833
        duration=numpy.append(duration,time.time()-start)
834
      if (a,m,s) != (0,0,0):
835
        avg=numpy.mean(duration)
836
        med=numpy.median(duration)
837
        std=numpy.std(duration)
838
      else:
839
        print "Values seem to be wrong..."
840
    else:
841
      if GpuStyle=='CUDA':
842
        try:
843
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,
844
                                     DoublePrecision)
845
        except:
846
          print "Problem with %i // computations on Cuda" % Jobs
847
      elif GpuStyle=='OpenCL':
848
        try:
849
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,DoublePrecision)
850
        except:
851
          print "Problem with %i // computations on OpenCL" % Jobs            
852

    
853
    if (avg,med,std) != (0,0,0):
854
      print "jobs,avg,med,std",Jobs,avg,med,std
855
      average=numpy.append(average,avg)
856
      median=numpy.append(median,med)
857
      stddev=numpy.append(stddev,std)
858
    else:
859
      print "Values seem to be wrong..."
860
    #THREADS*=2
861
    if DoublePrecision:
862
      Precision='DP'
863
    else:
864
      Precision='SP'
865
    if len(average)!=0:
866
      numpy.savez("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
867
      ToSave=[ ExploredJobs,average,median,stddev ]
868
      numpy.savetxt("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
869
    Jobs+=JobStep
870

    
871
  if Fit:
872
    FitAndPrint(ExploredJobs,median,Curves)
873