Statistiques
| Révision :

root / Splutter / GPU / SplutterGPU.py @ 60

Historique | Voir | Annoter | Télécharger (24,2 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import math
23
from socket import gethostname
24

    
25
# find prime factors of a number
# Got from WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  """Return the prime factors of x (with multiplicity) as a numpy array.

  Trial division: repeatedly divide out the smallest factor found.
  For x < 2 an empty array is returned.
  """
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      # Floor division keeps x an integer on both Python 2 and 3
      # (plain '/=' would produce a float on Python 3 and lose
      # precision for large inputs).
      x//=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
38

    
39
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
  """Return a divisor of jobs to use as the thread count per block.

  Interleaves the prime factors of jobs with their reversed order and
  multiplies them until the running product squared exceeds jobs, i.e.
  picks a divisor close to sqrt(jobs). Returns 1 when jobs is 1.
  """
  factors=PrimeFactors(jobs)
  # Pair the factor list with its reverse so small and large factors
  # alternate when the matrix is transposed and flattened.
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  # int() instead of long(): 'long' no longer exists on Python 3, and
  # Python 2 transparently promotes int to long when necessary, so this
  # is backward-compatible for all callers (used in launch tuples).
  return(int(threads))
50

    
51
# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  """Reduced Amdahl model: the serial share is implied as 1-p."""
  serial_share = 1 - p
  parallel_share = p / N
  return T1 * (serial_share + parallel_share)
54

    
55
# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  """Classic Amdahl model: serial fraction s plus parallel fraction p/N."""
  scaling = s + p / N
  return T1 * scaling
58

    
59
# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  """Amdahl-like model extended with a linear overhead term c*N."""
  amdahl_part = T1 * (s + p / N)
  linear_overhead = c * N
  return amdahl_part + linear_overhead
62

    
63
# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  """Amdahl-like model with linear (c1*N) and quadratic (c2*N*N) overheads."""
  base = T1 * (s + p / N)
  return base + c1 * N + c2 * N * N
66

    
67
KERNEL_CODE_CUDA="""
68

69
// Marsaglia RNG very simple implementation
70

71
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
72
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
73
#define MWC   (znew+wnew)
74
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
75
#define CONG  (jcong=69069*jcong+1234567)
76
#define KISS  ((MWC^CONG)+SHR3)
77

78
#define MWCfp MWC * 2.328306435454494e-10f
79
#define KISSfp KISS * 2.328306435454494e-10f
80

81
#define MAX 4294967296
82

83
uint rotl(uint value, int shift) {
84
   return (value << shift) | (value >> (sizeof(value) * 8 - shift));
85
}
86
 
87
uint rotr(uint value, int shift) {
88
   return (value >> shift) | (value << (sizeof(value) * 8 - shift));
89
}
90

91
__global__ void MainLoopBlocks(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
92
{
93
   // uint z=rotl(seed_z,blockIdx.x);
94
   // uint w=rotr(seed_w,blockIdx.x);
95

96
   // uint jsr=rotl(seed_z,blockIdx.x);
97
   // uint jcong=rotr(seed_w,blockIdx.x);
98

99
   uint z=seed_z/(blockIdx.x+1);
100
   uint w=seed_w%(blockIdx.x+1);
101

102
   uint jsr=seed_z/(blockIdx.x+1);
103
   uint jcong=seed_w%(blockIdx.x+1);
104

105
   for (ulong i=0;i<iterations;i++) {
106

107
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
108

109
   }
110
   __threadfence_block();
111
}
112

113
__global__ void MainLoopThreads(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
114
{ 
115
   // uint z=rotl(seed_z,threadIdx.x);
116
   // uint w=rotr(seed_w,threadIdx.x);
117

118
   // uint jsr=rotl(seed_z,threadIdx.x);
119
   // uint jcong=rotr(seed_w,threadIdx.x);
120

121
   uint z=seed_z;
122
   uint w=seed_w;
123

124
   uint jsr=seed_z;
125
   uint jcong=seed_w;
126

127
   for (ulong i=0;i<iterations;i++) {
128

129
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
130
   }
131

132
   __syncthreads();
133
}
134

135
__global__ void MainLoopHybrid(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
136
{
137
   uint z=seed_z;
138
   uint w=seed_w;
139

140
   uint jsr=seed_z;
141
   uint jcong=seed_w;
142

143
   for (ulong i=0;i<iterations;i++) {
144

145
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
146
   }
147

148
   __syncthreads();
149

150
}
151
"""
152

    
153
KERNEL_CODE_OPENCL="""
154
// Marsaglia RNG very simple implementation
155
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
156
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
157

158
#define MWC   (znew+wnew)
159
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
160
#define CONG  (jcong=69069*jcong+1234567)
161
#define KISS  ((MWC^CONG)+SHR3)
162

163
#define CONGfp CONG * 2.328306435454494e-10f
164
#define SHR3fp SHR3 * 2.328306435454494e-10f
165
#define MWCfp MWC * 2.328306435454494e-10f
166
#define KISSfp KISS * 2.328306435454494e-10f
167

168
#define MAX (ulong)4294967296
169

170
uint rotl(uint value, int shift) {
171
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
172
}
173
 
174
uint rotr(uint value, int shift) {
175
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
176
}
177

178
__kernel void MainLoopGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
179
{
180
   //__private const float id=(float)get_global_id(0);
181
   //__private const float size=(float)get_global_size(0);
182
   //__private const float block=space/size;
183

184
   __private const ulong id=(ulong)get_global_id(0);
185
   __private const ulong size=(ulong)get_global_size(0);
186
   __private const ulong block=(ulong)space/(ulong)size;
187
   
188
   __private uint z=seed_z-(uint)id;
189
   __private uint w=seed_w+(uint)id;
190

191
   __private uint jsr=seed_z;
192
   __private uint jcong=seed_w;
193

194
   for (__private ulong i=0;i<iterations;i++) {
195

196
      // Standard version does not work for several processes (some lost!)
197
      //__private uint position=(uint)(((ulong)space*(ulong)MWC)/(ulong)MAX);
198
      // Dense version
199
      //__private uint position=(uint)( ((ulong)CONG+MAX*(ulong)id)/(ulong)size*(ulong)space/(ulong)MAX );
200
      __private size_t position=(size_t)( ((ulong)MWC+(ulong)MAX*(ulong)id)*(ulong)block/(ulong)MAX );
201
      // Float version seems to be the best...
202
      //__private uint position=(uint)( block*(CONGfp+id) );
203

204
      s[position]++;
205
   }
206

207
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
208

209
}
210

211
__kernel void MainLoopLocal(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
212
{
213
   uint z=rotl(seed_z,get_local_id(0));
214
   uint w=rotr(seed_w,get_local_id(0));
215

216
   uint jsr=rotl(seed_z,get_local_id(0));
217
   uint jcong=rotr(seed_w,get_local_id(0));
218

219
   for (ulong i=0;i<iterations;i++) {
220

221
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=(uint)1;
222
   }
223

224

225
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
226
}
227

228
__kernel void MainLoopHybrid(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
229
{
230
   uint z=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
231
   uint w=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
232

233
   uint jsr=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
234
   uint jcong=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
235

236
   for (ulong i=0;i<iterations;i++) {
237

238
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
239

240
      barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
241
   }
242

243
      
244
}
245
"""
246

    
247
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
  """Run the splutter benchmark through PyCUDA and return timing stats.

  circle     : numpy array, wrapped for the device but otherwise unused here
  iterations : total number of RNG draws, split evenly over 'jobs' workers
  steps      : number of repetitions used to build the statistics
  jobs       : number of parallel workers (grid blocks and/or threads)
  ParaStyle  : 'Blocks', 'Hybrid', or anything else for the Threads layout

  Returns (mean, median, stddev) of per-step elapsed GPU time in seconds.
  Relies on names imported in __main__ (cuda, SourceModule, pycuda) and on
  the global 'Alu' for its progress messages.
  """

  # With PyCUDA autoinit there is nothing else to initialise.
  
  circleCU = cuda.InOut(circle)
  
  # Compile the CUDA kernels defined above at runtime.
  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
  
  # CUDA events used for device-side timing of each launch.
  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()
  
  MySplutter=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  # Ceiling division: each worker gets iterations/jobs draws, rounded up.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    
  iterationsNew=iterationsCL*jobs

  for i in range(steps):

    # Fresh 1024-bin histogram for every repetition.
    Splutter=numpy.zeros(1024).astype(numpy.uint32)
    
    print Splutter

    SplutterCU = cuda.InOut(Splutter)

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      # One thread per block, 'jobs' blocks.
      MetropolisBlocksCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)      
    elif ParaStyle=='Hybrid':
      # Split 'jobs' into blocks*threads with threads near sqrt(jobs).
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # Threads layout: a single block holding all 'jobs' threads.
      MetropolisJobsCU(SplutterCU,
                       numpy.uint32(len(Splutter)),
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()
                
    # Event time is reported in milliseconds; convert to seconds.
    elapsed = start.time_till(stop)*1e-3

    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)

    MyDuration[i]=elapsed

    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    #MyPi[i]=numpy.median(AllPi)
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]


  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
333

    
334

    
335
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
  """Run the splutter benchmark through PyOpenCL and return timing stats.

  circle     : numpy array (unused here, kept for interface symmetry)
  iterations : total number of RNG draws, split evenly over 'jobs' workers
  steps      : number of repetitions used to build the statistics
  jobs       : number of parallel work-items
  ParaStyle  : 'Blocks', 'Hybrid', or anything else for the Local layout
  Alu        : 'CPU' or 'GPU', used to select a device when Device==0
  Device     : 1-based device index; 0 means "first device matching Alu"

  Returns (mean, median, stddev) of per-step kernel time in seconds, or
  (0,0,0) when no matching device is found. Requires pyopencl as 'cl'
  (imported in __main__).
  """
        
  # Initialise the variables with the proper casts.

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected: ",device.name
          HasXPU=True
          MemoryXPU=deviceMemory
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
          XPU=device
          print "CPU selected: ",device.name
          HasXPU=True
          MemoryXPU=deviceMemory
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected: ",device.name
          HasXPU=True
          MemoryXPU=deviceMemory
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  # Create the context and the command queue for execution (with
  # profiling enabled so kernel times can be read back).
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Fetch the available flags for buffer creation.
  mf = cl.mem_flags
        
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
  
  MyDuration=numpy.zeros(steps)
  
  # Ceiling division: each work-item gets iterations/jobs draws, rounded up.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)

  iterationsNew=numpy.uint64(iterationsCL*jobs)

  print iterations,iterationsCL,iterationsNew

  MySplutter=numpy.zeros(steps)

  # Size the histogram from the device's max allocation (in uint32
  # elements), divided by 8 as a safety margin.
  MaxWorks=2**(int)(numpy.log2(MemoryXPU/4))/8
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU/4))

  for i in range(steps):
                
    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)
 
    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)
    # Round the histogram size down to a multiple of 'jobs'.
    Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)

    print Splutter,len(Splutter)

    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)

    if ParaStyle=='Blocks':
      # Call OpenCL kernel
      # (1,) is Global work size (only 1 work size)
      # (1,) is local work size
      # circleCL is lattice translated in CL format
      # SeedZCL is lattice translated in CL format
      # SeedWCL is lattice translated in CL format
      # step is number of iterations
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
      #                                      SplutterCL,
      #                                      numpy.uint32(len(Splutter)),
      #                                      numpy.uint64(iterationsCL),
      #                                      numpy.uint32(nprnd(2**30/jobs)),
      #                                      numpy.uint32(nprnd(2**30/jobs)))
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(521288629),
                                           numpy.uint32(362436069))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL the global size must stay 'jobs' while the local size
      # is the tuned thread count.
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL the local size must equal the global size here so all
      # work-items share one work-group.
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                          SplutterCL,
                                          numpy.uint32(len(Splutter)),
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()

    # Profiling counters are in nanoseconds; convert to seconds.
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    MyDuration[i]=elapsed
    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)

    SplutterCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
        
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
475

    
476

    
477
def FitAndPrint(N,D,Curves):
478

    
479
  from scipy.optimize import curve_fit
480
  import matplotlib.pyplot as plt
481

    
482
  try:
483
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
484

    
485
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
486
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
487
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
488
    coeffs_Amdahl[0]=D[0]
489
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
490
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
491
  except:
492
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
493

    
494
  try:
495
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
496

    
497
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
498
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
499
    coeffs_AmdahlR[0]=D[0]
500
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
501
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
502

    
503
  except:
504
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
505

    
506
  try:
507
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
508

    
509
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
510
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
511
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
512
    coeffs_Mylq[0]=D[0]
513
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
514
                                                            coeffs_Mylq[1],
515
                                                            coeffs_Mylq[3],
516
                                                            coeffs_Mylq[2])
517
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
518
                coeffs_Mylq[3])
519
  except:
520
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
521

    
522
  try:
523
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
524

    
525
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
526
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
527
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
528
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
529
    coeffs_Mylq2[0]=D[0]
530
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
531
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
532
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
533

    
534
  except:
535
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
536

    
537
  if Curves:
538
    plt.xlabel("Number of Threads/work Items")
539
    plt.ylabel("Total Elapsed Time")
540

    
541
    Experience,=plt.plot(N,D,'ro') 
542
    try:
543
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
544
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
545
    except:
546
      print "Fit curves seem not to be available"
547

    
548
    plt.legend()
549
    plt.show()
550

    
551
# Command-line driver: parse options, pick the compute backend (CUDA or
# OpenCL), sweep the number of jobs from JobStart to JobEnd, collect
# timing statistics, save them to .npz/.txt files, and optionally fit
# and plot the scaling laws.
if __name__=='__main__':

  # Set defaults values
  
  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Id of GPU : 1 is for first find !
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Blocks'
  # Iterations is integer
  Iterations=100000000
  # JobStart in first number of Jobs to explore
  JobStart=1
  # JobEnd is last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the times to redo the test to improve metrology
  Redo=1
  # OutMetrology is method for duration estimation : False is GPU inside
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to print the curves
  Curves=False
  # Fit is True to print the curves
  Fit=False

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
    sys.exit(2)
    
  for opt, arg in opts:
    if opt == '-h':
      # Help: print usage, list OpenCL devices if available, then exit.
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]

      print "\nInformations about devices detected under OpenCL:"
      # For PyOpenCL import
      try:
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
          for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            deviceMemory=device.max_mem_alloc_size
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
            Id=Id+1

        print
        sys.exit()
      except ImportError:
        print "Your platform does not seem to support OpenCL"
        
    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-c':
      Curves=True
    elif opt == '-f':
      Fit=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

  # CUDA only drives NVidia GPUs: force the ALU type accordingly.
  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
          if Id == Device:
            # Set the Alu as detected Device Type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      print "Platform does not seem to support CUDA"
      
  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  # Sweep the number of jobs, collecting (mean, median, stddev) per point.
  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      # Out-of-core metrology: time each whole call from the host side.
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs            
        duration=numpy.append(duration,time.time()-start)
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print "Values seem to be wrong..."
    else:
      # In-core metrology: use the device-side timings from the backend.
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs            

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    # Save intermediate results after every sweep point so a crash does
    # not lose the data collected so far.
    if len(average)!=0:
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)
741