Statistiques
| Révision :

root / Splutter / GPU / SplutterGPU.py @ 62

Historique | Voir | Annoter | Télécharger (25,22 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import math
23
from socket import gethostname
24

    
25
# find prime factors of a number
26
# Get for WWW :
27
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
28
def PrimeFactors(x):
29
  factorlist=numpy.array([]).astype('uint32')
30
  loop=2
31
  while loop<=x:
32
    if x%loop==0:
33
      x/=loop
34
      factorlist=numpy.append(factorlist,[loop])
35
    else:
36
      loop+=1
37
  return factorlist
38

    
39
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
40
# output is thread number
41
def BestThreadsNumber(jobs):
42
  factors=PrimeFactors(jobs)
43
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
44
  threads=1
45
  for factor in matrix.transpose().ravel():
46
    threads=threads*factor
47
    if threads*threads>jobs:
48
      break
49
  return(long(threads))
50

    
51
# Predicted Amdahl Law (Reduced with s=1-p)  
52
def AmdahlR(N, T1, p):
53
  return (T1*(1-p+p/N))
54

    
55
# Predicted Amdahl Law
56
def Amdahl(N, T1, s, p):
57
  return (T1*(s+p/N))
58

    
59
# Predicted Mylq Law with first order
60
def Mylq(N, T1,s,c,p):
61
  return (T1*(s+p/N)+c*N)
62

    
63
# Predicted Mylq Law with second order
64
def Mylq2(N, T1,s,c1,c2,p):
65
  return (T1*(s+p/N)+c1*N+c2*N*N)
66

    
67
KERNEL_CODE_CUDA="""
68

69
// Marsaglia RNG very simple implementation
70

71
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
72
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
73
#define MWC   (znew+wnew)
74
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
75
#define CONG  (jcong=69069*jcong+1234567)
76
#define KISS  ((MWC^CONG)+SHR3)
77

78
#define MWCfp MWC * 2.328306435454494e-10f
79
#define KISSfp KISS * 2.328306435454494e-10f
80

81
#define MAX 4294967296
82

83
uint rotl(uint value, int shift) {
84
   return (value << shift) | (value >> (sizeof(value) * 8 - shift));
85
}
86
 
87
uint rotr(uint value, int shift) {
88
   return (value >> shift) | (value << (sizeof(value) * 8 - shift));
89
}
90

91
__global__ void MainLoopBlocks(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
92
{
93
   // uint z=rotl(seed_z,blockIdx.x);
94
   // uint w=rotr(seed_w,blockIdx.x);
95

96
   // uint jsr=rotl(seed_z,blockIdx.x);
97
   // uint jcong=rotr(seed_w,blockIdx.x);
98

99
   uint z=seed_z/(blockIdx.x+1);
100
   uint w=seed_w%(blockIdx.x+1);
101

102
   uint jsr=seed_z/(blockIdx.x+1);
103
   uint jcong=seed_w%(blockIdx.x+1);
104

105
   for (ulong i=0;i<iterations;i++) {
106

107
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
108

109
   }
110
   __threadfence_block();
111
}
112

113
__global__ void MainLoopThreads(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
114
{ 
115
   // uint z=rotl(seed_z,threadIdx.x);
116
   // uint w=rotr(seed_w,threadIdx.x);
117

118
   // uint jsr=rotl(seed_z,threadIdx.x);
119
   // uint jcong=rotr(seed_w,threadIdx.x);
120

121
   uint z=seed_z;
122
   uint w=seed_w;
123

124
   uint jsr=seed_z;
125
   uint jcong=seed_w;
126

127
   for (ulong i=0;i<iterations;i++) {
128

129
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
130
   }
131

132
   __syncthreads();
133
}
134

135
__global__ void MainLoopHybrid(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
136
{
137
   uint z=seed_z;
138
   uint w=seed_w;
139

140
   uint jsr=seed_z;
141
   uint jcong=seed_w;
142

143
   for (ulong i=0;i<iterations;i++) {
144

145
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
146
   }
147

148
   __syncthreads();
149

150
}
151
"""
152

    
153
KERNEL_CODE_OPENCL="""
154
// Marsaglia RNG very simple implementation
155
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
156
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
157

158
#define MWC   (znew+wnew)
159
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
160
#define CONG  (jcong=69069*jcong+1234567)
161
#define KISS  ((MWC^CONG)+SHR3)
162

163
#define CONGfp CONG * 2.328306435454494e-10f
164
#define SHR3fp SHR3 * 2.328306435454494e-10f
165
#define MWCfp MWC * 2.328306435454494e-10f
166
#define KISSfp KISS * 2.328306435454494e-10f
167

168
#define MAX (ulong)4294967296
169

170
uint rotl(uint value, int shift) {
171
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
172
}
173
 
174
uint rotr(uint value, int shift) {
175
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
176
}
177

178
__kernel void MainLoopGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
179
{
180
   //__private const float id=(float)get_global_id(0);
181
   //__private const float size=(float)get_global_size(0);
182
   //__private const float block=space/size;
183

184
   __private const ulong id=(ulong)get_global_id(0);
185
   __private const ulong size=(ulong)get_global_size(0);
186
   __private const ulong block=(ulong)space/(ulong)size;
187
   
188
   __private uint z=seed_z-(uint)id;
189
   __private uint w=seed_w+(uint)id;
190

191
   __private uint jsr=seed_z;
192
   __private uint jcong=seed_w;
193

194
   for (__private ulong i=0;i<iterations;i++) {
195

196
      // Standard version does not work for several processes (some lost!) memory unconsistent
197
      //__private size_t position=(size_t)(((ulong)space*(ulong)MWC)/(ulong)MAX);
198
      
199
      // Dense version
200
      //__private size_t position=(size_t)( ((ulong)MWC+(ulong)id*(ulong)MAX)*(ulong)block/(ulong)MAX );
201

202
      // Sparse version
203
      //__private size_t position=(size_t)( ((ulong)MWC+(ulong)id*(ulong)MAX)*(ulong)block/(ulong)MAX );
204
      //__private size_t position=(size_t)( ((ulong)MWC*(block)+(ulong)id*(ulong)MAX)/(ulong)MAX );
205
      // First
206
      //__private size_t position=(size_t)( (ulong)(0)*(ulong)size+(ulong)id );
207
      // Last
208
      //__private size_t position=(size_t)( (ulong)(block-1)*(ulong)size+(ulong)id );
209
      // General
210
      __private size_t position=(size_t)( (ulong)MWC*(ulong)(block)/(ulong)MAX*(ulong)size+(ulong)id );
211

212
      // Float version seems to be the best...
213
      //__private uint position=(uint)( block*(CONGfp+id) );
214

215
      s[position]++;
216
   }
217

218
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
219

220
}
221

222
__kernel void MainLoopLocal(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
223
{
224
   uint z=rotl(seed_z,get_local_id(0));
225
   uint w=rotr(seed_w,get_local_id(0));
226

227
   uint jsr=rotl(seed_z,get_local_id(0));
228
   uint jcong=rotr(seed_w,get_local_id(0));
229

230
   for (ulong i=0;i<iterations;i++) {
231

232
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=(uint)1;
233
   }
234

235

236
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
237
}
238

239
__kernel void MainLoopHybrid(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
240
{
241
   uint z=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
242
   uint w=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
243

244
   uint jsr=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
245
   uint jcong=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
246

247
   for (ulong i=0;i<iterations;i++) {
248

249
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
250

251
      barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
252
   }
253

254
      
255
}
256
"""
257

    
258
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
259

    
260
  # Avec PyCUDA autoinit, rien a faire !
261
  
262
  circleCU = cuda.InOut(circle)
263
  
264
  mod = SourceModule(KERNEL_CODE_CUDA)
265

    
266
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
267
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
268
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
269
  
270
  start = pycuda.driver.Event()
271
  stop = pycuda.driver.Event()
272
  
273
  MySplutter=numpy.zeros(steps)
274
  MyDuration=numpy.zeros(steps)
275

    
276
  if iterations%jobs==0:
277
    iterationsCL=numpy.uint64(iterations/jobs)
278
  else:
279
    iterationsCL=numpy.uint64(iterations/jobs+1)
280
    
281
  iterationsNew=iterationsCL*jobs
282

    
283
  for i in range(steps):
284

    
285
    Splutter=numpy.zeros(1024).astype(numpy.uint32)
286
    
287
    print Splutter
288

    
289
    SplutterCU = cuda.InOut(Splutter)
290

    
291
    start.record()
292
    start.synchronize()
293
    if ParaStyle=='Blocks':
294
      MetropolisBlocksCU(SplutterCU,
295
                         numpy.uint32(len(Splutter)),
296
                         numpy.uint64(iterationsCL),
297
                         numpy.uint32(nprnd(2**30/jobs)),
298
                         numpy.uint32(nprnd(2**30/jobs)),
299
                         grid=(jobs,1),
300
                         block=(1,1,1))
301
        
302
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
303
            (Alu,jobs,1,ParaStyle)      
304
    elif ParaStyle=='Hybrid':
305
      threads=BestThreadsNumber(jobs)
306
      MetropolisHybridCU(SplutterCU,
307
                         numpy.uint32(len(Splutter)),
308
                         numpy.uint64(iterationsCL),
309
                         numpy.uint32(nprnd(2**30/jobs)),
310
                         numpy.uint32(nprnd(2**30/jobs)),
311
                         grid=(jobs,1),
312
                         block=(threads,1,1))
313
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
314
            (Alu,jobs/threads,threads,ParaStyle)
315
    else:
316
      MetropolisJobsCU(SplutterCU,
317
                       numpy.uint32(len(Splutter)),
318
                       numpy.uint64(iterationsCL),
319
                       numpy.uint32(nprnd(2**30/jobs)),
320
                       numpy.uint32(nprnd(2**30/jobs)),
321
                       grid=(1,1),
322
                       block=(jobs,1,1))
323
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
324
            (Alu,jobs,1,ParaStyle)
325
    stop.record()
326
    stop.synchronize()
327
                
328
    elapsed = start.time_till(stop)*1e-3
329

    
330
    print Splutter,sum(Splutter)
331
    MySplutter[i]=numpy.median(Splutter)
332
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)
333

    
334
    MyDuration[i]=elapsed
335

    
336
    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
337
    #MyPi[i]=numpy.median(AllPi)
338
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]
339

    
340

    
341
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
342

    
343
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
344

    
345

    
346
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
347
        
348
  # Initialisation des variables en les CASTant correctement
349

    
350
  MaxMemoryXPU=0
351
  MinMemoryXPU=0
352

    
353
  if Device==0:
354
    print "Enter XPU selector based on ALU type: first selected"
355
    HasXPU=False
356
    # Default Device selection based on ALU Type
357
    for platform in cl.get_platforms():
358
      for device in platform.get_devices():
359
        deviceType=cl.device_type.to_string(device.type)
360
        deviceMemory=device.max_mem_alloc_size
361
        if deviceMemory>MaxMemoryXPU:
362
          MaxMemoryXPU=deviceMemory
363
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
364
          MinMemoryXPU=deviceMemory
365
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
366
          XPU=device
367
          print "GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
368
          HasXPU=True
369
          MemoryXPU=deviceMemory
370
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
371
          XPU=device
372
          print "CPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
373
          HasXPU=True
374
          MemoryXPU=deviceMemory
375
          
376
  else:
377
    print "Enter XPU selector based on device number & ALU type"
378
    Id=1
379
    HasXPU=False
380
    # Primary Device selection based on Device Id
381
    for platform in cl.get_platforms():
382
      for device in platform.get_devices():
383
        deviceType=cl.device_type.to_string(device.type)
384
        deviceMemory=device.max_mem_alloc_size
385
        if deviceMemory>MaxMemoryXPU:
386
          MaxMemoryXPU=deviceMemory
387
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
388
          MinMemoryXPU=deviceMemory
389
        if Id==Device and Alu==deviceType and HasXPU==False:
390
          XPU=device
391
          print "CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
392
          HasXPU=True
393
          MemoryXPU=deviceMemory
394
        Id=Id+1
395
    if HasXPU==False:
396
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
397
          (Device,Alu,Id-1)
398
      return(0,0,0)
399

    
400
  print "Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU)
401

    
402
  # Je cree le contexte et la queue pour son execution
403
  ctx = cl.Context([XPU])
404
  queue = cl.CommandQueue(ctx,
405
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
406

    
407
  # Je recupere les flag possibles pour les buffers
408
  mf = cl.mem_flags
409
        
410
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
411
  
412
  MyDuration=numpy.zeros(steps)
413
  
414
  if iterations%jobs==0:
415
    iterationsCL=numpy.uint64(iterations/jobs)
416
  else:
417
    iterationsCL=numpy.uint64(iterations/jobs+1)
418

    
419
  iterationsNew=numpy.uint64(iterationsCL*jobs)
420

    
421
  MySplutter=numpy.zeros(steps)
422

    
423
  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
424
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))
425
  
426
  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
427
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
428

    
429
  for i in range(steps):
430
                
431
    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
432
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)
433
 
434
    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)
435

    
436
    Splutter[:]=0
437

    
438
    print Splutter,len(Splutter)
439

    
440
    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)
441

    
442
    if ParaStyle=='Blocks':
443
      # Call OpenCL kernel
444
      # (1,) is Global work size (only 1 work size)
445
      # (1,) is local work size
446
      # circleCL is lattice translated in CL format
447
      # SeedZCL is lattice translated in CL format
448
      # SeedWCL is lattice translated in CL format
449
      # step is number of iterations
450
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
451
      #                                      SplutterCL,
452
      #                                      numpy.uint32(len(Splutter)),
453
      #                                      numpy.uint64(iterationsCL),
454
      #                                      numpy.uint32(nprnd(2**30/jobs)),
455
      #                                      numpy.uint32(nprnd(2**30/jobs)))
456
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
457
                                           SplutterCL,
458
                                           numpy.uint32(len(Splutter)),
459
                                           numpy.uint64(iterationsCL),
460
                                           numpy.uint32(521288629),
461
                                           numpy.uint32(362436069))
462
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
463
            (Alu,jobs,1,ParaStyle)
464
    elif ParaStyle=='Hybrid':
465
      threads=BestThreadsNumber(jobs)
466
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
467
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
468
                                           SplutterCL,
469
                                           numpy.uint32(len(Splutter)),
470
                                           numpy.uint64(iterationsCL),
471
                                           numpy.uint32(nprnd(2**30/jobs)),
472
                                           numpy.uint32(nprnd(2**30/jobs)))
473
        
474
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
475
            (Alu,jobs/threads,threads,ParaStyle)
476
    else:
477
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
478
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
479
                                          SplutterCL,
480
                                          numpy.uint32(len(Splutter)),
481
                                          numpy.uint64(iterationsCL),
482
                                          numpy.uint32(nprnd(2**30/jobs)),
483
                                          numpy.uint32(nprnd(2**30/jobs)))
484
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
485

    
486
    CLLaunch.wait()
487
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()
488

    
489
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
490

    
491
    MyDuration[i]=elapsed
492
    print Splutter,sum(Splutter)
493
    #MySplutter[i]=numpy.median(Splutter)
494
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)
495

    
496
    SplutterCL.release()
497

    
498
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
499
        
500
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
501

    
502

    
503
def FitAndPrint(N,D,Curves):
504

    
505
  from scipy.optimize import curve_fit
506
  import matplotlib.pyplot as plt
507

    
508
  try:
509
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
510

    
511
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
512
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
513
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
514
    coeffs_Amdahl[0]=D[0]
515
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
516
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
517
  except:
518
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
519

    
520
  try:
521
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
522

    
523
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
524
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
525
    coeffs_AmdahlR[0]=D[0]
526
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
527
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
528

    
529
  except:
530
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
531

    
532
  try:
533
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
534

    
535
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
536
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
537
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
538
    coeffs_Mylq[0]=D[0]
539
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
540
                                                            coeffs_Mylq[1],
541
                                                            coeffs_Mylq[3],
542
                                                            coeffs_Mylq[2])
543
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
544
                coeffs_Mylq[3])
545
  except:
546
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
547

    
548
  try:
549
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
550

    
551
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
552
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
553
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
554
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
555
    coeffs_Mylq2[0]=D[0]
556
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
557
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
558
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
559

    
560
  except:
561
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
562

    
563
  if Curves:
564
    plt.xlabel("Number of Threads/work Items")
565
    plt.ylabel("Total Elapsed Time")
566

    
567
    Experience,=plt.plot(N,D,'ro') 
568
    try:
569
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
570
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
571
    except:
572
      print "Fit curves seem not to be available"
573

    
574
    plt.legend()
575
    plt.show()
576

    
577
if __name__=='__main__':
578

    
579
  # Set defaults values
580
  
581
  # Alu can be CPU, GPU or ACCELERATOR
582
  Alu='CPU'
583
  # Id of GPU : 1 is for first find !
584
  Device=0
585
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
586
  GpuStyle='OpenCL'
587
  # Parallel distribution can be on Threads or Blocks
588
  ParaStyle='Blocks'
589
  # Iterations is integer
590
  Iterations=100000000
591
  # JobStart in first number of Jobs to explore
592
  JobStart=1
593
  # JobEnd is last number of Jobs to explore
594
  JobEnd=16
595
  # JobStep is the step of Jobs to explore
596
  JobStep=1
597
  # Redo is the times to redo the test to improve metrology
598
  Redo=1
599
  # OutMetrology is method for duration estimation : False is GPU inside
600
  OutMetrology=False
601
  Metrology='InMetro'
602
  # Curves is True to print the curves
603
  Curves=False
604
  # Fit is True to print the curves
605
  Fit=False
606

    
607
  try:
608
    opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
609
  except getopt.GetoptError:
610
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
611
    sys.exit(2)
612
    
613
  for opt, arg in opts:
614
    if opt == '-h':
615
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
616

    
617
      print "\nInformations about devices detected under OpenCL:"
618
      # For PyOpenCL import
619
      try:
620
        import pyopencl as cl
621
        Id=1
622
        for platform in cl.get_platforms():
623
          for device in platform.get_devices():
624
            deviceType=cl.device_type.to_string(device.type)
625
            deviceMemory=device.max_mem_alloc_size
626
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
627
            Id=Id+1
628

    
629
        print
630
        sys.exit()
631
      except ImportError:
632
        print "Your platform does not seem to support OpenCL"
633
        
634
    elif opt == '-o':
635
      OutMetrology=True
636
      Metrology='OutMetro'
637
    elif opt == '-c':
638
      Curves=True
639
    elif opt == '-f':
640
      Fit=True
641
    elif opt in ("-a", "--alu"):
642
      Alu = arg
643
    elif opt in ("-d", "--device"):
644
      Device = int(arg)
645
    elif opt in ("-g", "--gpustyle"):
646
      GpuStyle = arg
647
    elif opt in ("-p", "--parastyle"):
648
      ParaStyle = arg
649
    elif opt in ("-i", "--iterations"):
650
      Iterations = numpy.uint64(arg)
651
    elif opt in ("-s", "--jobstart"):
652
      JobStart = int(arg)
653
    elif opt in ("-e", "--jobend"):
654
      JobEnd = int(arg)
655
    elif opt in ("-t", "--jobstep"):
656
      JobStep = int(arg)
657
    elif opt in ("-r", "--redo"):
658
      Redo = int(arg)
659

    
660
  if Alu=='CPU' and GpuStyle=='CUDA':
661
    print "Alu can't be CPU for CUDA, set Alu to GPU"
662
    Alu='GPU'
663

    
664
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
665
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
666
    ParaStyle='Threads'
667

    
668
  print "Compute unit : %s" % Alu
669
  print "Device Identification : %s" % Device
670
  print "GpuStyle used : %s" % GpuStyle
671
  print "Parallel Style used : %s" % ParaStyle
672
  print "Iterations : %s" % Iterations
673
  print "Number of threads on start : %s" % JobStart
674
  print "Number of threads on end : %s" % JobEnd
675
  print "Number of redo : %s" % Redo
676
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
677

    
678
  if GpuStyle=='CUDA':
679
    try:
680
      # For PyCUDA import
681
      import pycuda.driver as cuda
682
      import pycuda.gpuarray as gpuarray
683
      import pycuda.autoinit
684
      from pycuda.compiler import SourceModule
685
    except ImportError:
686
      print "Platform does not seem to support CUDA"
687

    
688
  if GpuStyle=='OpenCL':
689
    try:
690
      # For PyOpenCL import
691
      import pyopencl as cl
692
      Id=1
693
      for platform in cl.get_platforms():
694
        for device in platform.get_devices():
695
          deviceType=cl.device_type.to_string(device.type)
696
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
697
          if Id == Device:
698
            # Set the Alu as detected Device Type
699
            Alu=deviceType
700
          Id=Id+1
701
    except ImportError:
702
      print "Platform does not seem to support CUDA"
703
      
704
  average=numpy.array([]).astype(numpy.float32)
705
  median=numpy.array([]).astype(numpy.float32)
706
  stddev=numpy.array([]).astype(numpy.float32)
707

    
708
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
709

    
710
  Jobs=JobStart
711

    
712
  while Jobs <= JobEnd:
713
    avg,med,std=0,0,0
714
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
715
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
716

    
717
    if OutMetrology:
718
      duration=numpy.array([]).astype(numpy.float32)
719
      for i in range(Redo):
720
        start=time.time()
721
        if GpuStyle=='CUDA':
722
          try:
723
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
724
          except:
725
            print "Problem with %i // computations on Cuda" % Jobs
726
        elif GpuStyle=='OpenCL':
727
          try:
728
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
729
                                   Alu,Device)
730
          except:
731
            print "Problem with %i // computations on OpenCL" % Jobs            
732
        duration=numpy.append(duration,time.time()-start)
733
      if (a,m,s) != (0,0,0):
734
        avg=numpy.mean(duration)
735
        med=numpy.median(duration)
736
        std=numpy.std(duration)
737
      else:
738
        print "Values seem to be wrong..."
739
    else:
740
      if GpuStyle=='CUDA':
741
        try:
742
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
743
        except:
744
          print "Problem with %i // computations on Cuda" % Jobs
745
      elif GpuStyle=='OpenCL':
746
        try:
747
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
748
        except:
749
          print "Problem with %i // computations on OpenCL" % Jobs            
750

    
751
    if (avg,med,std) != (0,0,0):
752
      print "jobs,avg,med,std",Jobs,avg,med,std
753
      average=numpy.append(average,avg)
754
      median=numpy.append(median,med)
755
      stddev=numpy.append(stddev,std)
756
    else:
757
      print "Values seem to be wrong..."
758
    #THREADS*=2
759
    if len(average)!=0:
760
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
761
      ToSave=[ ExploredJobs,average,median,stddev ]
762
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
763
    Jobs+=JobStep
764

    
765
  if Fit:
766
    FitAndPrint(ExploredJobs,median,Curves)
767