Statistiques
| Révision :

root / Splutter / GPU / SplutterGPU.py @ 61

Historique | Voir | Annoter | Télécharger (24,71 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import math
23
from socket import gethostname
24

    
25
# find prime factors of a number
26
# Get for WWW :
27
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
28
def PrimeFactors(x):
29
  factorlist=numpy.array([]).astype('uint32')
30
  loop=2
31
  while loop<=x:
32
    if x%loop==0:
33
      x/=loop
34
      factorlist=numpy.append(factorlist,[loop])
35
    else:
36
      loop+=1
37
  return factorlist
38

    
39
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
40
# output is thread number
41
def BestThreadsNumber(jobs):
42
  factors=PrimeFactors(jobs)
43
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
44
  threads=1
45
  for factor in matrix.transpose().ravel():
46
    threads=threads*factor
47
    if threads*threads>jobs:
48
      break
49
  return(long(threads))
50

    
51
# Predicted Amdahl Law (Reduced with s=1-p)  
52
def AmdahlR(N, T1, p):
53
  return (T1*(1-p+p/N))
54

    
55
# Predicted Amdahl Law
56
def Amdahl(N, T1, s, p):
57
  return (T1*(s+p/N))
58

    
59
# Predicted Mylq Law with first order
60
def Mylq(N, T1,s,c,p):
61
  return (T1*(s+p/N)+c*N)
62

    
63
# Predicted Mylq Law with second order
64
def Mylq2(N, T1,s,c1,c2,p):
65
  return (T1*(s+p/N)+c1*N+c2*N*N)
66

    
67
KERNEL_CODE_CUDA="""
68

69
// Marsaglia RNG very simple implementation
70

71
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
72
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
73
#define MWC   (znew+wnew)
74
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
75
#define CONG  (jcong=69069*jcong+1234567)
76
#define KISS  ((MWC^CONG)+SHR3)
77

78
#define MWCfp MWC * 2.328306435454494e-10f
79
#define KISSfp KISS * 2.328306435454494e-10f
80

81
#define MAX 4294967296
82

83
uint rotl(uint value, int shift) {
84
   return (value << shift) | (value >> (sizeof(value) * 8 - shift));
85
}
86
 
87
uint rotr(uint value, int shift) {
88
   return (value >> shift) | (value << (sizeof(value) * 8 - shift));
89
}
90

91
__global__ void MainLoopBlocks(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
92
{
93
   // uint z=rotl(seed_z,blockIdx.x);
94
   // uint w=rotr(seed_w,blockIdx.x);
95

96
   // uint jsr=rotl(seed_z,blockIdx.x);
97
   // uint jcong=rotr(seed_w,blockIdx.x);
98

99
   uint z=seed_z/(blockIdx.x+1);
100
   uint w=seed_w%(blockIdx.x+1);
101

102
   uint jsr=seed_z/(blockIdx.x+1);
103
   uint jcong=seed_w%(blockIdx.x+1);
104

105
   for (ulong i=0;i<iterations;i++) {
106

107
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
108

109
   }
110
   __threadfence_block();
111
}
112

113
__global__ void MainLoopThreads(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
114
{ 
115
   // uint z=rotl(seed_z,threadIdx.x);
116
   // uint w=rotr(seed_w,threadIdx.x);
117

118
   // uint jsr=rotl(seed_z,threadIdx.x);
119
   // uint jcong=rotr(seed_w,threadIdx.x);
120

121
   uint z=seed_z;
122
   uint w=seed_w;
123

124
   uint jsr=seed_z;
125
   uint jcong=seed_w;
126

127
   for (ulong i=0;i<iterations;i++) {
128

129
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
130
   }
131

132
   __syncthreads();
133
}
134

135
__global__ void MainLoopHybrid(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
136
{
137
   uint z=seed_z;
138
   uint w=seed_w;
139

140
   uint jsr=seed_z;
141
   uint jcong=seed_w;
142

143
   for (ulong i=0;i<iterations;i++) {
144

145
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
146
   }
147

148
   __syncthreads();
149

150
}
151
"""
152

    
153
KERNEL_CODE_OPENCL="""
154
// Marsaglia RNG very simple implementation
155
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
156
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
157

158
#define MWC   (znew+wnew)
159
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
160
#define CONG  (jcong=69069*jcong+1234567)
161
#define KISS  ((MWC^CONG)+SHR3)
162

163
#define CONGfp CONG * 2.328306435454494e-10f
164
#define SHR3fp SHR3 * 2.328306435454494e-10f
165
#define MWCfp MWC * 2.328306435454494e-10f
166
#define KISSfp KISS * 2.328306435454494e-10f
167

168
#define MAX (ulong)4294967296
169

170
uint rotl(uint value, int shift) {
171
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
172
}
173
 
174
uint rotr(uint value, int shift) {
175
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
176
}
177

178
__kernel void MainLoopGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
179
{
180
   //__private const float id=(float)get_global_id(0);
181
   //__private const float size=(float)get_global_size(0);
182
   //__private const float block=space/size;
183

184
   __private const ulong id=(ulong)get_global_id(0);
185
   __private const ulong size=(ulong)get_global_size(0);
186
   __private const ulong block=(ulong)space/(ulong)size;
187
   
188
   __private uint z=seed_z-(uint)id;
189
   __private uint w=seed_w+(uint)id;
190

191
   __private uint jsr=seed_z;
192
   __private uint jcong=seed_w;
193

194
   for (__private ulong i=0;i<iterations;i++) {
195

196
      // Standard version does not work for several processes (some lost!)
197
      //__private uint position=(uint)(((ulong)space*(ulong)MWC)/(ulong)MAX);
198
      // Dense version
199
      //__private uint position=(uint)( ((ulong)CONG+MAX*(ulong)id)/(ulong)size*(ulong)space/(ulong)MAX );
200
      __private size_t position=(size_t)( ((ulong)MWC+(ulong)MAX*(ulong)id)*(ulong)block/(ulong)MAX );
201
      // Float version seems to be the best...
202
      //__private uint position=(uint)( block*(CONGfp+id) );
203

204
      s[position]++;
205
   }
206

207
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
208

209
}
210

211
__kernel void MainLoopLocal(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
212
{
213
   uint z=rotl(seed_z,get_local_id(0));
214
   uint w=rotr(seed_w,get_local_id(0));
215

216
   uint jsr=rotl(seed_z,get_local_id(0));
217
   uint jcong=rotr(seed_w,get_local_id(0));
218

219
   for (ulong i=0;i<iterations;i++) {
220

221
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=(uint)1;
222
   }
223

224

225
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
226
}
227

228
__kernel void MainLoopHybrid(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
229
{
230
   uint z=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
231
   uint w=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
232

233
   uint jsr=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
234
   uint jcong=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
235

236
   for (ulong i=0;i<iterations;i++) {
237

238
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
239

240
      barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
241
   }
242

243
      
244
}
245
"""
246

    
247
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
248

    
249
  # Avec PyCUDA autoinit, rien a faire !
250
  
251
  circleCU = cuda.InOut(circle)
252
  
253
  mod = SourceModule(KERNEL_CODE_CUDA)
254

    
255
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
256
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
257
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
258
  
259
  start = pycuda.driver.Event()
260
  stop = pycuda.driver.Event()
261
  
262
  MySplutter=numpy.zeros(steps)
263
  MyDuration=numpy.zeros(steps)
264

    
265
  if iterations%jobs==0:
266
    iterationsCL=numpy.uint64(iterations/jobs)
267
  else:
268
    iterationsCL=numpy.uint64(iterations/jobs+1)
269
    
270
  iterationsNew=iterationsCL*jobs
271

    
272
  for i in range(steps):
273

    
274
    Splutter=numpy.zeros(1024).astype(numpy.uint32)
275
    
276
    print Splutter
277

    
278
    SplutterCU = cuda.InOut(Splutter)
279

    
280
    start.record()
281
    start.synchronize()
282
    if ParaStyle=='Blocks':
283
      MetropolisBlocksCU(SplutterCU,
284
                         numpy.uint32(len(Splutter)),
285
                         numpy.uint64(iterationsCL),
286
                         numpy.uint32(nprnd(2**30/jobs)),
287
                         numpy.uint32(nprnd(2**30/jobs)),
288
                         grid=(jobs,1),
289
                         block=(1,1,1))
290
        
291
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
292
            (Alu,jobs,1,ParaStyle)      
293
    elif ParaStyle=='Hybrid':
294
      threads=BestThreadsNumber(jobs)
295
      MetropolisHybridCU(SplutterCU,
296
                         numpy.uint32(len(Splutter)),
297
                         numpy.uint64(iterationsCL),
298
                         numpy.uint32(nprnd(2**30/jobs)),
299
                         numpy.uint32(nprnd(2**30/jobs)),
300
                         grid=(jobs,1),
301
                         block=(threads,1,1))
302
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
303
            (Alu,jobs/threads,threads,ParaStyle)
304
    else:
305
      MetropolisJobsCU(SplutterCU,
306
                       numpy.uint32(len(Splutter)),
307
                       numpy.uint64(iterationsCL),
308
                       numpy.uint32(nprnd(2**30/jobs)),
309
                       numpy.uint32(nprnd(2**30/jobs)),
310
                       grid=(1,1),
311
                       block=(jobs,1,1))
312
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
313
            (Alu,jobs,1,ParaStyle)
314
    stop.record()
315
    stop.synchronize()
316
                
317
    elapsed = start.time_till(stop)*1e-3
318

    
319
    print Splutter,sum(Splutter)
320
    MySplutter[i]=numpy.median(Splutter)
321
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)
322

    
323
    MyDuration[i]=elapsed
324

    
325
    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
326
    #MyPi[i]=numpy.median(AllPi)
327
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]
328

    
329

    
330
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
331

    
332
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
333

    
334

    
335
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
336
        
337
  # Initialisation des variables en les CASTant correctement
338

    
339
  MaxMemoryXPU=0
340
  MinMemoryXPU=0
341

    
342
  if Device==0:
343
    print "Enter XPU selector based on ALU type: first selected"
344
    HasXPU=False
345
    # Default Device selection based on ALU Type
346
    for platform in cl.get_platforms():
347
      for device in platform.get_devices():
348
        deviceType=cl.device_type.to_string(device.type)
349
        deviceMemory=device.max_mem_alloc_size
350
        if deviceMemory>MaxMemoryXPU:
351
          MaxMemoryXPU=deviceMemory
352
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
353
          MinMemoryXPU=deviceMemory
354
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
355
          XPU=device
356
          print "GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
357
          HasXPU=True
358
          MemoryXPU=deviceMemory
359
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
360
          XPU=device
361
          print "CPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
362
          HasXPU=True
363
          MemoryXPU=deviceMemory
364
          
365
  else:
366
    print "Enter XPU selector based on device number & ALU type"
367
    Id=1
368
    HasXPU=False
369
    # Primary Device selection based on Device Id
370
    for platform in cl.get_platforms():
371
      for device in platform.get_devices():
372
        deviceType=cl.device_type.to_string(device.type)
373
        deviceMemory=device.max_mem_alloc_size
374
        if deviceMemory>MaxMemoryXPU:
375
          MaxMemoryXPU=deviceMemory
376
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
377
          MinMemoryXPU=deviceMemory
378
        if Id==Device and Alu==deviceType and HasXPU==False:
379
          XPU=device
380
          print "CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
381
          HasXPU=True
382
          MemoryXPU=deviceMemory
383
        Id=Id+1
384
    if HasXPU==False:
385
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
386
          (Device,Alu,Id-1)
387
      return(0,0,0)
388

    
389
  print "Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU)
390

    
391
  # Je cree le contexte et la queue pour son execution
392
  ctx = cl.Context([XPU])
393
  queue = cl.CommandQueue(ctx,
394
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
395

    
396
  # Je recupere les flag possibles pour les buffers
397
  mf = cl.mem_flags
398
        
399
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
400
  
401
  MyDuration=numpy.zeros(steps)
402
  
403
  if iterations%jobs==0:
404
    iterationsCL=numpy.uint64(iterations/jobs)
405
  else:
406
    iterationsCL=numpy.uint64(iterations/jobs+1)
407

    
408
  iterationsNew=numpy.uint64(iterationsCL*jobs)
409

    
410
  MySplutter=numpy.zeros(steps)
411

    
412
  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
413
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))
414

    
415
  
416
  Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
417

    
418
  for i in range(steps):
419
                
420
    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
421
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)
422
 
423
    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)
424

    
425
    print Splutter,len(Splutter)
426

    
427
    Splutter[:]=0
428

    
429
    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)
430

    
431
    if ParaStyle=='Blocks':
432
      # Call OpenCL kernel
433
      # (1,) is Global work size (only 1 work size)
434
      # (1,) is local work size
435
      # circleCL is lattice translated in CL format
436
      # SeedZCL is lattice translated in CL format
437
      # SeedWCL is lattice translated in CL format
438
      # step is number of iterations
439
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
440
      #                                      SplutterCL,
441
      #                                      numpy.uint32(len(Splutter)),
442
      #                                      numpy.uint64(iterationsCL),
443
      #                                      numpy.uint32(nprnd(2**30/jobs)),
444
      #                                      numpy.uint32(nprnd(2**30/jobs)))
445
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
446
                                           SplutterCL,
447
                                           numpy.uint32(len(Splutter)),
448
                                           numpy.uint64(iterationsCL),
449
                                           numpy.uint32(521288629),
450
                                           numpy.uint32(362436069))
451
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
452
            (Alu,jobs,1,ParaStyle)
453
    elif ParaStyle=='Hybrid':
454
      threads=BestThreadsNumber(jobs)
455
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
456
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
457
                                           SplutterCL,
458
                                           numpy.uint32(len(Splutter)),
459
                                           numpy.uint64(iterationsCL),
460
                                           numpy.uint32(nprnd(2**30/jobs)),
461
                                           numpy.uint32(nprnd(2**30/jobs)))
462
        
463
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
464
            (Alu,jobs/threads,threads,ParaStyle)
465
    else:
466
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
467
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
468
                                          SplutterCL,
469
                                          numpy.uint32(len(Splutter)),
470
                                          numpy.uint64(iterationsCL),
471
                                          numpy.uint32(nprnd(2**30/jobs)),
472
                                          numpy.uint32(nprnd(2**30/jobs)))
473
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
474

    
475
    CLLaunch.wait()
476
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()
477

    
478
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
479

    
480
    MyDuration[i]=elapsed
481
    #print Splutter,sum(Splutter)
482
    #MySplutter[i]=numpy.median(Splutter)
483
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)
484

    
485
    SplutterCL.release()
486

    
487
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
488
        
489
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
490

    
491

    
492
def FitAndPrint(N,D,Curves):
493

    
494
  from scipy.optimize import curve_fit
495
  import matplotlib.pyplot as plt
496

    
497
  try:
498
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
499

    
500
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
501
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
502
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
503
    coeffs_Amdahl[0]=D[0]
504
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
505
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
506
  except:
507
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
508

    
509
  try:
510
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
511

    
512
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
513
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
514
    coeffs_AmdahlR[0]=D[0]
515
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
516
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
517

    
518
  except:
519
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
520

    
521
  try:
522
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
523

    
524
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
525
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
526
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
527
    coeffs_Mylq[0]=D[0]
528
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
529
                                                            coeffs_Mylq[1],
530
                                                            coeffs_Mylq[3],
531
                                                            coeffs_Mylq[2])
532
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
533
                coeffs_Mylq[3])
534
  except:
535
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
536

    
537
  try:
538
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
539

    
540
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
541
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
542
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
543
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
544
    coeffs_Mylq2[0]=D[0]
545
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
546
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
547
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
548

    
549
  except:
550
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
551

    
552
  if Curves:
553
    plt.xlabel("Number of Threads/work Items")
554
    plt.ylabel("Total Elapsed Time")
555

    
556
    Experience,=plt.plot(N,D,'ro') 
557
    try:
558
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
559
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
560
    except:
561
      print "Fit curves seem not to be available"
562

    
563
    plt.legend()
564
    plt.show()
565

    
566
if __name__=='__main__':
567

    
568
  # Set defaults values
569
  
570
  # Alu can be CPU, GPU or ACCELERATOR
571
  Alu='CPU'
572
  # Id of GPU : 1 is for first find !
573
  Device=0
574
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
575
  GpuStyle='OpenCL'
576
  # Parallel distribution can be on Threads or Blocks
577
  ParaStyle='Blocks'
578
  # Iterations is integer
579
  Iterations=100000000
580
  # JobStart in first number of Jobs to explore
581
  JobStart=1
582
  # JobEnd is last number of Jobs to explore
583
  JobEnd=16
584
  # JobStep is the step of Jobs to explore
585
  JobStep=1
586
  # Redo is the times to redo the test to improve metrology
587
  Redo=1
588
  # OutMetrology is method for duration estimation : False is GPU inside
589
  OutMetrology=False
590
  Metrology='InMetro'
591
  # Curves is True to print the curves
592
  Curves=False
593
  # Fit is True to print the curves
594
  Fit=False
595

    
596
  try:
597
    opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
598
  except getopt.GetoptError:
599
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
600
    sys.exit(2)
601
    
602
  for opt, arg in opts:
603
    if opt == '-h':
604
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
605

    
606
      print "\nInformations about devices detected under OpenCL:"
607
      # For PyOpenCL import
608
      try:
609
        import pyopencl as cl
610
        Id=1
611
        for platform in cl.get_platforms():
612
          for device in platform.get_devices():
613
            deviceType=cl.device_type.to_string(device.type)
614
            deviceMemory=device.max_mem_alloc_size
615
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
616
            Id=Id+1
617

    
618
        print
619
        sys.exit()
620
      except ImportError:
621
        print "Your platform does not seem to support OpenCL"
622
        
623
    elif opt == '-o':
624
      OutMetrology=True
625
      Metrology='OutMetro'
626
    elif opt == '-c':
627
      Curves=True
628
    elif opt == '-f':
629
      Fit=True
630
    elif opt in ("-a", "--alu"):
631
      Alu = arg
632
    elif opt in ("-d", "--device"):
633
      Device = int(arg)
634
    elif opt in ("-g", "--gpustyle"):
635
      GpuStyle = arg
636
    elif opt in ("-p", "--parastyle"):
637
      ParaStyle = arg
638
    elif opt in ("-i", "--iterations"):
639
      Iterations = numpy.uint64(arg)
640
    elif opt in ("-s", "--jobstart"):
641
      JobStart = int(arg)
642
    elif opt in ("-e", "--jobend"):
643
      JobEnd = int(arg)
644
    elif opt in ("-t", "--jobstep"):
645
      JobStep = int(arg)
646
    elif opt in ("-r", "--redo"):
647
      Redo = int(arg)
648

    
649
  if Alu=='CPU' and GpuStyle=='CUDA':
650
    print "Alu can't be CPU for CUDA, set Alu to GPU"
651
    Alu='GPU'
652

    
653
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
654
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
655
    ParaStyle='Threads'
656

    
657
  print "Compute unit : %s" % Alu
658
  print "Device Identification : %s" % Device
659
  print "GpuStyle used : %s" % GpuStyle
660
  print "Parallel Style used : %s" % ParaStyle
661
  print "Iterations : %s" % Iterations
662
  print "Number of threads on start : %s" % JobStart
663
  print "Number of threads on end : %s" % JobEnd
664
  print "Number of redo : %s" % Redo
665
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
666

    
667
  if GpuStyle=='CUDA':
668
    try:
669
      # For PyCUDA import
670
      import pycuda.driver as cuda
671
      import pycuda.gpuarray as gpuarray
672
      import pycuda.autoinit
673
      from pycuda.compiler import SourceModule
674
    except ImportError:
675
      print "Platform does not seem to support CUDA"
676

    
677
  if GpuStyle=='OpenCL':
678
    try:
679
      # For PyOpenCL import
680
      import pyopencl as cl
681
      Id=1
682
      for platform in cl.get_platforms():
683
        for device in platform.get_devices():
684
          deviceType=cl.device_type.to_string(device.type)
685
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
686
          if Id == Device:
687
            # Set the Alu as detected Device Type
688
            Alu=deviceType
689
          Id=Id+1
690
    except ImportError:
691
      print "Platform does not seem to support CUDA"
692
      
693
  average=numpy.array([]).astype(numpy.float32)
694
  median=numpy.array([]).astype(numpy.float32)
695
  stddev=numpy.array([]).astype(numpy.float32)
696

    
697
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
698

    
699
  Jobs=JobStart
700

    
701
  while Jobs <= JobEnd:
702
    avg,med,std=0,0,0
703
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
704
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
705

    
706
    if OutMetrology:
707
      duration=numpy.array([]).astype(numpy.float32)
708
      for i in range(Redo):
709
        start=time.time()
710
        if GpuStyle=='CUDA':
711
          try:
712
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
713
          except:
714
            print "Problem with %i // computations on Cuda" % Jobs
715
        elif GpuStyle=='OpenCL':
716
          try:
717
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
718
                                   Alu,Device)
719
          except:
720
            print "Problem with %i // computations on OpenCL" % Jobs            
721
        duration=numpy.append(duration,time.time()-start)
722
      if (a,m,s) != (0,0,0):
723
        avg=numpy.mean(duration)
724
        med=numpy.median(duration)
725
        std=numpy.std(duration)
726
      else:
727
        print "Values seem to be wrong..."
728
    else:
729
      if GpuStyle=='CUDA':
730
        try:
731
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
732
        except:
733
          print "Problem with %i // computations on Cuda" % Jobs
734
      elif GpuStyle=='OpenCL':
735
        try:
736
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
737
        except:
738
          print "Problem with %i // computations on OpenCL" % Jobs            
739

    
740
    if (avg,med,std) != (0,0,0):
741
      print "jobs,avg,med,std",Jobs,avg,med,std
742
      average=numpy.append(average,avg)
743
      median=numpy.append(median,med)
744
      stddev=numpy.append(stddev,std)
745
    else:
746
      print "Values seem to be wrong..."
747
    #THREADS*=2
748
    if len(average)!=0:
749
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
750
      ToSave=[ ExploredJobs,average,median,stddev ]
751
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
752
    Jobs+=JobStep
753

    
754
  if Fit:
755
    FitAndPrint(ExploredJobs,median,Curves)
756