# Source: root / Splutter / GPU / SplutterGPU.py @ revision 68
# (repository viewer header: Statistics | Revision | History | View | Annotate | Download, 27.62 KB)

1
#!/usr/bin/env python
2

    
3
#
4
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# http://mathema.tician.de/software/pyopencl
11
# 
12

    
13
# 2013-01-01 : problems with launch timeout
14
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
15
# Option "Interactive" "0" in /etc/X11/xorg.conf
16

    
17
# Marsaglia elements about RNG 
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import math
26
from socket import gethostname
27

    
28
# find prime factors of a number
29
# Get for WWW :
30
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
31
def PrimeFactors(x):
  """Return the prime factors of x, with multiplicity, as a numpy array.

  Trial division: each candidate factor is divided out repeatedly before
  moving on, so repeated factors appear several times (e.g. 12 -> [2, 2, 3]).
  For x < 2 the returned array is empty.
  Adapted from:
  http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
  """
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      # Floor division keeps x an integer: plain '/' would yield a float
      # under Python 3 and break the '%' test.  Result is identical for
      # integer x under Python 2.
      x//=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
41
  
42
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
43
# output is thread number
44
def BestThreadsNumber(jobs):
  """Return a thread count for the Hybrid (Blocks&Threads) launch.

  Builds the prime factors of ``jobs`` interleaved smallest/largest
  (via the two-row matrix transposed and ravelled) and multiplies them
  until the running product squared exceeds ``jobs`` — i.e. the result
  is a divisor-built value close to sqrt(jobs).
  """
  factors=PrimeFactors(jobs)
  # Row 0: factors ascending; row 1: descending.  transpose().ravel()
  # then yields them alternating small/large.
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  # int() instead of the Python-2-only long(): Python 3 ints are unbounded
  # and Python 2 auto-promotes int to long, so behaviour is unchanged.
  return(int(threads))
53

    
54
# Predicted Amdahl Law (Reduced with s=1-p)  
55
# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  """Reduced Amdahl law: the serial fraction is implicitly 1-p."""
  serial_share=1-p
  parallel_share=p/N
  return T1*(serial_share+parallel_share)
57

    
58
# Predicted Amdahl Law
59
# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  """Classic Amdahl law: serial share s plus parallel share p spread over N."""
  scaling=s+p/N
  return T1*scaling
61

    
62
# Predicted Mylq Law with first order
63
# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  """Mylq law: Amdahl term plus a first-order (linear in N) overhead c*N."""
  amdahl_term=T1*(s+p/N)
  linear_overhead=c*N
  return amdahl_term+linear_overhead
65

    
66
# Predicted Mylq Law with second order
67
# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  """Mylq law extended with quadratic overhead: Amdahl term + c1*N + c2*N^2."""
  total=T1*(s+p/N)
  total=total+c1*N
  total=total+c2*N*N
  return total
69

    
70
KERNEL_CODE_CUDA="""
71

72
// Marsaglia RNG very simple implementation
73

74
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
75
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
76

77
#define MWC   (znew+wnew)
78
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
79
#define CONG  (jcong=69069*jcong+1234567)
80
#define KISS  ((MWC^CONG)+SHR3)
81

82
#define CONGfp CONG * 2.328306435454494e-10f
83
#define SHR3fp SHR3 * 2.328306435454494e-10f
84
#define MWCfp MWC * 2.328306435454494e-10f
85
#define KISSfp KISS * 2.328306435454494e-10f
86

87
#define MAX (ulong)4294967296
88
#define UMAX (uint)2147483648
89

90
__global__ void SplutterGlobal(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
91
{
92
    const ulong id=(ulong)(blockIdx.x);
93
   
94
    uint z=seed_z-(uint)id;
95
    uint w=seed_w+(uint)id;
96

97
    uint jsr=seed_z;
98
    uint jcong=seed_w;
99

100
   for ( ulong i=0;i<iterations;i++) {
101

102
      // All version
103
      uint position=(uint)( ((ulong)MWC*(ulong)space)/MAX );
104

105
      // UMAX is set to avoid round over overflow
106
      atomicInc(&s[position],UMAX);
107
   }
108

109
   __syncthreads();
110
}
111

112
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
113
{
114
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
115
    const ulong size=(ulong)(gridDim.x*blockDim.x);
116
    const ulong block=(ulong)space/(ulong)size;
117
   
118
    uint z=seed_z-(uint)id;
119
    uint w=seed_w+(uint)id;
120

121
    uint jsr=seed_z;
122
    uint jcong=seed_w;
123

124
   for ( ulong i=0;i<iterations;i++) {
125

126
      // Dense version
127
       uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX );
128

129
      s[position]++;
130
   }
131

132
   __syncthreads();
133
}
134

135
__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
136
{ 
137
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
138
    const ulong size=(ulong)(gridDim.x*blockDim.x);
139
    const ulong block=(ulong)space/(ulong)size;
140
   
141
    uint z=seed_z-(uint)id;
142
    uint w=seed_w+(uint)id;
143

144
    uint jsr=seed_z;
145
    uint jcong=seed_w;
146

147
   for ( ulong i=0;i<iterations;i++) {
148

149
      // Sparse version
150
       uint position=(uint)( (ulong)MWC*block/MAX*size+id );
151

152
      s[position]++;
153
   }
154

155
   __syncthreads();
156
}
157

158
__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
159
{
160
    const ulong id=(ulong)(threadIdx.x);
161
    const ulong size=(ulong)(blockDim.x);
162
    const ulong block=(ulong)space/(ulong)size;
163
   
164
    uint z=seed_z-(uint)id;
165
    uint w=seed_w+(uint)id;
166

167
    uint jsr=seed_z;
168
    uint jcong=seed_w;
169

170
   for ( ulong i=0;i<iterations;i++) {
171

172
      // Dense version
173
       size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
174

175
      s[position]++;
176
   }
177

178

179
   __syncthreads();
180

181
}
182

183
__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
184
{
185
    const ulong id=(ulong)threadIdx.x;
186
    const ulong size=(ulong)blockDim.x;
187
    const ulong block=(ulong)space/(ulong)size;
188
   
189
    uint z=seed_z-(uint)id;
190
    uint w=seed_w+(uint)id;
191

192
    uint jsr=seed_z;
193
    uint jcong=seed_w;
194

195
   for ( ulong i=0;i<iterations;i++) {
196

197
      // Sparse version
198
       size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
199

200
      s[position]++;
201
   }
202

203
   __syncthreads();
204

205
}
206

207
__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
208
{
209
    const ulong id=(ulong)(blockIdx.x);
210
    const ulong size=(ulong)(gridDim.x);
211
    const ulong block=(ulong)space/(ulong)size;
212
   
213
    uint z=seed_z-(uint)id;
214
    uint w=seed_w+(uint)id;
215

216
    uint jsr=seed_z;
217
    uint jcong=seed_w;
218

219
   for ( ulong i=0;i<iterations;i++) {
220

221
      // Dense version
222
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
223

224
      s[position]++;
225
   }
226
      
227
   __syncthreads();
228
}
229

230
__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
231
{
232
    const ulong id=(ulong)(blockIdx.x);
233
    const ulong size=(ulong)(gridDim.x);
234
    const ulong block=(ulong)space/(ulong)size;
235
   
236
    uint z=seed_z-(uint)id;
237
    uint w=seed_w+(uint)id;
238

239
    uint jsr=seed_z;
240
    uint jcong=seed_w;
241

242
   for ( ulong i=0;i<iterations;i++) {
243

244
      // Sparse version
245
      size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id );
246

247
      s[position]++;
248

249
   }
250

251
   //s[blockIdx.x]=blockIdx.x;
252
   __syncthreads();
253
}
254

255
"""
256

    
257
KERNEL_CODE_OPENCL="""
258
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
259

260
// Marsaglia RNG very simple implementation
261
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
262
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
263

264
#define MWC   (znew+wnew)
265
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
266
#define CONG  (jcong=69069*jcong+1234567)
267
#define KISS  ((MWC^CONG)+SHR3)
268

269
#define CONGfp CONG * 2.328306435454494e-10f
270
#define SHR3fp SHR3 * 2.328306435454494e-10f
271
#define MWCfp MWC * 2.328306435454494e-10f
272
#define KISSfp KISS * 2.328306435454494e-10f
273

274
#define MAX (ulong)4294967296
275

276
uint rotl(uint value, int shift) {
277
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
278
}
279
 
280
uint rotr(uint value, int shift) {
281
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
282
}
283

284
__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
285
{
286
   __private const ulong id=(ulong)get_global_id(0);
287
   
288
   __private uint z=seed_z-(uint)id;
289
   __private uint w=seed_w+(uint)id;
290

291
   __private uint jsr=seed_z;
292
   __private uint jcong=seed_w;
293

294
   for (__private ulong i=0;i<iterations;i++) {
295

296
      // Dense version
297
      __private size_t position=(size_t)( MWC%space );
298

299
      atomic_inc(&s[position]);
300
   }
301

302
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
303

304
}
305

306
__kernel void SplutterLocal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
307
{
308
   __private const ulong id=(ulong)get_local_id(0);
309
   
310
   __private uint z=seed_z-(uint)id;
311
   __private uint w=seed_w+(uint)id;
312

313
   __private uint jsr=seed_z;
314
   __private uint jcong=seed_w;
315

316
   for (__private ulong i=0;i<iterations;i++) {
317

318
      // Dense version
319
      //__private size_t position=(size_t)( (MWC+id*block)%space );
320
      __private size_t position=(size_t)( MWC%space );
321

322
      atomic_inc(&s[position]);
323
   }
324

325
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
326

327
}
328

329
__kernel void SplutterHybrid(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
330
{
331
   __private const ulong id=(ulong)(get_global_id(0)+get_local_id(0));
332
   
333
   __private uint z=seed_z-(uint)id;
334
   __private uint w=seed_w+(uint)id;
335

336
   __private uint jsr=seed_z;
337
   __private uint jcong=seed_w;
338

339
   for (__private ulong i=0;i<iterations;i++) {
340

341
      // Dense version
342
      __private size_t position=(size_t)( MWC%space );
343

344
      atomic_inc(&s[position]);
345
   }
346
      
347
}
348

349
"""
350

    
351
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density,Memory):
  """Run the splutter kernels under PyCUDA and time them.

  Compiles KERNEL_CODE_CUDA, selects the Dense/Sparse/plain kernel set
  according to Density, launches it 'steps' times according to ParaStyle
  ('Blocks', 'Hybrid' or threads otherwise), and returns
  (mean, median, std) of the per-step elapsed times in seconds.

  NOTE(review): callers in __main__ pass only 6 positional arguments
  (no Density) to this 7-parameter function — the resulting TypeError is
  swallowed by the callers' bare except; confirm the intended Density value.
  NOTE(review): 'Alu' and the pycuda/cuda names are module-level globals
  imported in __main__, not parameters of this function.
  """

  # With PyCUDA autoinit, nothing more to do for device setup!

  circleCU = cuda.InOut(circle)
  
  # Compile the CUDA source at run time.
  mod = SourceModule(KERNEL_CODE_CUDA)

  # Pick the kernel trio matching the requested density layout.
  if Density=='Dense':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
  elif Density=='Sparse':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")
  else:
    # NOTE(review): only the Blocks kernel is set on this branch; the
    # Threads/Hybrid paths below would raise NameError — presumably only
    # ParaStyle=='Blocks' is expected here.
    MetropolisBlocksCU=mod.get_function("SplutterGlobal")
    
  # CUDA events used for device-side timing.
  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()
  
  MySplutter=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  # Spread iterations over jobs, rounding up when it does not divide evenly.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    
  iterationsNew=iterationsCL*jobs

  # Output histogram: 16 bins per job.
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

  for i in range(steps):

    Splutter[:]=0
    
    print Splutter,len(Splutter)

    # InOut copies the host array in before launch and back out after.
    SplutterCU = cuda.InOut(Splutter)

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      # One thread per block, 'jobs' blocks.
      MetropolisBlocksCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)      
    elif ParaStyle=='Hybrid':
      # Split jobs over blocks and threads; thread count near sqrt(jobs).
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # All work in a single block of 'jobs' threads.
      MetropolisThreadsCU(SplutterCU,
                       numpy.uint32(len(Splutter)),
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,1,jobs,ParaStyle)
    stop.record()
    stop.synchronize()
                
    # time_till returns milliseconds; convert to seconds.
    elapsed = start.time_till(stop)*1e-3

    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)

    MyDuration[i]=elapsed

    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    #MyPi[i]=numpy.median(AllPi)
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]


  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
446

    
447

    
448
def MetropolisOpenCL(circle,iterations,steps,jobs,
                     ParaStyle,Alu,Device,Memory):
  """Run the splutter kernels under PyOpenCL and time them.

  Selects an OpenCL device (by ALU type when Device==0, else by 1-based
  device index), builds KERNEL_CODE_OPENCL, launches the kernel matching
  ParaStyle 'steps' times and returns (mean, median, std) of the elapsed
  times in seconds measured via event profiling.  Returns (0,0,0) when no
  matching device is found.
  """
        
  # Initialise the variables with the correct casts.

  MaxMemoryXPU=0
  MinMemoryXPU=0

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        # Track the min/max allocable memory over all devices seen.
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
          XPU=device
          print "CPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
          
  else:
    print "Enter XPU selector based on device number & ALU type"
    # Devices are numbered from 1 across all platforms.
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  # NOTE(review): if Device==0 and no device of the requested type exists,
  # MemoryXPU/XPU are unbound here and this line raises — caught by the
  # caller's bare except.
  print "Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU)

  # Create the context and the command queue for execution.
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
  
  # Get the possible flags for buffer creation.
  mf = cl.mem_flags

  # Build the kernel source with fast-math options.
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
      
  MyDuration=numpy.zeros(steps)
  
  # Spread iterations over jobs, rounding up when not evenly divisible.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    
  iterationsNew=numpy.uint64(iterationsCL*jobs)

  MySplutter=numpy.zeros(steps)

  # Largest power-of-two element count fitting the smallest device allocation.
  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))
  
  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
  #Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
  Splutter=numpy.zeros(Memory).astype(numpy.uint32)

  for i in range(steps):
                
    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)
 
    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)

    Splutter[:]=0

    print Splutter,len(Splutter)

    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)

    if ParaStyle=='Blocks':
      # Call OpenCL kernel
      # (1,) is Global work size (only 1 work size)
      # (1,) is local work size
      # circleCL is lattice translated in CL format
      # SeedZCL is lattice translated in CL format
      # SeedWCL is lattice translated in CL format
      # step is number of iterations
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
      #                                      SplutterCL,
      #                                      numpy.uint32(len(Splutter)),
      #                                      numpy.uint64(iterationsCL),
      #                                      numpy.uint32(nprnd(2**30/jobs)),
      #                                      numpy.uint32(nprnd(2**30/jobs)))
      CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      #threads=BestThreadsNumber(jobs)
      # NOTE(review): thread count derived from a fixed 256, not from jobs.
      threads=BestThreadsNumber(256)
      print "print",threads      
      # In OpenCL the hybrid launch needs the global id set alongside the local id.
      CLLaunch=MetropolisCL.SplutterHybrid(queue,(jobs,),(threads,),
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL the local work size is set equal to the global work size here.
      CLLaunch=MetropolisCL.SplutterLocal(queue,(jobs,),(jobs,),
                                          SplutterCL,
                                          numpy.uint32(len(Splutter)),
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
        
        
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    # Copy the device histogram back to the host array.
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()

    # Event profiling timestamps are in nanoseconds; convert to seconds.
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    MyDuration[i]=elapsed
    print Splutter,sum(Splutter)
    #MySplutter[i]=numpy.median(Splutter)
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)
    
  SplutterCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
        
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
609

    
610

    
611
def FitAndPrint(N,D,Curves):
  """Fit the measured durations D(N) against the Amdahl/Mylq models.

  Fits each law with scipy's curve_fit, prints the coefficients
  normalized so the T1 term equals the measured D[0], and optionally
  (Curves=True) plots the data with the fitted Amdahl and Mylq curves.
  Each fit is wrapped in a broad try so a failed fit (e.g. too few
  points) only prints a message instead of aborting.
  """

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    # Rescale s and p so that T1 can be reported as the measured D[0].
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    # Normalize s and p; the linear overhead c (index 2) is left as fitted.
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro') 
    # D_Amdahl/D_Mylq only exist if the corresponding fits succeeded,
    # hence the inner try around the curve plots.
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()
684

    
685
if __name__=='__main__':

  # Set defaults values
  
  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Id of GPU : 1 is for first find !
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Blocks'
  # Iterations is integer
  Iterations=10000000
  # JobStart in first number of Jobs to explore
  JobStart=1
  # JobEnd is last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the times to redo the test to improve metrology
  Redo=1
  # OutMetrology is method for duration estimation : False is GPU inside
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to print the curves
  Curves=False
  # Fit is True to print the curves
  Fit=False
  # Memory of vector explored
  Memory=1024

  # Parse the command line; short flags h,o,c,f are boolean, the rest take a value.
  try:
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]
    sys.exit(2)
    
  for opt, arg in opts:
    if opt == '-h':
      # Print usage plus an inventory of OpenCL devices, then exit.
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]

      print "\nInformations about devices detected under OpenCL:"
      # For PyOpenCL import
      try:
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
          for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            deviceMemory=device.max_mem_alloc_size
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
            Id=Id+1

        print
        sys.exit()
      except ImportError:
        print "Your platform does not seem to support OpenCL"
        
    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-c':
      Curves=True
    elif opt == '-f':
      Fit=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)
    elif opt in ("-m", "--memory"):
      Memory = int(arg)

  print "Toto %s" % Alu

  # CUDA only makes sense on a GPU.
  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
    ParaStyle='Blocks'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Memory  : %s" % Memory
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
          if Id == Device:
            # Set the Alu as detected Device Type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      # NOTE(review): message says CUDA but this branch concerns OpenCL.
      print "Platform does not seem to support CUDA"
      
  # Accumulators for the statistics of each explored job count.
  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      # Time each whole call from the host side (wall clock).
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            # NOTE(review): MetropolisCuda takes 7 parameters but only 6
            # are passed (no Density); the TypeError is silently caught
            # by this bare except — confirm intended call signature.
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
                                 Memory)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device,Memory)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs            
        duration=numpy.append(duration,time.time()-start)
      # NOTE(review): a,m,s are unbound if every call above raised;
      # this comparison would then raise NameError.
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print "Values seem to be wrong..."
    else:
      # Use the device-side timing returned by the Metropolis* functions.
      if GpuStyle=='CUDA':
        try:
          # NOTE(review): same 6-vs-7 argument mismatch as above.
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,
                                     Jobs,ParaStyle,Memory)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,
                                       ParaStyle,Alu,Device,Memory)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs            

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    # Save intermediate results after every explored job count, both as
    # a .npz archive and as a plain-text table.
    if len(average)!=0:
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)
885