Statistiques
| Révision :

root / Splutter / GPU / SplutterGPU.py @ 63

Historique | Voir | Annoter | Télécharger (30,53 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import math
23
from socket import gethostname
24

    
25
# find prime factors of a number
26
# Get for WWW :
27
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
28
def PrimeFactors(x):
  """Return the prime factorisation of x as a numpy array, ascending.

  Trial division: x is repeatedly divided by each factor found.
  Returns an empty array for x <= 1.
  """
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      # Floor division: plain '/' yields a float under true division
      # (Python 3 / __future__), which would corrupt the loop test.
      x//=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
38

    
39
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
40
# output is thread number
41
def BestThreadsNumber(jobs):
  """Pick a thread count for the Hybrid (Blocks&Threads) launch geometry.

  Multiplies prime factors of 'jobs' (taken alternately from the smallest
  and largest end) until the product exceeds sqrt(jobs), so the returned
  thread count divides 'jobs' and is close to its square root.
  """
  factors=PrimeFactors(jobs)
  # Pair the ascending factor list with its reverse so the ravel()
  # alternates small and large factors.
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  # int() instead of Python-2-only long(): identical on Python 2
  # (ints auto-promote) and portable to Python 3.
  return(int(threads))
50

    
51
# Predicted Amdahl Law (Reduced with s=1-p)  
52
# Predicted Amdahl law, reduced form: serial fraction is s = 1 - p
def AmdahlR(N, T1, p):
  """Reduced Amdahl model: T(N) = T1 * ((1 - p) + p / N)."""
  serial_part = 1 - p
  parallel_part = p / N
  return T1 * (serial_part + parallel_part)
54

    
55
# Predicted Amdahl Law
56
# Predicted Amdahl law (independent serial and parallel fractions)
def Amdahl(N, T1, s, p):
  """Amdahl model: T(N) = T1 * (s + p / N)."""
  speedup_term = s + p / N
  return T1 * speedup_term
58

    
59
# Predicted Mylq Law with first order
60
# Predicted Mylq law with first-order overhead
def Mylq(N, T1, s, c, p):
  """Mylq model: Amdahl term plus a linear overhead c * N."""
  amdahl_term = T1 * (s + p / N)
  return amdahl_term + c * N
62

    
63
# Predicted Mylq Law with second order
64
# Predicted Mylq law with second-order overhead
def Mylq2(N, T1, s, c1, c2, p):
  """Mylq model, 2nd order: Amdahl term plus linear and quadratic overheads."""
  base = T1 * (s + p / N)
  linear = c1 * N
  quadratic = c2 * (N * N)
  return base + linear + quadratic
66

    
67
KERNEL_CODE_CUDA="""
68

69
// Marsaglia RNG very simple implementation
70

71
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
72
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
73

74
#define MWC   (znew+wnew)
75
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
76
#define CONG  (jcong=69069*jcong+1234567)
77
#define KISS  ((MWC^CONG)+SHR3)
78

79
#define CONGfp CONG * 2.328306435454494e-10f
80
#define SHR3fp SHR3 * 2.328306435454494e-10f
81
#define MWCfp MWC * 2.328306435454494e-10f
82
#define KISSfp KISS * 2.328306435454494e-10f
83

84
#define MAX (ulong)4294967296
85

86
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
87
{
88
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
89
    const ulong size=(ulong)(gridDim.x*blockDim.x);
90
    const ulong block=(ulong)space/(ulong)size;
91
   
92
    uint z=seed_z-(uint)id;
93
    uint w=seed_w+(uint)id;
94

95
    uint jsr=seed_z;
96
    uint jcong=seed_w;
97

98
   for ( ulong i=0;i<iterations;i++) {
99

100
      // Dense version
101
       uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX );
102

103
      s[position]++;
104
   }
105

106
   __syncthreads();
107
}
108

109
__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
110
{ 
111
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
112
    const ulong size=(ulong)(gridDim.x*blockDim.x);
113
    const ulong block=(ulong)space/(ulong)size;
114
   
115
    uint z=seed_z-(uint)id;
116
    uint w=seed_w+(uint)id;
117

118
    uint jsr=seed_z;
119
    uint jcong=seed_w;
120

121
   for ( ulong i=0;i<iterations;i++) {
122

123
      // Sparse version
124
       uint position=(uint)( (ulong)MWC*block/MAX*size+id );
125

126
      s[position]++;
127
   }
128

129
   __syncthreads();
130
}
131

132
__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
133
{
134
    const ulong id=(ulong)(threadIdx.x);
135
    const ulong size=(ulong)(blockDim.x);
136
    const ulong block=(ulong)space/(ulong)size;
137
   
138
    uint z=seed_z-(uint)id;
139
    uint w=seed_w+(uint)id;
140

141
    uint jsr=seed_z;
142
    uint jcong=seed_w;
143

144
   for ( ulong i=0;i<iterations;i++) {
145

146
      // Dense version
147
       size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
148

149
      s[position]++;
150
   }
151

152

153
   __syncthreads();
154

155
}
156

157
__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
158
{
159
    const ulong id=(ulong)threadIdx.x;
160
    const ulong size=(ulong)blockDim.x;
161
    const ulong block=(ulong)space/(ulong)size;
162
   
163
    uint z=seed_z-(uint)id;
164
    uint w=seed_w+(uint)id;
165

166
    uint jsr=seed_z;
167
    uint jcong=seed_w;
168

169
   for ( ulong i=0;i<iterations;i++) {
170

171
      // Sparse version
172
       size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
173

174
      s[position]++;
175
   }
176

177
   __syncthreads();
178

179
}
180

181
__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
182
{
183
    const ulong id=(ulong)(blockIdx.x);
184
    const ulong size=(ulong)(gridDim.x);
185
    const ulong block=(ulong)space/(ulong)size;
186
   
187
    uint z=seed_z-(uint)id;
188
    uint w=seed_w+(uint)id;
189

190
    uint jsr=seed_z;
191
    uint jcong=seed_w;
192

193
   for ( ulong i=0;i<iterations;i++) {
194

195
      // Dense version
196
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
197

198
      s[position]++;
199
   }
200
      
201
   __syncthreads();
202
}
203

204
__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
205
{
206
    const ulong id=(ulong)(blockIdx.x);
207
    const ulong size=(ulong)(gridDim.x);
208
    const ulong block=(ulong)space/(ulong)size;
209
   
210
    uint z=seed_z-(uint)id;
211
    uint w=seed_w+(uint)id;
212

213
    uint jsr=seed_z;
214
    uint jcong=seed_w;
215

216
   for ( ulong i=0;i<iterations;i++) {
217

218
      // Sparse version
219
      size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id );
220

221
      s[position]++;
222

223
   }
224

225
   //s[blockIdx.x]=blockIdx.x;
226
   __syncthreads();
227
}
228

229
"""
230

    
231
KERNEL_CODE_OPENCL="""
232
// Marsaglia RNG very simple implementation
233
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
234
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
235

236
#define MWC   (znew+wnew)
237
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
238
#define CONG  (jcong=69069*jcong+1234567)
239
#define KISS  ((MWC^CONG)+SHR3)
240

241
#define CONGfp CONG * 2.328306435454494e-10f
242
#define SHR3fp SHR3 * 2.328306435454494e-10f
243
#define MWCfp MWC * 2.328306435454494e-10f
244
#define KISSfp KISS * 2.328306435454494e-10f
245

246
#define MAX (ulong)4294967296
247

248
uint rotl(uint value, int shift) {
249
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
250
}
251
 
252
uint rotr(uint value, int shift) {
253
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
254
}
255

256
__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
257
{
258
   __private const ulong id=(ulong)get_global_id(0);
259
   __private const ulong size=(ulong)get_global_size(0);
260
   __private const ulong block=(ulong)space/(ulong)size;
261
   
262
   __private uint z=seed_z-(uint)id;
263
   __private uint w=seed_w+(uint)id;
264

265
   __private uint jsr=seed_z;
266
   __private uint jcong=seed_w;
267

268
   for (__private ulong i=0;i<iterations;i++) {
269

270
      // Dense version
271
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
272

273
      s[position]++;
274
   }
275

276
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
277

278
}
279

280
__kernel void SplutterGlobalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
281
{
282
   __private const ulong id=(ulong)get_global_id(0);
283
   __private const ulong size=(ulong)get_global_size(0);
284
   __private const ulong block=(ulong)space/(ulong)size;
285
   
286
   __private uint z=seed_z-(uint)id;
287
   __private uint w=seed_w+(uint)id;
288

289
   __private uint jsr=seed_z;
290
   __private uint jcong=seed_w;
291

292
   for (__private ulong i=0;i<iterations;i++) {
293

294
      // Sparse version
295
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
296

297
      s[position]++;
298
   }
299

300
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
301

302
}
303

304
__kernel void SplutterLocalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
305
{
306
   __private const ulong id=(ulong)get_local_id(0);
307
   __private const ulong size=(ulong)get_local_size(0);
308
   __private const ulong block=(ulong)space/(ulong)size;
309
   
310
   __private uint z=seed_z-(uint)id;
311
   __private uint w=seed_w+(uint)id;
312

313
   __private uint jsr=seed_z;
314
   __private uint jcong=seed_w;
315

316
   for (__private ulong i=0;i<iterations;i++) {
317

318
      // Dense version
319
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
320

321
      s[position]++;
322
   }
323

324
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
325

326
}
327

328
__kernel void SplutterLocalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
329
{
330
   __private const ulong id=(ulong)get_local_id(0);
331
   __private const ulong size=(ulong)get_local_size(0);
332
   __private const ulong block=(ulong)space/(ulong)size;
333
   
334
   __private uint z=seed_z-(uint)id;
335
   __private uint w=seed_w+(uint)id;
336

337
   __private uint jsr=seed_z;
338
   __private uint jcong=seed_w;
339

340
   for (__private ulong i=0;i<iterations;i++) {
341

342
      // Sparse version
343
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
344

345
      s[position]++;
346
   }
347

348
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
349

350
}
351

352
__kernel void SplutterHybridDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
353
{
354
   __private const ulong id=(ulong)(get_global_id(0));
355
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
356
   __private const ulong block=(ulong)space/(ulong)size;
357
   
358
   __private uint z=seed_z-(uint)id;
359
   __private uint w=seed_w+(uint)id;
360

361
   __private uint jsr=seed_z;
362
   __private uint jcong=seed_w;
363

364
   for (__private ulong i=0;i<iterations;i++) {
365

366
      // Dense version
367
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
368

369
      s[position]++;
370
   }
371
      
372
}
373

374
__kernel void SplutterHybridSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
375
{
376
   __private const ulong id=(ulong)(get_global_id(0));
377
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
378
   __private const ulong block=(ulong)space/(ulong)size;
379
   
380
   __private uint z=seed_z-(uint)id;
381
   __private uint w=seed_w+(uint)id;
382

383
   __private uint jsr=seed_z;
384
   __private uint jcong=seed_w;
385

386
   for (__private ulong i=0;i<iterations;i++) {
387

388
      // Sparse version
389
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
390

391
      s[position]++;
392
   }
393
      
394
}
395

396
"""
397

    
398
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Dense):
  """Run the splutter benchmark on CUDA and return duration statistics.

  circle     -- numpy array supplied by the caller (wrapped below but never
                passed to a kernel; see NOTE).
  iterations -- total increment count, split over the 'jobs' workers.
  steps      -- number of timed repetitions.
  jobs       -- number of parallel workers (blocks and/or threads).
  ParaStyle  -- 'Blocks', 'Hybrid' or anything else (treated as 'Threads').
  Dense      -- True selects the Dense kernels, False the Sparse ones.

  Returns (mean, median, stddev) of per-step elapsed times in seconds.

  NOTE(review): relies on names from __main__ scope: cuda, pycuda,
  SourceModule (imported there when GpuStyle=='CUDA') and the global Alu
  used in progress messages -- confirm before reusing as a library.
  """

  # With PyCUDA autoinit, nothing more to do for context setup!

  # NOTE(review): circleCU is created but never used afterwards.
  circleCU = cuda.InOut(circle)

  # Debug trace left by the author.
  print "prout"

  # Compile the CUDA kernels at run time.
  mod = SourceModule(KERNEL_CODE_CUDA)

  print "prout 2"

  # Pick the Dense or Sparse variant of each launch geometry.
  if Dense:
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
  else:
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")

  print "prout 3"

  # GPU-side timing events.
  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MySplutter=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  # Ceiling-divide the iterations over the workers.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)

  # NOTE(review): iterationsNew is computed but never used.
  iterationsNew=iterationsCL*jobs

  # Target counter array: 16 cells per worker.
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

  for i in range(steps):

    Splutter[:]=0

    print Splutter,len(Splutter)

    # InOut wrapper: copied to device before launch, back after.
    SplutterCU = cuda.InOut(Splutter)

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      # 'jobs' blocks of a single thread each.
      MetropolisBlocksCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      # Split 'jobs' between blocks and threads (threads ~ sqrt(jobs)).
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # 'Threads': one block of 'jobs' threads.
      MetropolisThreadsCU(SplutterCU,
                       numpy.uint32(len(Splutter)),
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,1,jobs,ParaStyle)
    stop.record()
    stop.synchronize()

    # time_till returns milliseconds; convert to seconds.
    elapsed = start.time_till(stop)*1e-3

    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)

    MyDuration[i]=elapsed

    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    #MyPi[i]=numpy.median(AllPi)
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
497

    
498

    
499
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,Dense):
  """Run the splutter benchmark on OpenCL and return duration statistics.

  circle     -- numpy array supplied by the caller (unused in this body).
  iterations -- total increment count, split over the 'jobs' workers.
  steps      -- number of timed repetitions.
  jobs       -- global work size (and local size for Threads/Hybrid styles).
  ParaStyle  -- 'Blocks', 'Hybrid' or anything else (treated as 'Threads').
  Alu        -- 'CPU' or 'GPU': device-type filter for auto selection.
  Device     -- 0 to auto-select by Alu type, otherwise a 1-based device id.
  Dense      -- True selects the Dense kernels, False the Sparse ones.

  Returns (mean, median, stddev) of per-kernel-profile elapsed times in
  seconds, or (0,0,0) when no matching device is found.

  NOTE(review): 'cl' is imported in __main__ scope when GpuStyle=='OpenCL';
  this function is not importable standalone.
  """

  # Initialise the variables with correct casts.

  MaxMemoryXPU=0
  MinMemoryXPU=0

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type: first device whose
    # type matches Alu wins; min/max allocatable memory tracked on the way.
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
          XPU=device
          print "CPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
          
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id (1-based, must also
    # match the requested ALU type).
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  # NOTE(review): if Device==0 and no device matched, MemoryXPU/XPU are
  # unbound here and the next line raises NameError.
  print "Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU)

  # Create the context and the command queue (profiling enabled so the
  # kernel event carries start/end timestamps).
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Buffer flag constants.
  mf = cl.mem_flags

  # Build the kernels with fast-math options.
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")

  MyDuration=numpy.zeros(steps)

  # Ceiling-divide the iterations over the workers.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)

  # NOTE(review): iterationsNew is computed but never used.
  iterationsNew=numpy.uint64(iterationsCL*jobs)

  MySplutter=numpy.zeros(steps)

  # Largest power-of-two element count that fits the smallest device's
  # allocatable memory (4 bytes per uint32 cell).
  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))

  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
  # Target counter array: 16 cells per worker.
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

  for i in range(steps):

    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)

    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)

    Splutter[:]=0

    print Splutter,len(Splutter)

    # Device buffer initialised from the zeroed host array.
    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)

    if ParaStyle=='Blocks':
      # Call OpenCL kernel with (jobs,) global size and implementation-
      # chosen local size (None).
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
      #                                      SplutterCL,
      #                                      numpy.uint32(len(Splutter)),
      #                                      numpy.uint64(iterationsCL),
      #                                      numpy.uint32(nprnd(2**30/jobs)),
      #                                      numpy.uint32(nprnd(2**30/jobs)))
      if Dense:
        # NOTE(review): fixed seeds here, unlike the random seeds used by
        # the other branches.
        CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None,
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(521288629),
                                                  numpy.uint32(362436069))
      else:
        CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None,
                                                   SplutterCL,
                                                   numpy.uint32(len(Splutter)),
                                                   numpy.uint64(iterationsCL),
                                                   numpy.uint32(521288629),
                                                   numpy.uint32(362436069))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL the global size stays (jobs,); the local size (threads,)
      # splits it into work-groups.
      if Dense:
        CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,),
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(nprnd(2**30/jobs)),
                                                  numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,),
                                                   SplutterCL,
                                                   numpy.uint32(len(Splutter)),
                                                   numpy.uint64(iterationsCL),
                                                   numpy.uint32(nprnd(2**30/jobs)),
                                                   numpy.uint32(nprnd(2**30/jobs)))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # 'Threads': one work-group, local size equal to the global size.
      if Dense:
        CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,),
                                                 SplutterCL,
                                                 numpy.uint32(len(Splutter)),
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(nprnd(2**30/jobs)),
                                                 numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,),
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(nprnd(2**30/jobs)),
                                                  numpy.uint32(nprnd(2**30/jobs)))

      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()

    # Profiling timestamps are in nanoseconds; convert to seconds.
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    MyDuration[i]=elapsed
    print Splutter,sum(Splutter)
    #MySplutter[i]=numpy.median(Splutter)
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)

    SplutterCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
679

    
680

    
681
def FitAndPrint(N,D,Curves):
  """Fit the measured durations D(N) against the scaling laws and print them.

  N      -- array of explored job counts.
  D      -- array of measured durations (same length as N).
  Curves -- when True, plot the measurements and the available fits.

  Each fit is normalized so the printed T equals D[0]; failures are caught
  broadly and reported (curve_fit needs enough points to converge).
  NOTE(review): the bare 'except:' clauses hide any unrelated error too.
  """

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    # Evaluate the fitted curve before normalizing the coefficients.
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    # Normalize s (index 1) and p (index 3); c (index 2) stays raw.
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    # Normalize s (index 1) and p (index 4); c1/c2 (indexes 2,3) stay raw.
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    # Measurements as red dots; fitted curves only if their fits succeeded.
    Experience,=plt.plot(N,D,'ro') 
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()
754

    
755
if __name__=='__main__':

  # Benchmark driver: parse options, pick the backend (CUDA or OpenCL),
  # sweep the number of jobs from JobStart to JobEnd and save the timing
  # statistics to .npz/.txt files; optionally fit and plot the results.

  # Set defaults values

  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Id of GPU : 1 is for first find !  (0 means auto-select by Alu type)
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Blocks'
  # Iterations is integer
  Iterations=100000000
  # JobStart in first number of Jobs to explore
  JobStart=1
  # JobEnd is last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the times to redo the test to improve metrology
  Redo=1
  # OutMetrology is method for duration estimation : False is GPU inside
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to print the curves
  Curves=False
  # Fit is True to print the curves
  Fit=False
  # Spluttering is Dense by default
  Dense=True

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hocfvwa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law)  -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]

      print "\nInformations about devices detected under OpenCL:"
      # For PyOpenCL import: enumerate all devices, then exit.
      try:
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
          for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            deviceMemory=device.max_mem_alloc_size
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
            Id=Id+1

        print
        sys.exit()
      except ImportError:
        print "Your platform does not seem to support OpenCL"

    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-c':
      Curves=True
    elif opt == '-v':
      Dense=True
    elif opt == '-w':
      Dense=False
    elif opt == '-f':
      Fit=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

  # CUDA has no CPU devices: force GPU.
  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Dense (or Sparse) Spluttering : %r" % Dense
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import: these names are used by MetropolisCuda.
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import: 'cl' is used by MetropolisOpenCL.
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
          if Id == Device:
            # Set the Alu as detected Device Type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      # NOTE(review): message says CUDA but this is the OpenCL branch.
      print "Platform does not seem to support CUDA"

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      # Out-of-core metrology: time each whole call from the host side.
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            print "toto"
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Dense)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device,Dense)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs            
        duration=numpy.append(duration,time.time()-start)
      # NOTE(review): (a,m,s) may be unbound here if every call raised.
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print "Values seem to be wrong..."
    else:
      # In-core metrology: durations measured by device events/profiling.
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,Dense)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,Dense)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs            

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    # Save intermediate results after every explored job count.
    if len(average)!=0:
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)
953