
root / Splutter / GPU / SplutterGPU.py @ 67


#!/usr/bin/env python

#
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA and PyOpenCL:
# http://mathema.tician.de/software/pycuda
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Marsaglia elements about RNG

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
from socket import gethostname

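# NOTE (editor): BestThreadsNumber() below relies on a PrimeFactors() helper that
# does not appear in this listing. The following is a minimal sketch, assuming a
# plain trial-division decomposition (e.g. PrimeFactors(24) -> [2,2,2,3], so that
# BestThreadsNumber(24) returns 6); replace it with the original helper if it differs.
def PrimeFactors(x):
  factorlist=numpy.array([]).astype(numpy.uint32)
  loop=2
  while loop<=x:
    if x%loop==0:
      x=x/loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop=loop+1
  return factorlist
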
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  return(long(threads))

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+p/N)+c1*N+c2*N*N)

prout="""

"""

KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

#define MAX (ulong)4294967296
#define UMAX (uint)2147483648

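// Editor's note: the *fp macros rescale a 32-bit draw by 2.328306435454494e-10f
// (roughly 1/2^32, Marsaglia's constant) to map it into [0,1). MAX is 2^32, used
// below to map MWC draws onto the storage space; UMAX (2^31) is the wrap-around
// limit handed to atomicInc.
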
__global__ void SplutterGlobal(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(blockIdx.x);

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Global version: scatter over the whole storage space
      uint position=(uint)( ((ulong)MWC*(ulong)space)/MAX );

      // UMAX caps the counter: atomicInc wraps back to 0 beyond UMAX
      atomicInc(&s[position],UMAX);
   }

   __syncthreads();
}

__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
    const ulong size=(ulong)(gridDim.x*blockDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Dense version
      uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
    const ulong size=(ulong)(gridDim.x*blockDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Sparse version
      uint position=(uint)( (ulong)MWC*block/MAX*size+id );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(threadIdx.x);
    const ulong size=(ulong)(blockDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Dense version
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)threadIdx.x;
    const ulong size=(ulong)blockDim.x;
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Sparse version
      size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(blockIdx.x);
    const ulong size=(ulong)(gridDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Dense version
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(blockIdx.x);
    const ulong size=(ulong)(gridDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Sparse version
      size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id );

      s[position]++;
   }

   //s[blockIdx.x]=blockIdx.x;
   __syncthreads();
}

"""

KERNEL_CODE_OPENCL="""
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

#define MAX (ulong)4294967296

uint rotl(uint value, int shift) {
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
}

uint rotr(uint value, int shift) {
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
}

__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)get_global_id(0);
   __private const ulong size=(ulong)get_global_size(0);
   __private const ulong block=(ulong)space/(ulong)size;

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Global version: scatter over the whole storage space
      __private size_t position=(size_t)( ((ulong)MWC*(ulong)space)/MAX );

      atomic_inc(&s[position]);
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}

__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)get_global_id(0);
   __private const ulong size=(ulong)get_global_size(0);
   __private const ulong block=(ulong)space/(ulong)size;

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Dense version
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}

__kernel void SplutterGlobalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)get_global_id(0);
   __private const ulong size=(ulong)get_global_size(0);
   __private const ulong block=(ulong)space/(ulong)size;

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Sparse version
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );

      s[position]++;
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}

__kernel void SplutterLocalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)get_local_id(0);
   __private const ulong size=(ulong)get_local_size(0);
   __private const ulong block=(ulong)space/(ulong)size;

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Dense version
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}

__kernel void SplutterLocalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)get_local_id(0);
   __private const ulong size=(ulong)get_local_size(0);
   __private const ulong block=(ulong)space/(ulong)size;

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Sparse version
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );

      s[position]++;
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}

__kernel void SplutterHybridDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)(get_global_id(0));
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
   __private const ulong block=(ulong)space/(ulong)size;

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Dense version
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

}

__kernel void SplutterHybridSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)(get_global_id(0));
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
   __private const ulong block=(ulong)space/(ulong)size;

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Sparse version
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );

      s[position]++;
   }

}

"""

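# MetropolisCuda: compiles the CUDA kernels above, picks the Dense/Sparse/Global
# variant according to Density, launches it `steps` times with the requested
# Blocks/Threads/Hybrid layout, and returns the (mean, median, stddev) of the
# measured kernel durations.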
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density):

  # With PyCUDA autoinit, nothing else to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  if Density=='Dense':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
  elif Density=='Sparse':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")
  else:
    # With the default Density ('All'), only the Blocks kernel is defined
    MetropolisBlocksCU=mod.get_function("SplutterGlobal")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MySplutter=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)

  iterationsNew=iterationsCL*jobs

  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

  for i in range(steps):

    Splutter[:]=0

    print Splutter,len(Splutter)

    SplutterCU = cuda.InOut(Splutter)

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      MetropolisThreadsCU(SplutterCU,
                       numpy.uint32(len(Splutter)),
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,1,jobs,ParaStyle)
    stop.record()
    stop.synchronize()

    elapsed = start.time_till(stop)*1e-3

    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)

    MyDuration[i]=elapsed

    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    #MyPi[i]=numpy.median(AllPi)
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

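# MetropolisOpenCL: selects an OpenCL device (by ALU type, or explicitly by id),
# builds the OpenCL kernels above, launches the Dense/Sparse/Global variant with
# the requested Blocks/Threads/Hybrid layout, and returns the (mean, median,
# stddev) of the profiled kernel durations.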
def MetropolisOpenCL(circle,iterations,steps,jobs,
                     ParaStyle,Alu,Device,Density):

  # Initialize the variables with the proper casts

  MaxMemoryXPU=0
  MinMemoryXPU=0

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:
          XPU=device
          print "CPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory

  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  print "Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU)

  # Create the context and the command queue for execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the available flags for the buffers
  mf = cl.mem_flags

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")

  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)

  iterationsNew=numpy.uint64(iterationsCL*jobs)

  MySplutter=numpy.zeros(steps)

  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))

  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

  for i in range(steps):

    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)

    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)

    Splutter[:]=0

    print Splutter,len(Splutter)

    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)

    # The Global ('All') variant is only wired for the Blocks style below;
    # Hybrid and Threads expect -y Dense or -y Sparse.
    if ParaStyle=='Blocks':
      # Call the OpenCL kernel:
      # (jobs,) is the global work size, None lets OpenCL pick the local one;
      # arguments are the splutter buffer, its length, the per-item iteration
      # count and the two RNG seeds.
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
      #                                      SplutterCL,
      #                                      numpy.uint32(len(Splutter)),
      #                                      numpy.uint64(iterationsCL),
      #                                      numpy.uint32(nprnd(2**30/jobs)),
      #                                      numpy.uint32(nprnd(2**30/jobs)))
      if Density=='Dense':
        CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None,
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(521288629),
                                                  numpy.uint32(362436069))
      elif Density=='Sparse':
        CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None,
                                                   SplutterCL,
                                                   numpy.uint32(len(Splutter)),
                                                   numpy.uint64(iterationsCL),
                                                   numpy.uint32(521288629),
                                                   numpy.uint32(362436069))

      else:
        CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
                                             SplutterCL,
                                             numpy.uint32(len(Splutter)),
                                             numpy.uint64(iterationsCL),
                                             numpy.uint32(521288629),
                                             numpy.uint32(362436069))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL, the global_id has to be kept identical to the local_id
      if Density=='Dense':
        CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,),
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(nprnd(2**30/jobs)),
                                                  numpy.uint32(nprnd(2**30/jobs)))
      elif Density=='Sparse':
        CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,),
                                                   SplutterCL,
                                                   numpy.uint32(len(Splutter)),
                                                   numpy.uint64(iterationsCL),
                                                   numpy.uint32(nprnd(2**30/jobs)),
                                                   numpy.uint32(nprnd(2**30/jobs)))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, the global_id has to be kept identical to the local_id
      if Density=='Dense':
        CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,),
                                                 SplutterCL,
                                                 numpy.uint32(len(Splutter)),
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(nprnd(2**30/jobs)),
                                                 numpy.uint32(nprnd(2**30/jobs)))
      elif Density=='Sparse':
        CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,),
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(nprnd(2**30/jobs)),
                                                  numpy.uint32(nprnd(2**30/jobs)))

      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    MyDuration[i]=elapsed
    print Splutter,sum(Splutter)
    #MySplutter[i]=numpy.median(Splutter)
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)

    SplutterCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

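# FitAndPrint: fits the measured durations D(N) to the Amdahl, reduced Amdahl,
# Mylq and second-order Mylq models defined above, prints the normalized
# coefficients, and optionally plots the experimental points together with the
# fitted Amdahl and Mylq curves.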
def FitAndPrint(N,D,Curves):

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/Work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':
805

    
806
  # Set defaults values
807
  
808
  # Alu can be CPU, GPU or ACCELERATOR
809
  Alu='CPU'
810
  # Id of GPU : 1 is for first find !
811
  Device=0
812
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
813
  GpuStyle='OpenCL'
814
  # Parallel distribution can be on Threads or Blocks
815
  ParaStyle='Blocks'
816
  # Iterations is integer
817
  Iterations=10000000
818
  # JobStart in first number of Jobs to explore
819
  JobStart=1
820
  # JobEnd is last number of Jobs to explore
821
  JobEnd=16
822
  # JobStep is the step of Jobs to explore
823
  JobStep=1
824
  # Redo is the times to redo the test to improve metrology
825
  Redo=1
826
  # OutMetrology is method for duration estimation : False is GPU inside
827
  OutMetrology=False
828
  Metrology='InMetro'
829
  # Curves is True to print the curves
830
  Curves=False
831
  # Fit is True to print the curves
832
  Fit=False
833
  # Spluttering is Dense by default
834
  Density='All'
835

    
836
  try:
837
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:y:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","density="])
838
  except getopt.GetoptError:
839
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
840
    sys.exit(2)
841
    
842
  for opt, arg in opts:
843
    if opt == '-h':
844
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law)  -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
845

    
846
      print "\nInformations about devices detected under OpenCL:"
847
      # For PyOpenCL import
848
      try:
849
        import pyopencl as cl
850
        Id=1
851
        for platform in cl.get_platforms():
852
          for device in platform.get_devices():
853
            deviceType=cl.device_type.to_string(device.type)
854
            deviceMemory=device.max_mem_alloc_size
855
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
856
            Id=Id+1
857

    
858
        print
859
        sys.exit()
860
      except ImportError:
861
        print "Your platform does not seem to support OpenCL"
862
        
863
    elif opt == '-o':
864
      OutMetrology=True
865
      Metrology='OutMetro'
866
    elif opt == '-c':
867
      Curves=True
868
    elif opt in ("-y", "--density"):
869
      Density = arg
870
    elif opt == '-f':
871
      Fit=True
872
    elif opt in ("-a", "--alu"):
873
      Alu = arg
874
    elif opt in ("-d", "--device"):
875
      Device = int(arg)
876
    elif opt in ("-g", "--gpustyle"):
877
      GpuStyle = arg
878
    elif opt in ("-p", "--parastyle"):
879
      ParaStyle = arg
880
    elif opt in ("-i", "--iterations"):
881
      Iterations = numpy.uint64(arg)
882
    elif opt in ("-s", "--jobstart"):
883
      JobStart = int(arg)
884
    elif opt in ("-e", "--jobend"):
885
      JobEnd = int(arg)
886
    elif opt in ("-t", "--jobstep"):
887
      JobStep = int(arg)
888
    elif opt in ("-r", "--redo"):
889
      Redo = int(arg)
890

    
  print "Requested ALU : %s" % Alu

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Blocks !" % ParaStyle
    ParaStyle='Blocks'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Density Spluttering : %s" % Density
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
          if Id == Device:
            # Set the Alu as detected Device Type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      print "Platform does not seem to support OpenCL"

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Density)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device,Density)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        duration=numpy.append(duration,time.time()-start)
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print "Values seem to be wrong..."
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,
                                     Jobs,ParaStyle,Density)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,
                                       Jobs,ParaStyle,Alu,Device,Density)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    if len(average)!=0:
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)
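
# Example invocations (assuming PyOpenCL and/or PyCUDA are installed):
#   python SplutterGPU.py -h
#   python SplutterGPU.py -g OpenCL -a GPU -p Blocks -y Sparse -i 10000000 -s 1 -e 16 -r 3 -c -f
#   python SplutterGPU.py -g CUDA -p Hybrid -y Dense -i 10000000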