Statistiques
| Révision :

root / Splutter / GPU / SplutterGPU.py @ 66

Historique | Voir | Annoter | Télécharger (32,47 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# http://mathema.tician.de/software/pyopencl
11
# 
12

    
13
# 2013-01-01 : problems with launch timeout
14
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
15
# Option "Interactive" "0" in /etc/X11/xorg.conf
16

    
17
# Marsaglia elements about RNG 
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import math
26
from socket import gethostname
27

    
28
# find prime factors of a number
29
# Get for WWW :
30
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
31
# find prime factors of a number
# Got from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  """Return the prime factors of x, with multiplicity, as a numpy array.

  PrimeFactors(12) -> [2, 2, 3]; PrimeFactors(1) -> [] (empty array).
  """
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      # Floor division keeps x an exact integer: the original `x/=loop`
      # is fine on Python 2 ints but produces floats (precision loss for
      # large x) on Python 3.  Results are identical for integer input.
      x//=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
41

    
42
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
43
# output is thread number
44
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
  """Return a divisor T of `jobs` whose square just exceeds `jobs`.

  Prime factors of `jobs` are consumed alternately from the small end and
  the large end (via the factor list and its reverse, interleaved), and
  multiplied into `threads` until threads*threads > jobs.
  """
  factors=PrimeFactors(jobs)
  # Two rows: factors ascending and descending; transpose+ravel interleaves
  # them so small and large factors alternate.
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  # int() replaces the Python-2-only long(): Py2 ints auto-promote to long
  # as needed, and long() does not exist on Python 3.
  return(int(threads))
53

    
54
# Predicted Amdahl Law (Reduced with s=1-p)  
55
# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  """Reduced Amdahl model: the serial fraction is implicitly 1-p."""
  serial_part=1-p
  parallel_part=p/N
  return T1*(serial_part+parallel_part)
57

    
58
# Predicted Amdahl Law
59
# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  """Amdahl model with explicit serial (s) and parallel (p) fractions."""
  scaled_fraction=s+p/N
  return T1*scaled_fraction
61

    
62
# Predicted Mylq Law with first order
63
# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  """First-order Mylq model: Amdahl term plus a linear overhead c*N."""
  amdahl_term=T1*(s+p/N)
  # Same evaluation order as the plain expression: (amdahl + c*N).
  return amdahl_term+c*N
65

    
66
# Predicted Mylq Law with second order
67
# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  """Second-order Mylq model: Amdahl term plus c1*N + c2*N^2 overheads."""
  linear=c1*N
  quadratic=c2*N*N
  # Additions kept in the original left-to-right order to preserve
  # floating-point behaviour exactly.
  return T1*(s+p/N)+linear+quadratic
69

    
70
prout="""
71

72
"""
73

    
74

    
75
KERNEL_CODE_CUDA="""
76

77
// Marsaglia RNG very simple implementation
78

79
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
80
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
81

82
#define MWC   (znew+wnew)
83
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
84
#define CONG  (jcong=69069*jcong+1234567)
85
#define KISS  ((MWC^CONG)+SHR3)
86

87
#define CONGfp CONG * 2.328306435454494e-10f
88
#define SHR3fp SHR3 * 2.328306435454494e-10f
89
#define MWCfp MWC * 2.328306435454494e-10f
90
#define KISSfp KISS * 2.328306435454494e-10f
91

92
#define MAX (ulong)4294967296
93
#define UMAX (uint)2147483648
94

95
__global__ void SplutterGlobal(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
96
{
97
    const ulong id=(ulong)(blockIdx.x);
98
   
99
    uint z=seed_z-(uint)id;
100
    uint w=seed_w+(uint)id;
101

102
    uint jsr=seed_z;
103
    uint jcong=seed_w;
104

105
   for ( ulong i=0;i<iterations;i++) {
106

107
      // All version
108
      uint position=(uint)( ((ulong)MWC*(ulong)space)/MAX );
109

110
      // UMAX is set to avoid round over overflow
111
      atomicInc(&s[position],UMAX);
112
   }
113

114
   __syncthreads();
115
}
116

117
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
118
{
119
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
120
    const ulong size=(ulong)(gridDim.x*blockDim.x);
121
    const ulong block=(ulong)space/(ulong)size;
122
   
123
    uint z=seed_z-(uint)id;
124
    uint w=seed_w+(uint)id;
125

126
    uint jsr=seed_z;
127
    uint jcong=seed_w;
128

129
   for ( ulong i=0;i<iterations;i++) {
130

131
      // Dense version
132
       uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX );
133

134
      s[position]++;
135
   }
136

137
   __syncthreads();
138
}
139

140
__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
141
{ 
142
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
143
    const ulong size=(ulong)(gridDim.x*blockDim.x);
144
    const ulong block=(ulong)space/(ulong)size;
145
   
146
    uint z=seed_z-(uint)id;
147
    uint w=seed_w+(uint)id;
148

149
    uint jsr=seed_z;
150
    uint jcong=seed_w;
151

152
   for ( ulong i=0;i<iterations;i++) {
153

154
      // Sparse version
155
       uint position=(uint)( (ulong)MWC*block/MAX*size+id );
156

157
      s[position]++;
158
   }
159

160
   __syncthreads();
161
}
162

163
__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
164
{
165
    const ulong id=(ulong)(threadIdx.x);
166
    const ulong size=(ulong)(blockDim.x);
167
    const ulong block=(ulong)space/(ulong)size;
168
   
169
    uint z=seed_z-(uint)id;
170
    uint w=seed_w+(uint)id;
171

172
    uint jsr=seed_z;
173
    uint jcong=seed_w;
174

175
   for ( ulong i=0;i<iterations;i++) {
176

177
      // Dense version
178
       size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
179

180
      s[position]++;
181
   }
182

183

184
   __syncthreads();
185

186
}
187

188
__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
189
{
190
    const ulong id=(ulong)threadIdx.x;
191
    const ulong size=(ulong)blockDim.x;
192
    const ulong block=(ulong)space/(ulong)size;
193
   
194
    uint z=seed_z-(uint)id;
195
    uint w=seed_w+(uint)id;
196

197
    uint jsr=seed_z;
198
    uint jcong=seed_w;
199

200
   for ( ulong i=0;i<iterations;i++) {
201

202
      // Sparse version
203
       size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
204

205
      s[position]++;
206
   }
207

208
   __syncthreads();
209

210
}
211

212
__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
213
{
214
    const ulong id=(ulong)(blockIdx.x);
215
    const ulong size=(ulong)(gridDim.x);
216
    const ulong block=(ulong)space/(ulong)size;
217
   
218
    uint z=seed_z-(uint)id;
219
    uint w=seed_w+(uint)id;
220

221
    uint jsr=seed_z;
222
    uint jcong=seed_w;
223

224
   for ( ulong i=0;i<iterations;i++) {
225

226
      // Dense version
227
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
228

229
      s[position]++;
230
   }
231
      
232
   __syncthreads();
233
}
234

235
__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
236
{
237
    const ulong id=(ulong)(blockIdx.x);
238
    const ulong size=(ulong)(gridDim.x);
239
    const ulong block=(ulong)space/(ulong)size;
240
   
241
    uint z=seed_z-(uint)id;
242
    uint w=seed_w+(uint)id;
243

244
    uint jsr=seed_z;
245
    uint jcong=seed_w;
246

247
   for ( ulong i=0;i<iterations;i++) {
248

249
      // Sparse version
250
      size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id );
251

252
      s[position]++;
253

254
   }
255

256
   //s[blockIdx.x]=blockIdx.x;
257
   __syncthreads();
258
}
259

260
"""
261

    
262
KERNEL_CODE_OPENCL="""
263
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
264

265
// Marsaglia RNG very simple implementation
266
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
267
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
268

269
#define MWC   (znew+wnew)
270
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
271
#define CONG  (jcong=69069*jcong+1234567)
272
#define KISS  ((MWC^CONG)+SHR3)
273

274
#define CONGfp CONG * 2.328306435454494e-10f
275
#define SHR3fp SHR3 * 2.328306435454494e-10f
276
#define MWCfp MWC * 2.328306435454494e-10f
277
#define KISSfp KISS * 2.328306435454494e-10f
278

279
#define MAX (ulong)4294967296
280

281
uint rotl(uint value, int shift) {
282
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
283
}
284
 
285
uint rotr(uint value, int shift) {
286
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
287
}
288

289
__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
290
{
291
   __private const ulong id=(ulong)get_global_id(0);
292
   __private const ulong size=(ulong)get_global_size(0);
293
   __private const ulong block=(ulong)space/(ulong)size;
294
   
295
   __private uint z=seed_z-(uint)id;
296
   __private uint w=seed_w+(uint)id;
297

298
   __private uint jsr=seed_z;
299
   __private uint jcong=seed_w;
300

301
   for (__private ulong i=0;i<iterations;i++) {
302

303
      // Dense version
304
      __private size_t position=(size_t)( ((ulong)MWC*(ulong)space)/MAX );
305

306
      atomic_inc(&s[position]);
307
   }
308

309
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
310

311
}
312

313
__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
314
{
315
   __private const ulong id=(ulong)get_global_id(0);
316
   __private const ulong size=(ulong)get_global_size(0);
317
   __private const ulong block=(ulong)space/(ulong)size;
318
   
319
   __private uint z=seed_z-(uint)id;
320
   __private uint w=seed_w+(uint)id;
321

322
   __private uint jsr=seed_z;
323
   __private uint jcong=seed_w;
324

325
   for (__private ulong i=0;i<iterations;i++) {
326

327
      // Dense version
328
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
329

330
      s[position]++;
331
   }
332

333
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
334

335
}
336

337
__kernel void SplutterGlobalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
338
{
339
   __private const ulong id=(ulong)get_global_id(0);
340
   __private const ulong size=(ulong)get_global_size(0);
341
   __private const ulong block=(ulong)space/(ulong)size;
342
   
343
   __private uint z=seed_z-(uint)id;
344
   __private uint w=seed_w+(uint)id;
345

346
   __private uint jsr=seed_z;
347
   __private uint jcong=seed_w;
348

349
   for (__private ulong i=0;i<iterations;i++) {
350

351
      // Sparse version
352
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
353

354
      s[position]++;
355
   }
356

357
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
358

359
}
360

361
__kernel void SplutterLocalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
362
{
363
   __private const ulong id=(ulong)get_local_id(0);
364
   __private const ulong size=(ulong)get_local_size(0);
365
   __private const ulong block=(ulong)space/(ulong)size;
366
   
367
   __private uint z=seed_z-(uint)id;
368
   __private uint w=seed_w+(uint)id;
369

370
   __private uint jsr=seed_z;
371
   __private uint jcong=seed_w;
372

373
   for (__private ulong i=0;i<iterations;i++) {
374

375
      // Dense version
376
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
377

378
      s[position]++;
379
   }
380

381
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
382

383
}
384

385
__kernel void SplutterLocalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
386
{
387
   __private const ulong id=(ulong)get_local_id(0);
388
   __private const ulong size=(ulong)get_local_size(0);
389
   __private const ulong block=(ulong)space/(ulong)size;
390
   
391
   __private uint z=seed_z-(uint)id;
392
   __private uint w=seed_w+(uint)id;
393

394
   __private uint jsr=seed_z;
395
   __private uint jcong=seed_w;
396

397
   for (__private ulong i=0;i<iterations;i++) {
398

399
      // Sparse version
400
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
401

402
      s[position]++;
403
   }
404

405
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
406

407
}
408

409
__kernel void SplutterHybridDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
410
{
411
   __private const ulong id=(ulong)(get_global_id(0));
412
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
413
   __private const ulong block=(ulong)space/(ulong)size;
414
   
415
   __private uint z=seed_z-(uint)id;
416
   __private uint w=seed_w+(uint)id;
417

418
   __private uint jsr=seed_z;
419
   __private uint jcong=seed_w;
420

421
   for (__private ulong i=0;i<iterations;i++) {
422

423
      // Dense version
424
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
425

426
      s[position]++;
427
   }
428
      
429
}
430

431
__kernel void SplutterHybridSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
432
{
433
   __private const ulong id=(ulong)(get_global_id(0));
434
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
435
   __private const ulong block=(ulong)space/(ulong)size;
436
   
437
   __private uint z=seed_z-(uint)id;
438
   __private uint w=seed_w+(uint)id;
439

440
   __private uint jsr=seed_z;
441
   __private uint jcong=seed_w;
442

443
   for (__private ulong i=0;i<iterations;i++) {
444

445
      // Sparse version
446
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
447

448
      s[position]++;
449
   }
450
      
451
}
452

453
"""
454

    
455
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density):
  """Launch the Splutter CUDA kernels `steps` times and time each run.

  circle     : host numpy array wrapped for the device.  NOTE(review): it is
               wrapped below but never passed to any kernel launch — confirm
               whether it is still needed.
  iterations : total number of increments, spread over `jobs` work units
  steps      : number of timed repetitions
  jobs       : number of parallel work units
  ParaStyle  : 'Blocks' (one thread per block), 'Hybrid' (blocks x threads),
               anything else runs a single block of `jobs` threads
  Density    : 'Dense'/'Sparse' pick the matching kernel variants; anything
               else picks the atomic SplutterGlobal kernel

  Returns (mean, median, std) of the measured kernel times in seconds.

  NOTE(review): relies on names defined elsewhere in the file: cuda,
  SourceModule, pycuda (PyCUDA imports) and the global Alu used in the
  status prints — confirm they are initialised before this is called.
  """
  # With PyCUDA autoinit, nothing else to do for device/context setup!

  circleCU = cuda.InOut(circle)
  
  # Compile the kernel source at run time.
  mod = SourceModule(KERNEL_CODE_CUDA)

  # Select the kernel trio (Blocks/Threads/Hybrid) for the requested density.
  if Density=='Dense':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
  elif Density=='Sparse':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")
  else:
    # Default: atomic version; only the Blocks entry point is defined here,
    # so non-'Blocks' ParaStyle values would fail below in this branch.
    MetropolisBlocksCU=mod.get_function("SplutterGlobal")
    
  # CUDA events used to time the kernel executions.
  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()
  
  MySplutter=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  # Spread iterations over jobs, rounding up when not evenly divisible.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    
  # Effective total after rounding (computed but not used further here).
  iterationsNew=iterationsCL*jobs

  # Histogram buffer: 16 cells per job.
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

  for i in range(steps):

    # Reset the histogram for this repetition.
    Splutter[:]=0
    
    print Splutter,len(Splutter)

    # InOut: copied to the device at launch, back to the host afterwards.
    SplutterCU = cuda.InOut(Splutter)

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      # jobs blocks of a single thread each.
      MetropolisBlocksCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)      
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # NOTE(review): grid=(jobs,1) with block=(threads,1,1) launches
      # jobs*threads work-items in total, while the print below reports
      # jobs/threads blocks — verify the intended grid size.
      MetropolisHybridCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # Threads style: a single block holding all jobs threads.
      MetropolisThreadsCU(SplutterCU,
                       numpy.uint32(len(Splutter)),
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,1,jobs,ParaStyle)
    stop.record()
    stop.synchronize()
                
    # Event-to-event elapsed time, milliseconds -> seconds.
    elapsed = start.time_till(stop)*1e-3

    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)

    MyDuration[i]=elapsed

    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    #MyPi[i]=numpy.median(AllPi)
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]


  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
550

    
551

    
552
def MetropolisOpenCL(circle,iterations,steps,jobs,
                     ParaStyle,Alu,Device,Density):
  """Select an OpenCL device, launch the Splutter kernels and time them.

  circle     : unused in this function's visible body (kept for symmetry
               with MetropolisCuda)
  iterations : total number of increments, spread over `jobs` work units
  steps      : number of timed repetitions
  jobs       : number of parallel work units (global work size)
  ParaStyle  : 'Blocks' (global work only), 'Hybrid' (global+local),
               anything else: local work size equal to global ('Threads')
  Alu        : 'CPU' or 'GPU', used for device selection and status prints
  Device     : 0 to auto-select the first device matching Alu, otherwise
               a 1-based device index
  Density    : 'Dense'/'Sparse' pick the matching kernel variants; anything
               else picks the atomic SplutterGlobal kernel (Blocks style only)

  Returns (mean, median, std) of the profiled kernel times in seconds,
  or (0,0,0) when no matching device is found.

  NOTE(review): relies on the pyopencl module imported elsewhere as `cl`.
  """
  # Initialise the variables with the proper casts.

  MaxMemoryXPU=0
  MinMemoryXPU=0

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        # Track the min/max allocatable memory over all devices seen.
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
          XPU=device
          print "CPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
          
  else:
    print "Enter XPU selector based on device number & ALU type"
    # Devices are numbered from 1 in this selector.
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
          HasXPU=True
          MemoryXPU=deviceMemory
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  # NOTE(review): if Device==0 and no device matched, MemoryXPU/XPU are
  # unbound here and this print raises NameError — only the Id-based branch
  # guards against a failed selection.
  print "Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU)

  # Create the context and the command queue for execution (profiling
  # enabled so kernel times can be read back).
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
  
  # Get the available flags for buffer creation.
  mf = cl.mem_flags

  # Build the OpenCL kernels at run time.
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
      
  MyDuration=numpy.zeros(steps)
  
  # Spread iterations over jobs, rounding up when not evenly divisible.
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    
  # Effective total after rounding (computed but not used further here).
  iterationsNew=numpy.uint64(iterationsCL*jobs)

  MySplutter=numpy.zeros(steps)

  # Largest power-of-two element count fitting the smallest device allocation.
  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))
  
  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
  # Histogram buffer: 16 cells per job.
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

  for i in range(steps):
                
    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)
 
    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)

    # Reset the histogram for this repetition.
    Splutter[:]=0

    print Splutter,len(Splutter)

    # NOTE(review): buffer created WRITE_ONLY yet the kernels read-modify-
    # write it (s[position]++) and the host reads it back below — verify
    # the intended flags (READ_WRITE would match the usage).
    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)

    if ParaStyle=='Blocks':
      # Call OpenCL kernel
      # (1,) is Global work size (only 1 work size)
      # (1,) is local work size
      # circleCL is lattice translated in CL format
      # SeedZCL is lattice translated in CL format
      # SeedWCL is lattice translated in CL format
      # step is number of iterations
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
      #                                      SplutterCL,
      #                                      numpy.uint32(len(Splutter)),
      #                                      numpy.uint64(iterationsCL),
      #                                      numpy.uint32(nprnd(2**30/jobs)),
      #                                      numpy.uint32(nprnd(2**30/jobs)))
      # NOTE(review): the Blocks variants use fixed seeds (the classic
      # Marsaglia constants) while Hybrid/Threads draw random seeds — verify
      # this asymmetry is intended.
      if Density=='Dense':
        CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None,
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(521288629),
                                                  numpy.uint32(362436069))
      elif Density=='Sparse':
        CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None,
                                                   SplutterCL,
                                                   numpy.uint32(len(Splutter)),
                                                   numpy.uint64(iterationsCL),
                                                   numpy.uint32(521288629),
                                                   numpy.uint32(362436069))
        
      else:
        CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
                                             SplutterCL,
                                             numpy.uint32(len(Splutter)),
                                             numpy.uint64(iterationsCL),
                                             numpy.uint32(521288629),
                                             numpy.uint32(362436069))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL the hybrid launch keeps the global work size and sets an
      # explicit local work size of `threads`.
      if Density=='Dense':
        CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,),
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(nprnd(2**30/jobs)),
                                                  numpy.uint32(nprnd(2**30/jobs)))
      elif Density=='Sparse':
        CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,),
                                                   SplutterCL,
                                                   numpy.uint32(len(Splutter)),
                                                   numpy.uint64(iterationsCL),
                                                   numpy.uint32(nprnd(2**30/jobs)),
                                                   numpy.uint32(nprnd(2**30/jobs)))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL the 'Threads' style sets the local work size equal to the
      # global work size (a single work-group).
      if Density=='Dense':
        CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,),
                                                 SplutterCL,
                                                 numpy.uint32(len(Splutter)),
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(nprnd(2**30/jobs)),
                                                 numpy.uint32(nprnd(2**30/jobs)))
      elif Density=='Sparse':
        CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,),
                                                  SplutterCL,
                                                  numpy.uint32(len(Splutter)),
                                                  numpy.uint64(iterationsCL),
                                                  numpy.uint32(nprnd(2**30/jobs)),
                                                  numpy.uint32(nprnd(2**30/jobs)))
        
        
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    # Wait for the kernel, then copy the histogram back to the host.
    CLLaunch.wait()
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()

    # Profiled kernel time, nanoseconds -> seconds.
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    MyDuration[i]=elapsed
    print Splutter,sum(Splutter)
    #MySplutter[i]=numpy.median(Splutter)
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)

    # Release the device buffer before the next repetition.
    SplutterCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
        
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
742

    
743

    
744
def FitAndPrint(N,D,Curves):
745

    
746
  from scipy.optimize import curve_fit
747
  import matplotlib.pyplot as plt
748

    
749
  try:
750
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
751

    
752
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
753
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
754
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
755
    coeffs_Amdahl[0]=D[0]
756
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
757
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
758
  except:
759
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
760

    
761
  try:
762
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
763

    
764
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
765
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
766
    coeffs_AmdahlR[0]=D[0]
767
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
768
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
769

    
770
  except:
771
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
772

    
773
  try:
774
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
775

    
776
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
777
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
778
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
779
    coeffs_Mylq[0]=D[0]
780
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
781
                                                            coeffs_Mylq[1],
782
                                                            coeffs_Mylq[3],
783
                                                            coeffs_Mylq[2])
784
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
785
                coeffs_Mylq[3])
786
  except:
787
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
788

    
789
  try:
790
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
791

    
792
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
793
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
794
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
795
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
796
    coeffs_Mylq2[0]=D[0]
797
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
798
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
799
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
800

    
801
  except:
802
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
803

    
804
  if Curves:
805
    plt.xlabel("Number of Threads/work Items")
806
    plt.ylabel("Total Elapsed Time")
807

    
808
    Experience,=plt.plot(N,D,'ro') 
809
    try:
810
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
811
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
812
    except:
813
      print "Fit curves seem not to be available"
814

    
815
    plt.legend()
816
    plt.show()
817

    
818
if __name__=='__main__':
819

    
820
  # Set defaults values
821
  
822
  # Alu can be CPU, GPU or ACCELERATOR
823
  Alu='CPU'
824
  # Id of GPU : 1 is for first find !
825
  Device=0
826
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
827
  GpuStyle='OpenCL'
828
  # Parallel distribution can be on Threads or Blocks
829
  ParaStyle='Blocks'
830
  # Iterations is integer
831
  Iterations=10000000
832
  # JobStart in first number of Jobs to explore
833
  JobStart=1
834
  # JobEnd is last number of Jobs to explore
835
  JobEnd=16
836
  # JobStep is the step of Jobs to explore
837
  JobStep=1
838
  # Redo is the times to redo the test to improve metrology
839
  Redo=1
840
  # OutMetrology is method for duration estimation : False is GPU inside
841
  OutMetrology=False
842
  Metrology='InMetro'
843
  # Curves is True to print the curves
844
  Curves=False
845
  # Fit is True to print the curves
846
  Fit=False
847
  # Spluttering is Dense by default
848
  Density='All'
849

    
850
  try:
851
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:y:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","density="])
852
  except getopt.GetoptError:
853
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
854
    sys.exit(2)
855
    
856
  for opt, arg in opts:
857
    if opt == '-h':
858
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law)  -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
859

    
860
      print "\nInformations about devices detected under OpenCL:"
861
      # For PyOpenCL import
862
      try:
863
        import pyopencl as cl
864
        Id=1
865
        for platform in cl.get_platforms():
866
          for device in platform.get_devices():
867
            deviceType=cl.device_type.to_string(device.type)
868
            deviceMemory=device.max_mem_alloc_size
869
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
870
            Id=Id+1
871

    
872
        print
873
        sys.exit()
874
      except ImportError:
875
        print "Your platform does not seem to support OpenCL"
876
        
877
    elif opt == '-o':
878
      OutMetrology=True
879
      Metrology='OutMetro'
880
    elif opt == '-c':
881
      Curves=True
882
    elif opt in ("-y", "--density"):
883
      Density = arg
884
    elif opt == '-f':
885
      Fit=True
886
    elif opt in ("-a", "--alu"):
887
      Alu = arg
888
    elif opt in ("-d", "--device"):
889
      Device = int(arg)
890
    elif opt in ("-g", "--gpustyle"):
891
      GpuStyle = arg
892
    elif opt in ("-p", "--parastyle"):
893
      ParaStyle = arg
894
    elif opt in ("-i", "--iterations"):
895
      Iterations = numpy.uint64(arg)
896
    elif opt in ("-s", "--jobstart"):
897
      JobStart = int(arg)
898
    elif opt in ("-e", "--jobend"):
899
      JobEnd = int(arg)
900
    elif opt in ("-t", "--jobstep"):
901
      JobStep = int(arg)
902
    elif opt in ("-r", "--redo"):
903
      Redo = int(arg)
904

    
905
  print "Toto %s" % Alu
906

    
907
  if Alu=='CPU' and GpuStyle=='CUDA':
908
    print "Alu can't be CPU for CUDA, set Alu to GPU"
909
    Alu='GPU'
910

    
911
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
912
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
913
    ParaStyle='Blocks'
914

    
915
  print "Compute unit : %s" % Alu
916
  print "Device Identification : %s" % Device
917
  print "GpuStyle used : %s" % GpuStyle
918
  print "Parallel Style used : %s" % ParaStyle
919
  print "Density Spluttering : %s" % Density
920
  print "Iterations : %s" % Iterations
921
  print "Number of threads on start : %s" % JobStart
922
  print "Number of threads on end : %s" % JobEnd
923
  print "Number of redo : %s" % Redo
924
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
925

    
926
  if GpuStyle=='CUDA':
927
    try:
928
      # For PyCUDA import
929
      import pycuda.driver as cuda
930
      import pycuda.gpuarray as gpuarray
931
      import pycuda.autoinit
932
      from pycuda.compiler import SourceModule
933
    except ImportError:
934
      print "Platform does not seem to support CUDA"
935

    
936
  if GpuStyle=='OpenCL':
937
    try:
938
      # For PyOpenCL import
939
      import pyopencl as cl
940
      Id=1
941
      for platform in cl.get_platforms():
942
        for device in platform.get_devices():
943
          deviceType=cl.device_type.to_string(device.type)
944
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
945
          if Id == Device:
946
            # Set the Alu as detected Device Type
947
            Alu=deviceType
948
          Id=Id+1
949
    except ImportError:
950
      print "Platform does not seem to support CUDA"
951
      
952
  average=numpy.array([]).astype(numpy.float32)
953
  median=numpy.array([]).astype(numpy.float32)
954
  stddev=numpy.array([]).astype(numpy.float32)
955

    
956
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
957

    
958
  Jobs=JobStart
959

    
960
  while Jobs <= JobEnd:
961
    avg,med,std=0,0,0
962
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
963
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
964

    
965
    if OutMetrology:
966
      duration=numpy.array([]).astype(numpy.float32)
967
      for i in range(Redo):
968
        start=time.time()
969
        if GpuStyle=='CUDA':
970
          try:
971
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Density)
972
          except:
973
            print "Problem with %i // computations on Cuda" % Jobs
974
        elif GpuStyle=='OpenCL':
975
          try:
976
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
977
                                   Alu,Device,Density)
978
          except:
979
            print "Problem with %i // computations on OpenCL" % Jobs            
980
        duration=numpy.append(duration,time.time()-start)
981
      if (a,m,s) != (0,0,0):
982
        avg=numpy.mean(duration)
983
        med=numpy.median(duration)
984
        std=numpy.std(duration)
985
      else:
986
        print "Values seem to be wrong..."
987
    else:
988
      if GpuStyle=='CUDA':
989
        try:
990
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,
991
                                     Jobs,ParaStyle,Density)
992
        except:
993
          print "Problem with %i // computations on Cuda" % Jobs
994
      elif GpuStyle=='OpenCL':
995
        try:
996
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,
997
                                       Jobs,ParaStyle,Alu,Device,Density)
998
        except:
999
          print "Problem with %i // computations on OpenCL" % Jobs            
1000

    
1001
    if (avg,med,std) != (0,0,0):
1002
      print "jobs,avg,med,std",Jobs,avg,med,std
1003
      average=numpy.append(average,avg)
1004
      median=numpy.append(median,med)
1005
      stddev=numpy.append(stddev,std)
1006
    else:
1007
      print "Values seem to be wrong..."
1008
    #THREADS*=2
1009
    if len(average)!=0:
1010
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
1011
      ToSave=[ ExploredJobs,average,median,stddev ]
1012
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
1013
    Jobs+=JobStep
1014

    
1015
  if Fit:
1016
    FitAndPrint(ExploredJobs,median,Curves)
1017