Statistiques
| Révision :

root / Splutter / GPU / SplutterGPU.py @ 202

Historique | Voir | Annoter | Télécharger (27,66 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# http://mathema.tician.de/software/pyopencl
11
# 
12

    
13
# 2013-01-01 : problems with launch timeout
14
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
15
# Option "Interactive" "0" in /etc/X11/xorg.conf
16

    
17
# Marsaglia elements about RNG 
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import math
26
from socket import gethostname
27

    
28
# find prime factors of a number
29
# Get for WWW :
30
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
31
def PrimeFactors(x):
32
  factorlist=numpy.array([]).astype('uint32')
33
  loop=2
34
  while loop<=x:
35
    if x%loop==0:
36
      x/=loop
37
      factorlist=numpy.append(factorlist,[loop])
38
    else:
39
      loop+=1
40
  return factorlist
41
  
42
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
43
# output is thread number
44
def BestThreadsNumber(jobs):
45
  factors=PrimeFactors(jobs)
46
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
47
  threads=1
48
  for factor in matrix.transpose().ravel():
49
    threads=threads*factor
50
    if threads*threads>jobs:
51
      break
52
  return(long(threads))
53

    
54
# Predicted Amdahl Law (Reduced with s=1-p)  
55
def AmdahlR(N, T1, p):
56
  return (T1*(1-p+p/N))
57

    
58
# Predicted Amdahl Law
59
def Amdahl(N, T1, s, p):
60
  return (T1*(s+p/N))
61

    
62
# Predicted Mylq Law with first order
63
def Mylq(N, T1,s,c,p):
64
  return (T1*(s+p/N)+c*N)
65

    
66
# Predicted Mylq Law with second order
67
def Mylq2(N, T1,s,c1,c2,p):
68
  return (T1*(s+p/N)+c1*N+c2*N*N)
69

    
70
KERNEL_CODE_CUDA="""
71

72
// Marsaglia RNG very simple implementation
73

74
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
75
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
76

77
#define MWC   (znew+wnew)
78
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
79
#define CONG  (jcong=69069*jcong+1234567)
80
#define KISS  ((MWC^CONG)+SHR3)
81

82
#define CONGfp CONG * 2.328306435454494e-10f
83
#define SHR3fp SHR3 * 2.328306435454494e-10f
84
#define MWCfp MWC * 2.328306435454494e-10f
85
#define KISSfp KISS * 2.328306435454494e-10f
86

87
#define MAX (ulong)4294967296
88
#define UMAX (uint)2147483648
89

90
__global__ void SplutterGlobal(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
91
{
92
    const ulong id=(ulong)(blockIdx.x);
93
   
94
    uint z=seed_z-(uint)id;
95
    uint w=seed_w+(uint)id;
96

97
    uint jsr=seed_z;
98
    uint jcong=seed_w;
99

100
   for ( ulong i=0;i<iterations;i++) {
101

102
      // All version
103
      uint position=(uint)( ((ulong)MWC*(ulong)space)/MAX );
104

105
      // UMAX is set to avoid round over overflow
106
      atomicInc(&s[position],UMAX);
107
   }
108

109
   __syncthreads();
110
}
111

112
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
113
{
114
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
115
    const ulong size=(ulong)(gridDim.x*blockDim.x);
116
    const ulong block=(ulong)space/(ulong)size;
117
   
118
    uint z=seed_z-(uint)id;
119
    uint w=seed_w+(uint)id;
120

121
    uint jsr=seed_z;
122
    uint jcong=seed_w;
123

124
   for ( ulong i=0;i<iterations;i++) {
125

126
      // Dense version
127
       uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX );
128

129
      s[position]++;
130
   }
131

132
   __syncthreads();
133
}
134

135
__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
136
{ 
137
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
138
    const ulong size=(ulong)(gridDim.x*blockDim.x);
139
    const ulong block=(ulong)space/(ulong)size;
140
   
141
    uint z=seed_z-(uint)id;
142
    uint w=seed_w+(uint)id;
143

144
    uint jsr=seed_z;
145
    uint jcong=seed_w;
146

147
   for ( ulong i=0;i<iterations;i++) {
148

149
      // Sparse version
150
       uint position=(uint)( (ulong)MWC*block/MAX*size+id );
151

152
      s[position]++;
153
   }
154

155
   __syncthreads();
156
}
157

158
__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
159
{
160
    const ulong id=(ulong)(threadIdx.x);
161
    const ulong size=(ulong)(blockDim.x);
162
    const ulong block=(ulong)space/(ulong)size;
163
   
164
    uint z=seed_z-(uint)id;
165
    uint w=seed_w+(uint)id;
166

167
    uint jsr=seed_z;
168
    uint jcong=seed_w;
169

170
   for ( ulong i=0;i<iterations;i++) {
171

172
      // Dense version
173
       size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
174

175
      s[position]++;
176
   }
177

178

179
   __syncthreads();
180

181
}
182

183
__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
184
{
185
    const ulong id=(ulong)threadIdx.x;
186
    const ulong size=(ulong)blockDim.x;
187
    const ulong block=(ulong)space/(ulong)size;
188
   
189
    uint z=seed_z-(uint)id;
190
    uint w=seed_w+(uint)id;
191

192
    uint jsr=seed_z;
193
    uint jcong=seed_w;
194

195
   for ( ulong i=0;i<iterations;i++) {
196

197
      // Sparse version
198
       size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
199

200
      s[position]++;
201
   }
202

203
   __syncthreads();
204

205
}
206

207
__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
208
{
209
    const ulong id=(ulong)(blockIdx.x);
210
    const ulong size=(ulong)(gridDim.x);
211
    const ulong block=(ulong)space/(ulong)size;
212
   
213
    uint z=seed_z-(uint)id;
214
    uint w=seed_w+(uint)id;
215

216
    uint jsr=seed_z;
217
    uint jcong=seed_w;
218

219
   for ( ulong i=0;i<iterations;i++) {
220

221
      // Dense version
222
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
223

224
      s[position]++;
225
   }
226
      
227
   __syncthreads();
228
}
229

230
__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
231
{
232
    const ulong id=(ulong)(blockIdx.x);
233
    const ulong size=(ulong)(gridDim.x);
234
    const ulong block=(ulong)space/(ulong)size;
235
   
236
    uint z=seed_z-(uint)id;
237
    uint w=seed_w+(uint)id;
238

239
    uint jsr=seed_z;
240
    uint jcong=seed_w;
241

242
   for ( ulong i=0;i<iterations;i++) {
243

244
      // Sparse version
245
      size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id );
246

247
      s[position]++;
248

249
   }
250

251
   //s[blockIdx.x]=blockIdx.x;
252
   __syncthreads();
253
}
254

255
"""
256

    
257
KERNEL_CODE_OPENCL="""
258
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
259

260
// Marsaglia RNG very simple implementation
261
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
262
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
263

264
#define MWC   (znew+wnew)
265
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
266
#define CONG  (jcong=69069*jcong+1234567)
267
#define KISS  ((MWC^CONG)+SHR3)
268

269
#define CONGfp CONG * 2.328306435454494e-10f
270
#define SHR3fp SHR3 * 2.328306435454494e-10f
271
#define MWCfp MWC * 2.328306435454494e-10f
272
#define KISSfp KISS * 2.328306435454494e-10f
273

274
#define MAX (ulong)4294967296
275

276
uint rotl(uint value, int shift) {
277
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
278
}
279
 
280
uint rotr(uint value, int shift) {
281
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
282
}
283

284
__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
285
{
286
   __private const ulong id=(ulong)get_global_id(0);
287
   
288
   __private uint z=seed_z-(uint)id;
289
   __private uint w=seed_w+(uint)id;
290

291
   __private uint jsr=seed_z;
292
   __private uint jcong=seed_w;
293

294
   for (__private ulong i=0;i<iterations;i++) {
295

296
      // Dense version
297
      __private size_t position=(size_t)( MWC%space );
298

299
      atomic_inc(&s[position]);
300
   }
301

302
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
303

304
}
305

306
__kernel void SplutterLocal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
307
{
308
   __private const ulong id=(ulong)get_local_id(0);
309
   
310
   __private uint z=seed_z-(uint)id;
311
   __private uint w=seed_w+(uint)id;
312

313
   __private uint jsr=seed_z;
314
   __private uint jcong=seed_w;
315

316
   for (__private ulong i=0;i<iterations;i++) {
317

318
      // Dense version
319
      //__private size_t position=(size_t)( (MWC+id*block)%space );
320
      __private size_t position=(size_t)( MWC%space );
321

322
      atomic_inc(&s[position]);
323
   }
324

325
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
326

327
}
328

329
__kernel void SplutterHybrid(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
330
{
331
   __private const ulong id=(ulong)(get_global_id(0)+get_local_id(0));
332
   
333
   __private uint z=seed_z-(uint)id;
334
   __private uint w=seed_w+(uint)id;
335

336
   __private uint jsr=seed_z;
337
   __private uint jcong=seed_w;
338

339
   for (__private ulong i=0;i<iterations;i++) {
340

341
      // Dense version
342
      __private size_t position=(size_t)( MWC%space );
343

344
      atomic_inc(&s[position]);
345
   }
346
      
347
}
348

349
"""
350

    
351
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density,Memory):
352

    
353
  # Avec PyCUDA autoinit, rien a faire !
354

    
355
  circleCU = cuda.InOut(circle)
356
  
357
  mod = SourceModule(KERNEL_CODE_CUDA)
358

    
359
  if Density=='Dense':
360
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
361
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
362
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
363
  elif Density=='Sparse':
364
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
365
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
366
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")
367
  else:
368
    MetropolisBlocksCU=mod.get_function("SplutterGlobal")
369
    
370
  start = pycuda.driver.Event()
371
  stop = pycuda.driver.Event()
372
  
373
  MySplutter=numpy.zeros(steps)
374
  MyDuration=numpy.zeros(steps)
375

    
376
  if iterations%jobs==0:
377
    iterationsCL=numpy.uint64(iterations/jobs)
378
  else:
379
    iterationsCL=numpy.uint64(iterations/jobs+1)
380
    
381
  iterationsNew=iterationsCL*jobs
382

    
383
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
384

    
385
  for i in range(steps):
386

    
387
    start_time=time.time()
388
    Splutter[:]=0
389
    
390
    print Splutter,len(Splutter)
391

    
392
    SplutterCU = cuda.InOut(Splutter)
393

    
394
    start.record()
395
    start.synchronize()
396
    if ParaStyle=='Blocks':
397
      MetropolisBlocksCU(SplutterCU,
398
                         numpy.uint32(len(Splutter)),
399
                         numpy.uint64(iterationsCL),
400
                         numpy.uint32(nprnd(2**30/jobs)),
401
                         numpy.uint32(nprnd(2**30/jobs)),
402
                         grid=(jobs,1),
403
                         block=(1,1,1))
404
        
405
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
406
            (Alu,jobs,1,ParaStyle)      
407
    elif ParaStyle=='Hybrid':
408
      threads=BestThreadsNumber(jobs)
409
      MetropolisHybridCU(SplutterCU,
410
                         numpy.uint32(len(Splutter)),
411
                         numpy.uint64(iterationsCL),
412
                         numpy.uint32(nprnd(2**30/jobs)),
413
                         numpy.uint32(nprnd(2**30/jobs)),
414
                         grid=(jobs,1),
415
                         block=(threads,1,1))
416
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
417
            (Alu,jobs/threads,threads,ParaStyle)
418
    else:
419
      MetropolisThreadsCU(SplutterCU,
420
                       numpy.uint32(len(Splutter)),
421
                       numpy.uint64(iterationsCL),
422
                       numpy.uint32(nprnd(2**30/jobs)),
423
                       numpy.uint32(nprnd(2**30/jobs)),
424
                       grid=(1,1),
425
                       block=(jobs,1,1))
426
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
427
            (Alu,1,jobs,ParaStyle)
428
    stop.record()
429
    stop.synchronize()
430
                
431
#    elapsed = start.time_till(stop)*1e-3
432
    elapsed = time.time()-start_time
433

    
434
    print Splutter,sum(Splutter)
435
    MySplutter[i]=numpy.median(Splutter)
436
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)
437

    
438
    MyDuration[i]=elapsed
439

    
440
    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
441
    #MyPi[i]=numpy.median(AllPi)
442
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]
443

    
444

    
445
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
446

    
447
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
448

    
449

    
450
def MetropolisOpenCL(circle,iterations,steps,jobs,
451
                     ParaStyle,Alu,Device,Memory):
452
        
453
  # Initialisation des variables en les CASTant correctement
454

    
455
  MaxMemoryXPU=0
456
  MinMemoryXPU=0
457

    
458
  if Device==0:
459
    print "Enter XPU selector based on ALU type: first selected"
460
    HasXPU=False
461
    # Default Device selection based on ALU Type
462
    for platform in cl.get_platforms():
463
      for device in platform.get_devices():
464
        #deviceType=cl.device_type.to_string(device.type)
465
        deviceMemory=device.max_mem_alloc_size
466
        if deviceMemory>MaxMemoryXPU:
467
          MaxMemoryXPU=deviceMemory
468
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
469
          MinMemoryXPU=deviceMemory
470
        if not HasXPU:        
471
          XPU=device
472
          print "XPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
473
          HasXPU=True
474
          MemoryXPU=deviceMemory
475
          
476
  else:
477
    print "Enter XPU selector based on device number & ALU type"
478
    Id=1
479
    HasXPU=False
480
    # Primary Device selection based on Device Id
481
    for platform in cl.get_platforms():
482
      for device in platform.get_devices():
483
        #deviceType=cl.device_type.to_string(device.type)
484
        deviceMemory=device.max_mem_alloc_size
485
        if deviceMemory>MaxMemoryXPU:
486
          MaxMemoryXPU=deviceMemory
487
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
488
          MinMemoryXPU=deviceMemory
489
        if Id==Device  and HasXPU==False:
490
          XPU=device
491
          print "CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name)
492
          HasXPU=True
493
          MemoryXPU=deviceMemory
494
        Id=Id+1
495
    if HasXPU==False:
496
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
497
          (Device,Alu,Id-1)
498
      return(0,0,0)
499

    
500
  print "Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU)
501

    
502
  # Je cree le contexte et la queue pour son execution
503
  ctx = cl.Context([XPU])
504
  queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
505
  
506
  # Je recupere les flag possibles pour les buffers
507
  mf = cl.mem_flags
508

    
509
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
510
      
511
  MyDuration=numpy.zeros(steps)
512
  
513
  if iterations%jobs==0:
514
    iterationsCL=numpy.uint64(iterations/jobs)
515
  else:
516
    iterationsCL=numpy.uint64(iterations/jobs+1)
517
    
518
  iterationsNew=numpy.uint64(iterationsCL*jobs)
519

    
520
  MySplutter=numpy.zeros(steps)
521

    
522
  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
523
  print MaxWorks,2**(int)(numpy.log2(MemoryXPU))
524
  
525
  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
526
  #Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
527
  Splutter=numpy.zeros(Memory).astype(numpy.uint32)
528

    
529
  for i in range(steps):
530
                
531
    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
532
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)
533
 
534
    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)
535

    
536
    Splutter[:]=0
537

    
538
    print Splutter,len(Splutter)
539

    
540
    h2d_time=time.time()
541
    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)
542
    print('From Host to Device time %f' % (time.time()-h2d_time))
543

    
544
    start_time=time.time()
545
    if ParaStyle=='Blocks':
546
      # Call OpenCL kernel
547
      # (1,) is Global work size (only 1 work size)
548
      # (1,) is local work size
549
      # circleCL is lattice translated in CL format
550
      # SeedZCL is lattice translated in CL format
551
      # SeedWCL is lattice translated in CL format
552
      # step is number of iterations
553
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
554
      #                                      SplutterCL,
555
      #                                      numpy.uint32(len(Splutter)),
556
      #                                      numpy.uint64(iterationsCL),
557
      #                                      numpy.uint32(nprnd(2**30/jobs)),
558
      #                                      numpy.uint32(nprnd(2**30/jobs)))
559
      CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
560
                                           SplutterCL,
561
                                           numpy.uint32(len(Splutter)),
562
                                           numpy.uint64(iterationsCL),
563
                                           numpy.uint32(nprnd(2**30/jobs)),
564
                                           numpy.uint32(nprnd(2**30/jobs)))
565
        
566
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
567
            (Alu,jobs,1,ParaStyle)
568
    elif ParaStyle=='Hybrid':
569
      #threads=BestThreadsNumber(jobs)
570
      threads=BestThreadsNumber(256)
571
      print "print",threads      
572
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
573
      CLLaunch=MetropolisCL.SplutterHybrid(queue,(jobs,),(threads,),
574
                                           SplutterCL,
575
                                           numpy.uint32(len(Splutter)),
576
                                           numpy.uint64(iterationsCL),
577
                                           numpy.uint32(nprnd(2**30/jobs)),
578
                                           numpy.uint32(nprnd(2**30/jobs)))
579
        
580
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
581
            (Alu,jobs/threads,threads,ParaStyle)
582
    else:
583
      # en OpenCL, necessaire de mettre un global_id identique au local_id
584
      CLLaunch=MetropolisCL.SplutterLocal(queue,(jobs,),(jobs,),
585
                                          SplutterCL,
586
                                          numpy.uint32(len(Splutter)),
587
                                          numpy.uint64(iterationsCL),
588
                                          numpy.uint32(nprnd(2**30/jobs)),
589
                                          numpy.uint32(nprnd(2**30/jobs)))
590
        
591
        
592
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
593

    
594
    CLLaunch.wait()
595
    d2h_time=time.time()
596
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()
597
    print('From Device to Host %f' % (time.time()-d2h_time))
598
    
599
#    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
600
    elapsed = time.time()-start_time
601
    print('Elapsed compute time %f' % elapsed)
602

    
603
    MyDuration[i]=elapsed
604
    #print Splutter,sum(Splutter)
605
    #MySplutter[i]=numpy.median(Splutter)
606
    #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)
607
    
608
  SplutterCL.release()
609

    
610
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
611
        
612
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
613

    
614

    
615
def FitAndPrint(N,D,Curves):
616

    
617
  from scipy.optimize import curve_fit
618
  import matplotlib.pyplot as plt
619

    
620
  try:
621
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
622

    
623
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
624
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
625
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
626
    coeffs_Amdahl[0]=D[0]
627
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
628
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
629
  except:
630
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
631

    
632
  try:
633
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
634

    
635
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
636
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
637
    coeffs_AmdahlR[0]=D[0]
638
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
639
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
640

    
641
  except:
642
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
643

    
644
  try:
645
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
646

    
647
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
648
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
649
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
650
    coeffs_Mylq[0]=D[0]
651
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
652
                                                            coeffs_Mylq[1],
653
                                                            coeffs_Mylq[3],
654
                                                            coeffs_Mylq[2])
655
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
656
                coeffs_Mylq[3])
657
  except:
658
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
659

    
660
  try:
661
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
662

    
663
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
664
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
665
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
666
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
667
    coeffs_Mylq2[0]=D[0]
668
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
669
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
670
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
671

    
672
  except:
673
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
674

    
675
  if Curves:
676
    plt.xlabel("Number of Threads/work Items")
677
    plt.ylabel("Total Elapsed Time")
678

    
679
    Experience,=plt.plot(N,D,'ro') 
680
    try:
681
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
682
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
683
    except:
684
      print "Fit curves seem not to be available"
685

    
686
    plt.legend()
687
    plt.show()
688

    
689
if __name__=='__main__':
690

    
691
  # Set defaults values
692
  
693
  # Alu can be CPU, GPU or ACCELERATOR
694
  Alu='CPU'
695
  # Id of GPU : 1 is for first find !
696
  Device=0
697
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
698
  GpuStyle='OpenCL'
699
  # Parallel distribution can be on Threads or Blocks
700
  ParaStyle='Blocks'
701
  # Iterations is integer
702
  Iterations=10000000
703
  # JobStart in first number of Jobs to explore
704
  JobStart=1
705
  # JobEnd is last number of Jobs to explore
706
  JobEnd=16
707
  # JobStep is the step of Jobs to explore
708
  JobStep=1
709
  # Redo is the times to redo the test to improve metrology
710
  Redo=1
711
  # OutMetrology is method for duration estimation : False is GPU inside
712
  OutMetrology=False
713
  Metrology='InMetro'
714
  # Curves is True to print the curves
715
  Curves=False
716
  # Fit is True to print the curves
717
  Fit=False
718
  # Memory of vector explored
719
  Memory=1024
720

    
721
  try:
722
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
723
  except getopt.GetoptError:
724
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]
725
    sys.exit(2)
726
    
727
  for opt, arg in opts:
728
    if opt == '-h':
729
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]
730

    
731
      print "\nInformations about devices detected under OpenCL:"
732
      # For PyOpenCL import
733
      try:
734
        import pyopencl as cl
735
        Id=1
736
        for platform in cl.get_platforms():
737
          for device in platform.get_devices():
738
            #deviceType=cl.device_type.to_string(device.type)
739
            deviceMemory=device.max_mem_alloc_size
740
            print "Device #%i from %s with memory %i : %s" % (Id,platform.vendor,deviceMemory,device.name.lstrip())
741
            Id=Id+1
742

    
743
        print
744
        sys.exit()
745
      except ImportError:
746
        print "Your platform does not seem to support OpenCL"
747
        
748
    elif opt == '-o':
749
      OutMetrology=True
750
      Metrology='OutMetro'
751
    elif opt == '-c':
752
      Curves=True
753
    elif opt == '-f':
754
      Fit=True
755
    elif opt in ("-a", "--alu"):
756
      Alu = arg
757
    elif opt in ("-d", "--device"):
758
      Device = int(arg)
759
    elif opt in ("-g", "--gpustyle"):
760
      GpuStyle = arg
761
    elif opt in ("-p", "--parastyle"):
762
      ParaStyle = arg
763
    elif opt in ("-i", "--iterations"):
764
      Iterations = numpy.uint64(arg)
765
    elif opt in ("-s", "--jobstart"):
766
      JobStart = int(arg)
767
    elif opt in ("-e", "--jobend"):
768
      JobEnd = int(arg)
769
    elif opt in ("-t", "--jobstep"):
770
      JobStep = int(arg)
771
    elif opt in ("-r", "--redo"):
772
      Redo = int(arg)
773
    elif opt in ("-m", "--memory"):
774
      Memory = int(arg)
775

    
776
  if Alu=='CPU' and GpuStyle=='CUDA':
777
    print "Alu can't be CPU for CUDA, set Alu to GPU"
778
    Alu='GPU'
779

    
780
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
781
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
782
    ParaStyle='Blocks'
783

    
784
  print "Compute unit : %s" % Alu
785
  print "Device Identification : %s" % Device
786
  print "GpuStyle used : %s" % GpuStyle
787
  print "Parallel Style used : %s" % ParaStyle
788
  print "Iterations : %s" % Iterations
789
  print "Number of threads on start : %s" % JobStart
790
  print "Number of threads on end : %s" % JobEnd
791
  print "Number of redo : %s" % Redo
792
  print "Memory  : %s" % Memory
793
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
794

    
795
  if GpuStyle=='CUDA':
796
    try:
797
      # For PyCUDA import
798
      import pycuda.driver as cuda
799
      import pycuda.gpuarray as gpuarray
800
      import pycuda.autoinit
801
      from pycuda.compiler import SourceModule
802
    except ImportError:
803
      print "Platform does not seem to support CUDA"
804

    
805
  if GpuStyle=='OpenCL':
806
    try:
807
      # For PyOpenCL import
808
      import pyopencl as cl
809
      Id=1
810
      for platform in cl.get_platforms():
811
        for device in platform.get_devices():
812
          #deviceType=cl.device_type.to_string(device.type)
813
          print "Device #%i : %s" % (Id,device.name)
814
          if Id == Device:
815
            # Set the Alu as detected Device Type
816
            Alu='xPU'
817
          Id=Id+1
818
    except ImportError:
819
      print "Platform does not seem to support CUDA"
820
      
821
  average=numpy.array([]).astype(numpy.float32)
822
  median=numpy.array([]).astype(numpy.float32)
823
  stddev=numpy.array([]).astype(numpy.float32)
824

    
825
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
826

    
827
  Jobs=JobStart
828

    
829
  while Jobs <= JobEnd:
830
    avg,med,std=0,0,0
831
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
832
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
833

    
834
    if OutMetrology:
835
      duration=numpy.array([]).astype(numpy.float32)
836
      for i in range(Redo):
837
        start=time.time()
838
        if GpuStyle=='CUDA':
839
          try:
840
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
841
                                 Memory)
842
          except:
843
            print "Problem with %i // computations on Cuda" % Jobs
844
        elif GpuStyle=='OpenCL':
845
          try:
846
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
847
                                   Alu,Device,Memory)
848
          except:
849
            print "Problem with %i // computations on OpenCL" % Jobs            
850
        duration=numpy.append(duration,time.time()-start)
851
      if (a,m,s) != (0,0,0):
852
        avg=numpy.mean(duration)
853
        med=numpy.median(duration)
854
        std=numpy.std(duration)
855
      else:
856
        print "Values seem to be wrong..."
857
    else:
858
      if GpuStyle=='CUDA':
859
        try:
860
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,
861
                                     Jobs,ParaStyle,Memory)
862
        except:
863
          print "Problem with %i // computations on Cuda" % Jobs
864
      elif GpuStyle=='OpenCL':
865
        try:
866
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,
867
                                       ParaStyle,Alu,Device,Memory)
868
        except:
869
          print "Problem with %i // computations on OpenCL" % Jobs            
870

    
871
    if (avg,med,std) != (0,0,0):
872
      print "jobs,avg,med,std",Jobs,avg,med,std
873
      average=numpy.append(average,avg)
874
      median=numpy.append(median,med)
875
      stddev=numpy.append(stddev,std)
876
    else:
877
      print "Values seem to be wrong..."
878
    #THREADS*=2
879
    if len(average)!=0:
880
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
881
      ToSave=[ ExploredJobs,average,median,stddev ]
882
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
883
    Jobs+=JobStep
884

    
885
  if Fit:
886
    FitAndPrint(ExploredJobs,median,Curves)
887