Revision 57

Splutter/GPU/SplutterGPU.py (revision 57)
#!/usr/bin/env python

#
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klöckner for PyCUDA:
# http://mathema.tician.de/software/pycuda
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
from socket import gethostname

# Find the prime factors of a number
# Taken from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      x/=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
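
# Example: trial division returns the prime factors in non-decreasing order,
# e.g. PrimeFactors(12) -> array([2, 2, 3], dtype=uint32)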

# Try to find the best thread number in the Hybrid approach (Blocks & Threads)
# Output is the thread number
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  return(long(threads))
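
# Example: for jobs=12, PrimeFactors gives [2,2,3]; interleaving the factor
# list with its reverse yields [2,3,2,2,3,2], and the running product stops
# at threads=6, the first value whose square exceeds jobs.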

# Predicted Amdahl law (reduced form with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq law, first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+p/N)+c*N)

# Predicted Mylq law, second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+p/N)+c1*N+c2*N*N)
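
# Worked example for the Amdahl model: Amdahl(4, 10.0, 0.1, 0.9)
# = 10*(0.1+0.9/4) = 3.25, i.e. a 10 s serial run with a 90% parallel
# fraction takes 3.25 s on 4 compute units.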

KERNEL_CODE_CUDA="""

// Marsaglia RNG, very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
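
// Note: the *fp variants scale a 32-bit draw by roughly 2^-32 to map it
// into [0,1); KISS combines the MWC, SHR3 and CONG generators.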

#define MAX 4294967296

uint rotl(uint value, int shift) {
   return (value << shift) | (value >> (sizeof(value) * 8 - shift));
}

uint rotr(uint value, int shift) {
   return (value >> shift) | (value << (sizeof(value) * 8 - shift));
}

__global__ void MainLoopBlocks(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
{
   // uint z=rotl(seed_z,blockIdx.x);
   // uint w=rotr(seed_w,blockIdx.x);

   // uint jsr=rotl(seed_z,blockIdx.x);
   // uint jcong=rotr(seed_w,blockIdx.x);

   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w%(blockIdx.x+1);

   uint jsr=seed_z/(blockIdx.x+1);
   uint jcong=seed_w%(blockIdx.x+1);
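
   // Each block derives its own RNG state from the host seeds and its
   // block index (the rotl/rotr variants above are kept for reference)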

   for (ulong i=0;i<iterations;i++) {

      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;

   }
   __threadfence_block();
}

__global__ void MainLoopThreads(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
{
   // uint z=rotl(seed_z,threadIdx.x);
   // uint w=rotr(seed_w,threadIdx.x);

   // uint jsr=rotl(seed_z,threadIdx.x);
   // uint jcong=rotr(seed_w,threadIdx.x);

   uint z=seed_z;
   uint w=seed_w;

   uint jsr=seed_z;
   uint jcong=seed_w;

   for (ulong i=0;i<iterations;i++) {

      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
   }

   __syncthreads();
}

__global__ void MainLoopHybrid(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
{
//   uint z=rotl(seed_z,blockDim.x*blockIdx.x+threadIdx.x);
//   uint w=rotr(seed_w,blockDim.x*blockIdx.x+threadIdx.x);

//   uint jsr=rotl(seed_z,blockDim.x*blockIdx.x+threadIdx.x);
//   uint jcong=rotr(seed_w,blockDim.x*blockIdx.x+threadIdx.x);

   uint z=seed_z;
   uint w=seed_w;

   uint jsr=seed_z;
   uint jcong=seed_w;

   for (ulong i=0;i<iterations;i++) {

      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
   }

   __syncthreads();

}
"""

KERNEL_CODE_OPENCL="""
// Marsaglia RNG, very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

#define MAX 4294967296

uint rotl(uint value, int shift) {
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
}

uint rotr(uint value, int shift) {
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
}

__kernel void MainLoopGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const float id=(float)get_global_id(0);
   __private const float size=(float)get_global_size(0);
   __private const float block=space/size;

   __private uint z=seed_z;
   __private uint w=seed_w;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Standard version does not work for several processes (some are lost!)
      //__private uint position=(uint)(((ulong)space*(ulong)MWC)/(ulong)MAX);
      // Dense version
      __private uint position=(uint)( (ulong)space*((ulong)CONG+MAX*(ulong)id)/(ulong)size/(ulong)MAX );
      // Float version seems to be the best...
      //__private uint position=(uint)( block*(CONGfp+id) );
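
      // Note: with CONG in [0,MAX), the dense version keeps work-item id
      // within (roughly) its own slice [id*space/size,(id+1)*space/size)
      // of s[], limiting write contention between work-items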

      s[position]++;
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=rotl(seed_z,get_local_id(0));
   uint w=rotr(seed_w,get_local_id(0));

   uint jsr=rotl(seed_z,get_local_id(0));
   uint jcong=rotr(seed_w,get_local_id(0));

   for (ulong i=0;i<iterations;i++) {

      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=(uint)1;
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
}

__kernel void MainLoopHybrid(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
   uint w=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));

   uint jsr=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
   uint jcong=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));

   for (ulong i=0;i<iterations;i++) {

      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;

      barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
   }

}
"""

def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):

  # With PyCUDA autoinit, nothing else to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MySplutter=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)

  iterationsNew=iterationsCL*jobs

  for i in range(steps):

    Splutter=numpy.zeros(1024).astype(numpy.uint32)

    print Splutter

    SplutterCU = cuda.InOut(Splutter)

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      MetropolisJobsCU(SplutterCU,
                       numpy.uint32(len(Splutter)),
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()

    elapsed = start.time_till(stop)*1e-3
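    # Event.time_till() returns milliseconds, hence the 1e-3 factor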

    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter)

    MyDuration[i]=elapsed

    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    #MyPi[i]=numpy.median(AllPi)
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):

  # Initialize the variables with the proper casts

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default device selection based on ALU type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected: ",device.name
          HasXPU=True
          MemoryXPU=deviceMemory
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:
          XPU=device
          print "CPU selected: ",device.name
          HasXPU=True
          MemoryXPU=deviceMemory
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary device selection based on device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected: ",device.name
          HasXPU=True
          MemoryXPU=deviceMemory
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  # Create the context and the command queue for its execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the possible flags for the buffers
  mf = cl.mem_flags

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()

  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)

  iterationsNew=numpy.uint64(iterationsCL*jobs)

  print iterations,iterationsCL,iterationsNew

  MySplutter=numpy.zeros(steps)

  print 'Buffer sizes: ',2**(int)(numpy.log2(MemoryXPU/4)),(MemoryXPU/jobs/4)*jobs

  for i in range(steps):

    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)

    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)
    Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)

    print Splutter,len(Splutter)

    SplutterCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=Splutter)

    if ParaStyle=='Blocks':
      # Call the OpenCL kernel
      # (jobs,) is the global work size
      # None leaves the local work size to the implementation
      # SplutterCL is the splutter array translated into CL format
      # iterationsCL is the number of iterations per work-item
      # The last two arguments are the RNG seeds
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
      #                                      SplutterCL,
      #                                      numpy.uint32(len(Splutter)),
      #                                      numpy.uint64(iterationsCL),
      #                                      numpy.uint32(nprnd(2**30/jobs)),
      #                                      numpy.uint32(nprnd(2**30/jobs)))
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(521288629),
                                           numpy.uint32(362436069))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL, the global id must be set consistently with the local id
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, the global id must be set identical to the local id
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                          SplutterCL,
                                          numpy.uint32(len(Splutter)),
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
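    # OpenCL profiling timestamps are in nanoseconds, hence the 1e-9 factor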

    MyDuration[i]=elapsed
    print Splutter,sum(Splutter)
    MySplutter[i]=numpy.median(Splutter)
    print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)

    SplutterCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

def FitAndPrint(N,D,Curves):

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
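    # Normalization keeps the products T1*s and T1*p unchanged while
    # pinning T1 to the measured single-job duration D[0]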
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law: only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law: only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                           coeffs_Mylq[1],
                                                           coeffs_Mylq[3],
                                                           coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law: only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit for 2nd order Mylq law: only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/Work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves do not seem to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set default values

  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Device Id: numbering starts at 1; 0 selects the first device of the ALU type
  Device=0
  # GPU style can be CUDA (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads, Blocks or Hybrid
  ParaStyle='Blocks'
  # Iterations is an integer
  Iterations=100000000
  # JobStart is the first number of jobs to explore
  JobStart=1
  # JobEnd is the last number of jobs to explore
  JobEnd=16
  # JobStep is the step between explored numbers of jobs
  JobStep=1
  # Redo is the number of times the test is redone to improve metrology
  Redo=1
  # OutMetrology is the method for duration estimation: False means inside GPU
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to plot the curves
  Curves=False
  # Fit is True to fit the measurements to the performance laws
  Fit=False

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]

      print "\nInformations about devices detected under OpenCL:"
592
      # For PyOpenCL import
593
      try:
594
        import pyopencl as cl
595
        Id=1
596
        for platform in cl.get_platforms():
597
          for device in platform.get_devices():
598
            deviceType=cl.device_type.to_string(device.type)
599
            deviceMemory=device.max_mem_alloc_size
600
            print "Device #%i of type %s with memory %i : %s" % (Id,deviceType,deviceMemory,device.name)
601
            Id=Id+1
602

  
603
        print
604
        sys.exit()
605
      except ImportError:
606
        print "Your platform does not seem to support OpenCL"
607
        
608
    elif opt == '-o':
609
      OutMetrology=True
610
      Metrology='OutMetro'
611
    elif opt == '-c':
612
      Curves=True
613
    elif opt == '-f':
614
      Fit=True
615
    elif opt in ("-a", "--alu"):
616
      Alu = arg
617
    elif opt in ("-d", "--device"):
618
      Device = int(arg)
619
    elif opt in ("-g", "--gpustyle"):
620
      GpuStyle = arg
621
    elif opt in ("-p", "--parastyle"):
622
      ParaStyle = arg
623
    elif opt in ("-i", "--iterations"):
624
      Iterations = numpy.uint64(arg)
625
    elif opt in ("-s", "--jobstart"):
626
      JobStart = int(arg)
627
    elif opt in ("-e", "--jobend"):
628
      JobEnd = int(arg)
629
    elif opt in ("-t", "--jobstep"):
630
      JobStep = int(arg)
631
    elif opt in ("-r", "--redo"):
632
      Redo = int(arg)
633

  
634
  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads!" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
          if Id == Device:
            # Set Alu to the detected device type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      print "Platform does not seem to support OpenCL"

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        duration=numpy.append(duration,time.time()-start)
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print "Values seem to be wrong..."
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    if len(average)!=0:
      numpy.savez("Splutter_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Splutter_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)

Splutter/GPU/Splutter_GPU_OpenCL_Blocks_1_16_01000000_Device1_InMetro_lambda (revision 57)
1.000000000000000000e+00 1.374917984000000093e+00 1.374917984000000093e+00 0.000000000000000000e+00
2.000000000000000000e+00 6.892269120000000804e-01 6.892269120000000804e-01 0.000000000000000000e+00
3.000000000000000000e+00 4.882355200000000339e-01 4.882355200000000339e-01 0.000000000000000000e+00
4.000000000000000000e+00 3.672600000000000309e-01 3.672600000000000309e-01 0.000000000000000000e+00
5.000000000000000000e+00 3.117225279999999987e-01 3.117225279999999987e-01 0.000000000000000000e+00
6.000000000000000000e+00 2.597960960000000319e-01 2.597960960000000319e-01 0.000000000000000000e+00
7.000000000000000000e+00 2.358063360000000053e-01 2.358063360000000053e-01 0.000000000000000000e+00
8.000000000000000000e+00 2.065430400000000111e-01 2.065430400000000111e-01 0.000000000000000000e+00
9.000000000000000000e+00 1.947394560000000052e-01 1.947394560000000052e-01 0.000000000000000000e+00
1.000000000000000000e+01 1.752788480000000149e-01 1.752788480000000149e-01 0.000000000000000000e+00
1.100000000000000000e+01 1.673377280000000189e-01 1.673377280000000189e-01 0.000000000000000000e+00
1.200000000000000000e+01 1.535781760000000107e-01 1.535781760000000107e-01 0.000000000000000000e+00
1.300000000000000000e+01 1.488022720000000132e-01 1.488022720000000132e-01 0.000000000000000000e+00
1.400000000000000000e+01 1.381024319999999972e-01 1.381024319999999972e-01 0.000000000000000000e+00
1.500000000000000000e+01 1.348893440000000221e-01 1.348893440000000221e-01 0.000000000000000000e+00
1.600000000000000000e+01 1.266907840000000007e-01 1.266907840000000007e-01 0.000000000000000000e+00
Splutter/GPU/Splutter_GPU_OpenCL_Blocks_16_32_10000000_Device1_InMetro_lambda (revision 57)
1.600000000000000000e+01 1.264748384000000003e+00 1.264748384000000003e+00 0.000000000000000000e+00
1.700000000000000000e+01 1.245382976000000141e+00 1.245382976000000141e+00 0.000000000000000000e+00
1.800000000000000000e+01 1.176304256000000104e+00 1.176304256000000104e+00 0.000000000000000000e+00
1.900000000000000000e+01 1.163268160000000107e+00 1.163268160000000107e+00 0.000000000000000000e+00
2.000000000000000000e+01 1.105208096000000140e+00 1.105208096000000140e+00 0.000000000000000000e+00
2.100000000000000000e+01 1.096875103999999990e+00 1.096875103999999990e+00 0.000000000000000000e+00
2.200000000000000000e+01 1.047029184000000113e+00 1.047029184000000113e+00 0.000000000000000000e+00
2.300000000000000000e+01 1.042028736000000011e+00 1.042028736000000011e+00 0.000000000000000000e+00
2.400000000000000000e+01 9.988303040000000577e-01 9.988303040000000577e-01 0.000000000000000000e+00
2.500000000000000000e+01 9.959597120000001080e-01 9.959597120000001080e-01 0.000000000000000000e+00
2.600000000000000000e+01 9.576417280000000254e-01 9.576417280000000254e-01 0.000000000000000000e+00
2.700000000000000000e+01 9.567340160000000759e-01 9.567340160000000759e-01 0.000000000000000000e+00
2.800000000000000000e+01 9.225528320000000448e-01 9.225528320000000448e-01 0.000000000000000000e+00
2.900000000000000000e+01 9.228707520000000164e-01 9.228707520000000164e-01 0.000000000000000000e+00
3.000000000000000000e+01 8.921079360000000458e-01 8.921079360000000458e-01 0.000000000000000000e+00
3.100000000000000000e+01 8.912513600000000480e-01 8.912513600000000480e-01 0.000000000000000000e+00
3.200000000000000000e+01 8.647776320000000183e-01 8.647776320000000183e-01 0.000000000000000000e+00
