Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py @ 48

Historique | Voir | Annoter | Télécharger (20,64 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Pi-by-MC using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import matplotlib.pyplot as plt
23
import math
24
from scipy.optimize import curve_fit
25
from socket import gethostname
26

    
27
# find prime factors of a number
28
# Get for WWW :
29
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
30
def PrimeFactors(x):
31
  factorlist=numpy.array([]).astype('uint32')
32
  loop=2
33
  while loop<=x:
34
    if x%loop==0:
35
      x/=loop
36
      factorlist=numpy.append(factorlist,[loop])
37
    else:
38
      loop+=1
39
  return factorlist
40

    
41
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
42
# output is thread number
43
def BestThreadsNumber(jobs):
44
  factors=PrimeFactors(jobs)
45
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
46
  threads=1
47
  for factor in matrix.transpose().ravel():
48
    threads=threads*factor
49
    if threads*threads>jobs:
50
      break
51
  return(long(threads))
52

    
53
# Predicted Amdahl Law (Reduced with s=1-p)  
54
def AmdahlR(N, T1, p):
55
  return (T1*(1-p+p/N))
56

    
57
# Predicted Amdahl Law
58
def Amdahl(N, T1, s, p):
59
  return (T1*(s+p/N))
60

    
61
# Predicted Mylq Law with first order
62
def Mylq(N, T1,s,c,p):
63
  return (T1*(s+p/N)+c*N)
64

    
65
# Predicted Mylq Law with second order
66
def Mylq2(N, T1,s,c1,c2,p):
67
  return (T1*(s+p/N)+c1*N+c2*N*N)
68

    
69
KERNEL_CODE_CUDA="""
70

71
// Marsaglia RNG very simple implementation
72

73
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
74
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
75
#define MWC   (znew+wnew)
76
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
77
#define CONG  (jcong=69069*jcong+1234567)
78
#define KISS  ((MWC^CONG)+SHR3)
79

80
#define MWCfp MWC * 2.328306435454494e-10f
81
#define KISSfp KISS * 2.328306435454494e-10f
82

83
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
84
{
85
   uint z=seed_z/(blockIdx.x+1);
86
   uint w=seed_w/(blockIdx.x+1);
87

88
   ulong total=0;
89

90
   for (ulong i=0;i<iterations;i++) {
91

92
      float x=MWCfp ;
93
      float y=MWCfp ;
94

95
      // Matching test
96
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
97
      total+=inside;
98

99
   }
100

101
   s[blockIdx.x]=total;
102
   __syncthreads();
103

104
}
105

106
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
107
{
108
   uint z=seed_z/(threadIdx.x+1);
109
   uint w=seed_w/(threadIdx.x+1);
110

111
   ulong total=0;
112

113
   for (ulong i=0;i<iterations;i++) {
114

115
      float x=MWCfp ;
116
      float y=MWCfp ;
117

118
      // Matching test
119
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
120
      total+=inside;
121

122
   }
123

124
   s[threadIdx.x]=total;
125
   __syncthreads();
126

127
}
128

129
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
130
{
131
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
132
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
133

134
   ulong total=0;
135

136
   for (ulong i=0;i<iterations;i++) {
137

138
      float x=MWCfp ;
139
      float y=MWCfp ;
140

141
      // Matching test
142
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
143
      total+=inside;
144

145
   }
146

147
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
148
   __syncthreads();
149

150
}
151
"""
152

    
153
KERNEL_CODE_OPENCL="""
154

155
// Marsaglia RNG very simple implementation
156
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
157
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
158
#define MWC   (znew+wnew)
159
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
160
#define CONG  (jcong=69069*jcong+1234567)
161
#define KISS  ((MWC^CONG)+SHR3)
162

163
#define MWCfp MWC * 2.328306435454494e-10f
164
#define KISSfp KISS * 2.328306435454494e-10f
165

166
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
167
{
168
   uint z=seed_z/(get_global_id(0)+1);
169
   uint w=seed_w/(get_global_id(0)+1);
170

171
   ulong total=0;
172

173
   for (ulong i=0;i<iterations;i++) {
174

175
      float x=MWCfp ;
176
      float y=MWCfp ;
177

178
      // Matching test
179
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
180
      total+=inside;
181
   }
182
   s[get_global_id(0)]=total;
183
   barrier(CLK_GLOBAL_MEM_FENCE);
184
      
185
}
186

187
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
188
{
189
   uint z=seed_z/(get_local_id(0)+1);
190
   uint w=seed_w/(get_local_id(0)+1);
191

192
   ulong total=0;
193

194
   for (ulong i=0;i<iterations;i++) {
195

196
      float x=MWCfp ;
197
      float y=MWCfp ;
198

199
      // Matching test
200
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
201
      total+=inside;
202
   }
203
   s[get_local_id(0)]=total;
204
   barrier(CLK_LOCAL_MEM_FENCE);
205
      
206
}
207

208
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
209
{
210
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
211
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
212

213
   ulong total=0;
214

215
   for (uint i=0;i<iterations;i++) {
216

217
      float x=MWCfp ;
218
     float y=MWCfp ;
219

220
      // Matching test
221
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
222
      total+=inside;
223
   }
224
   barrier(CLK_LOCAL_MEM_FENCE);
225
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
226
      
227
}
228
"""
229

    
230
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
231

    
232
  # Avec PyCUDA autoinit, rien a faire !
233
  
234
  circleCU = cuda.InOut(circle)
235
  
236
  mod = SourceModule(KERNEL_CODE_CUDA)
237

    
238
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
239
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
240
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
241
  
242
  start = pycuda.driver.Event()
243
  stop = pycuda.driver.Event()
244
  
245
  MyPi=numpy.zeros(steps)
246
  MyDuration=numpy.zeros(steps)
247
  
248
  if iterations%jobs==0:
249
    iterationsCL=numpy.uint64(iterations/jobs)
250
    iterationsNew=iterationsCL*jobs
251
  else:
252
    iterationsCL=numpy.uint64(iterations/jobs+1)
253
    iterationsNew=iterations
254

    
255
  for i in range(steps):
256
    start.record()
257
    start.synchronize()
258
    if ParaStyle=='Blocks':
259
      MetropolisBlocksCU(circleCU,
260
                         numpy.uint64(iterationsCL),
261
                         numpy.uint32(nprnd(2**30/jobs)),
262
                         numpy.uint32(nprnd(2**30/jobs)),
263
                         grid=(jobs,1),
264
                         block=(1,1,1))
265
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
266
            (Alu,jobs,1,ParaStyle)      
267
    elif ParaStyle=='Hybrid':
268
      threads=BestThreadsNumber(jobs)
269
      MetropolisHybridCU(circleCU,
270
                          numpy.uint64(iterationsCL),
271
                          numpy.uint32(nprnd(2**30/jobs)),
272
                          numpy.uint32(nprnd(2**30/jobs)),
273
                          grid=(jobs,1),
274
                          block=(threads,1,1))
275
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
276
            (Alu,jobs/threads,threads,ParaStyle)
277
    else:
278
      MetropolisJobsCU(circleCU,
279
                          numpy.uint64(iterationsCL),
280
                          numpy.uint32(nprnd(2**30/jobs)),
281
                          numpy.uint32(nprnd(2**30/jobs)),
282
                          grid=(1,1),
283
                          block=(jobs,1,1))
284
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
285
            (Alu,jobs,1,ParaStyle)
286
    stop.record()
287
    stop.synchronize()
288
                
289
    #elapsed = stop.time_since(start)*1e-3
290
    elapsed = start.time_till(stop)*1e-3
291

    
292
    #print circle,float(numpy.sum(circle))
293
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
294
    MyDuration[i]=elapsed
295
    #print MyPi[i],MyDuration[i]
296
    #time.sleep(1)
297

    
298
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
299

    
300
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
301

    
302

    
303
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
304
        
305
  # Initialisation des variables en les CASTant correctement
306
    
307
  if Device==0:
308
    print "Enter XPU selector based on ALU type: first selected"
309
    HasXPU=False
310
    # Default Device selection based on ALU Type
311
    for platform in cl.get_platforms():
312
      for device in platform.get_devices():
313
        deviceType=cl.device_type.to_string(device.type)
314
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
315
          XPU=device
316
          print "GPU selected: ",device.name
317
          HasXPU=True
318
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
319
          XPU=device
320
          print "CPU selected: ",device.name
321
          HasXPU=True
322
  else:
323
    print "Enter XPU selector based on device number & ALU type"
324
    Id=1
325
    HasXPU=False
326
    # Primary Device selection based on Device Id
327
    for platform in cl.get_platforms():
328
      for device in platform.get_devices():
329
        deviceType=cl.device_type.to_string(device.type)
330
        if Id==Device and Alu==deviceType and HasXPU==False:
331
          XPU=device
332
          print "CPU/GPU selected: ",device.name
333
          HasXPU=True
334
        Id=Id+1
335
    if HasXPU==False:
336
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
337
          (Device,Alu,Id-1)
338
      return(0,0,0)
339
                                
340
  # Je cree le contexte et la queue pour son execution
341
  #ctx = cl.create_some_context()
342
  ctx = cl.Context([XPU])
343
  queue = cl.CommandQueue(ctx,
344
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
345

    
346
  # Je recupere les flag possibles pour les buffers
347
  mf = cl.mem_flags
348
        
349
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
350

    
351
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
352
    options = "-cl-mad-enable -cl-fast-relaxed-math")
353

    
354
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
355

    
356
  i=0
357

    
358
  MyPi=numpy.zeros(steps)
359
  MyDuration=numpy.zeros(steps)
360
  
361
  if iterations%jobs==0:
362
    iterationsCL=numpy.uint64(iterations/jobs)
363
    iterationsNew=numpy.uint64(iterationsCL*jobs)
364
  else:
365
    iterationsCL=numpy.uint64(iterations/jobs+1)
366
    iterationsNew=numpy.uint64(iterations)
367

    
368
  for i in range(steps):
369
                
370
    if ParaStyle=='Blocks':
371
      # Call OpenCL kernel
372
      # (1,) is Global work size (only 1 work size)
373
      # (1,) is local work size
374
      # circleCL is lattice translated in CL format
375
      # SeedZCL is lattice translated in CL format
376
      # SeedWCL is lattice translated in CL format
377
      # step is number of iterations
378
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
379
                                           circleCL,
380
                                           numpy.uint64(iterationsCL),
381
                                           numpy.uint32(nprnd(2**30/jobs)),
382
                                           numpy.uint32(nprnd(2**30/jobs)))
383
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
384
            (Alu,jobs,1,ParaStyle)
385
    elif ParaStyle=='Hybrid':
386
      threads=BestThreadsNumber(jobs)
387
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
388
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
389
                                          circleCL,
390
                                          numpy.uint64(iterationsCL),
391
                                          numpy.uint32(nprnd(2**30/jobs)),
392
                                          numpy.uint32(nprnd(2**30/jobs)))
393
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
394
            (Alu,jobs/threads,threads,ParaStyle)
395
    else:
396
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
397
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
398
                                          circleCL,
399
                                          numpy.uint64(iterationsCL),
400
                                          numpy.uint32(nprnd(2**30/jobs)),
401
                                          numpy.uint32(nprnd(2**30/jobs)))
402
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
403

    
404
    CLLaunch.wait()
405
    cl.enqueue_copy(queue, circle, circleCL).wait()
406

    
407
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
408

    
409
    #print circle,float(numpy.sum(circle))
410
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
411
    MyDuration[i]=elapsed
412
    print MyPi[i],MyDuration[i]
413

    
414
  circleCL.release()
415

    
416
  #print jobs,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
417
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
418
        
419
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
420

    
421

    
422
def FitAndPrint(N,D,Curves):
423

    
424
  try:
425
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
426

    
427
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
428
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
429
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
430
    coeffs_Amdahl[0]=D[0]
431
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
432
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
433
  except:
434
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
435

    
436
  try:
437
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
438

    
439
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
440
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
441
    coeffs_AmdahlR[0]=D[0]
442
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
443
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
444

    
445
  except:
446
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
447

    
448
  try:
449
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
450

    
451
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
452
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
453
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
454
    coeffs_Mylq[0]=D[0]
455
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
456
                                                            coeffs_Mylq[1],
457
                                                            coeffs_Mylq[3],
458
                                                            coeffs_Mylq[2])
459
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
460
                coeffs_Mylq[3])
461
  except:
462
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
463

    
464
  try:
465
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
466

    
467
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
468
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
469
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
470
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
471
    coeffs_Mylq2[0]=D[0]
472
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
473
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
474
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
475

    
476
  except:
477
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
478

    
479
  if Curves:
480
    plt.xlabel("Number of Threads/work Items")
481
    plt.ylabel("Total Elapsed Time")
482

    
483
    Experience,=plt.plot(N,D,'ro') 
484
    try:
485
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
486
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
487
    except:
488
      print "Fit curves seem not to be available"
489

    
490
    plt.legend()
491
    plt.show()
492

    
493
if __name__=='__main__':
494

    
495
  # Set defaults values
496
  # Alu can be CPU or GPU
497
  Alu='CPU'
498
  # Id of GPU : 1 is for first find !
499
  Device=0
500
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
501
  GpuStyle='OpenCL'
502
  # Parallel distribution can be on Threads or Blocks
503
  ParaStyle='Blocks'
504
  # Iterations is integer
505
  Iterations=100000000
506
  # JobStart in first number of Jobs to explore
507
  JobStart=1
508
  # JobEnd is last number of Jobs to explore
509
  JobEnd=16
510
  # JobStep is the step of Jobs to explore
511
  JobStep=1
512
  # Redo is the times to redo the test to improve metrology
513
  Redo=1
514
  # OutMetrology is method for duration estimation : False is GPU inside
515
  OutMetrology=False
516
  Metrology='InMetro'
517
  # Curves is True to print the curves
518
  Curves=False
519

    
520
  try:
521
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
522
  except getopt.GetoptError:
523
    print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
524
    sys.exit(2)
525
    
526
  for opt, arg in opts:
527
    if opt == '-h':
528
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
529

    
530
      print "\nInformations about devices detected under OpenCL:"
531
      # For PyOpenCL import
532
      import pyopencl as cl
533
      Id=1
534
      for platform in cl.get_platforms():
535
        for device in platform.get_devices():
536
          deviceType=cl.device_type.to_string(device.type)
537
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
538
          Id=Id+1
539

    
540
      sys.exit()
541
    elif opt == '-o':
542
      OutMetrology=True
543
      Metrology='OutMetro'
544
    elif opt == '-c':
545
      Curves=True
546
    elif opt in ("-a", "--alu"):
547
      Alu = arg
548
    elif opt in ("-d", "--device"):
549
      Device = int(arg)
550
    elif opt in ("-g", "--gpustyle"):
551
      GpuStyle = arg
552
    elif opt in ("-p", "--parastyle"):
553
      ParaStyle = arg
554
    elif opt in ("-i", "--iterations"):
555
      Iterations = numpy.uint64(arg)
556
    elif opt in ("-s", "--jobstart"):
557
      JobStart = int(arg)
558
    elif opt in ("-e", "--jobend"):
559
      JobEnd = int(arg)
560
    elif opt in ("-t", "--jobstep"):
561
      JobStep = int(arg)
562
    elif opt in ("-r", "--redo"):
563
      Redo = int(arg)
564

    
565
  if Alu=='CPU' and GpuStyle=='CUDA':
566
    print "Alu can't be CPU for CUDA, set Alu to GPU"
567
    Alu='GPU'
568

    
569
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
570
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
571
    ParaStyle='Threads'
572

    
573
  print "Compute unit : %s" % Alu
574
  print "Device Identification : %s" % Device
575
  print "GpuStyle used : %s" % GpuStyle
576
  print "Parallel Style used : %s" % ParaStyle
577
  print "Iterations : %s" % Iterations
578
  print "Number of threads on start : %s" % JobStart
579
  print "Number of threads on end : %s" % JobEnd
580
  print "Number of redo : %s" % Redo
581
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
582

    
583
  if GpuStyle=='CUDA':
584
    # For PyCUDA import
585
    import pycuda.driver as cuda
586
    import pycuda.gpuarray as gpuarray
587
    import pycuda.autoinit
588
    from pycuda.compiler import SourceModule
589

    
590
  if GpuStyle=='OpenCL':
591
    # For PyOpenCL import
592
    import pyopencl as cl
593
    Id=1
594
    for platform in cl.get_platforms():
595
      for device in platform.get_devices():
596
        deviceType=cl.device_type.to_string(device.type)
597
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
598
        Id=Id+1
599

    
600
  average=numpy.array([]).astype(numpy.float32)
601
  median=numpy.array([]).astype(numpy.float32)
602
  stddev=numpy.array([]).astype(numpy.float32)
603

    
604
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
605

    
606
  Jobs=JobStart
607

    
608
  while Jobs <= JobEnd:
609
    avg,med,std=0,0,0
610
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
611
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
612

    
613
    if OutMetrology:
614
      duration=numpy.array([]).astype(numpy.float32)
615
      for i in range(Redo):
616
        start=time.time()
617
        if GpuStyle=='CUDA':
618
          try:
619
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
620
          except:
621
            print "Problem with %i // computations on Cuda" % Jobs
622
        elif GpuStyle=='OpenCL':
623
          try:
624
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device)
625
          except:
626
            print "Problem with %i // computations on OpenCL" % Jobs            
627
        duration=numpy.append(duration,time.time()-start)
628
      if (a,m,s) != (0,0,0):
629
        avg=numpy.mean(duration)
630
        med=numpy.median(duration)
631
        std=numpy.std(duration)
632
      else:
633
        print "Values seem to be wrong..."
634
    else:
635
      if GpuStyle=='CUDA':
636
        try:
637
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
638
        except:
639
          print "Problem with %i // computations on Cuda" % Jobs
640
      elif GpuStyle=='OpenCL':
641
        # try:
642
        #   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
643
        # except:
644
        #   print "Problem with %i // computations on OpenCL" % Jobs            
645
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
646

    
647
    if (avg,med,std) != (0,0,0):
648
      print "jobs,avg,med,std",Jobs,avg,med,std
649
      average=numpy.append(average,avg)
650
      median=numpy.append(median,med)
651
      stddev=numpy.append(stddev,std)
652
    else:
653
      print "Values seem to be wrong..."
654
    #THREADS*=2
655
    if len(average)!=0:
656
      numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
657
      numpy.savetxt("Pi_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
658
    Jobs+=JobStep
659

    
660
  FitAndPrint(ExploredJobs,median,Curves)
661