Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py @ 49

Historique | Voir | Annoter | Télécharger (20,59 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Pi-by-MC using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import matplotlib.pyplot as plt
23
import math
24
from scipy.optimize import curve_fit
25
from socket import gethostname
26

    
27
# find prime factors of a number
28
# Get for WWW :
29
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
30
def PrimeFactors(x):
31
  factorlist=numpy.array([]).astype('uint32')
32
  loop=2
33
  while loop<=x:
34
    if x%loop==0:
35
      x/=loop
36
      factorlist=numpy.append(factorlist,[loop])
37
    else:
38
      loop+=1
39
  return factorlist
40

    
41
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
42
# output is thread number
43
def BestThreadsNumber(jobs):
44
  factors=PrimeFactors(jobs)
45
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
46
  threads=1
47
  for factor in matrix.transpose().ravel():
48
    threads=threads*factor
49
    if threads*threads>jobs:
50
      break
51
  return(long(threads))
52

    
53
# Predicted Amdahl Law (Reduced with s=1-p)  
54
def AmdahlR(N, T1, p):
55
  return (T1*(1-p+p/N))
56

    
57
# Predicted Amdahl Law
58
def Amdahl(N, T1, s, p):
59
  return (T1*(s+p/N))
60

    
61
# Predicted Mylq Law with first order
62
def Mylq(N, T1,s,c,p):
63
  return (T1*(s+p/N)+c*N)
64

    
65
# Predicted Mylq Law with second order
66
def Mylq2(N, T1,s,c1,c2,p):
67
  return (T1*(s+p/N)+c1*N+c2*N*N)
68

    
69
KERNEL_CODE_CUDA="""
70

71
// Marsaglia RNG very simple implementation
72

73
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
74
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
75
#define MWC   (znew+wnew)
76
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
77
#define CONG  (jcong=69069*jcong+1234567)
78
#define KISS  ((MWC^CONG)+SHR3)
79

80
#define MWCfp MWC * 2.328306435454494e-10f
81
#define KISSfp KISS * 2.328306435454494e-10f
82

83
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
84
{
85
   uint z=seed_z/(blockIdx.x+1);
86
   uint w=seed_w/(blockIdx.x+1);
87

88
   ulong total=0;
89

90
   for (ulong i=0;i<iterations;i++) {
91

92
      float x=MWCfp ;
93
      float y=MWCfp ;
94

95
      // Matching test
96
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
97
      total+=inside;
98

99
   }
100

101
   s[blockIdx.x]=total;
102
   __syncthreads();
103

104
}
105

106
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
107
{
108
   uint z=seed_z/(threadIdx.x+1);
109
   uint w=seed_w/(threadIdx.x+1);
110

111
   ulong total=0;
112

113
   for (ulong i=0;i<iterations;i++) {
114

115
      float x=MWCfp ;
116
      float y=MWCfp ;
117

118
      // Matching test
119
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
120
      total+=inside;
121

122
   }
123

124
   s[threadIdx.x]=total;
125
   __syncthreads();
126

127
}
128

129
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
130
{
131
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
132
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
133

134
   ulong total=0;
135

136
   for (ulong i=0;i<iterations;i++) {
137

138
      float x=MWCfp ;
139
      float y=MWCfp ;
140

141
      // Matching test
142
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
143
      total+=inside;
144

145
   }
146

147
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
148
   __syncthreads();
149

150
}
151
"""
152

    
153
KERNEL_CODE_OPENCL="""
154

155
// Marsaglia RNG very simple implementation
156
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
157
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
158
#define MWC   (znew+wnew)
159
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
160
#define CONG  (jcong=69069*jcong+1234567)
161
#define KISS  ((MWC^CONG)+SHR3)
162

163
#define MWCfp MWC * 2.328306435454494e-10f
164
#define KISSfp KISS * 2.328306435454494e-10f
165

166
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
167
{
168
   uint z=seed_z/(get_global_id(0)+1);
169
   uint w=seed_w/(get_global_id(0)+1);
170

171
   ulong total=0;
172

173
   for (ulong i=0;i<iterations;i++) {
174

175
      float x=MWCfp ;
176
      float y=MWCfp ;
177

178
      // Matching test
179
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
180
      total+=inside;
181
   }
182
   s[get_global_id(0)]=total;
183
   barrier(CLK_GLOBAL_MEM_FENCE);
184
      
185
}
186

187
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
188
{
189
   uint z=seed_z/(get_local_id(0)+1);
190
   uint w=seed_w/(get_local_id(0)+1);
191

192
   ulong total=0;
193

194
   for (ulong i=0;i<iterations;i++) {
195

196
      float x=MWCfp ;
197
      float y=MWCfp ;
198

199
      // Matching test
200
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
201
      total+=inside;
202
   }
203
   s[get_local_id(0)]=total;
204
   barrier(CLK_LOCAL_MEM_FENCE);
205
      
206
}
207

208
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
209
{
210
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
211
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
212

213
   ulong total=0;
214

215
   for (uint i=0;i<iterations;i++) {
216

217
      float x=MWCfp ;
218
     float y=MWCfp ;
219

220
      // Matching test
221
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
222
      total+=inside;
223
   }
224
   barrier(CLK_LOCAL_MEM_FENCE);
225
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
226
      
227
}
228
"""
229

    
230
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
231

    
232
  # Avec PyCUDA autoinit, rien a faire !
233
  
234
  circleCU = cuda.InOut(circle)
235
  
236
  mod = SourceModule(KERNEL_CODE_CUDA)
237

    
238
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
239
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
240
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
241
  
242
  start = pycuda.driver.Event()
243
  stop = pycuda.driver.Event()
244
  
245
  MyPi=numpy.zeros(steps)
246
  MyDuration=numpy.zeros(steps)
247
  
248
  if iterations%jobs==0:
249
    iterationsCL=numpy.uint64(iterations/jobs)
250
    iterationsNew=iterationsCL*jobs
251
  else:
252
    iterationsCL=numpy.uint64(iterations/jobs+1)
253
    iterationsNew=iterations
254

    
255
  for i in range(steps):
256
    start.record()
257
    start.synchronize()
258
    if ParaStyle=='Blocks':
259
      MetropolisBlocksCU(circleCU,
260
                         numpy.uint64(iterationsCL),
261
                         numpy.uint32(nprnd(2**30/jobs)),
262
                         numpy.uint32(nprnd(2**30/jobs)),
263
                         grid=(jobs,1),
264
                         block=(1,1,1))
265
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
266
            (Alu,jobs,1,ParaStyle)      
267
    elif ParaStyle=='Hybrid':
268
      threads=BestThreadsNumber(jobs)
269
      MetropolisHybridCU(circleCU,
270
                          numpy.uint64(iterationsCL),
271
                          numpy.uint32(nprnd(2**30/jobs)),
272
                          numpy.uint32(nprnd(2**30/jobs)),
273
                          grid=(jobs,1),
274
                          block=(threads,1,1))
275
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
276
            (Alu,jobs/threads,threads,ParaStyle)
277
    else:
278
      MetropolisJobsCU(circleCU,
279
                          numpy.uint64(iterationsCL),
280
                          numpy.uint32(nprnd(2**30/jobs)),
281
                          numpy.uint32(nprnd(2**30/jobs)),
282
                          grid=(1,1),
283
                          block=(jobs,1,1))
284
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
285
            (Alu,jobs,1,ParaStyle)
286
    stop.record()
287
    stop.synchronize()
288
                
289
    #elapsed = stop.time_since(start)*1e-3
290
    elapsed = start.time_till(stop)*1e-3
291

    
292
    #print circle,float(numpy.sum(circle))
293
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
294
    MyDuration[i]=elapsed
295
    #print MyPi[i],MyDuration[i]
296
    #time.sleep(1)
297

    
298
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
299

    
300
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
301

    
302

    
303
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
304
        
305
  # Initialisation des variables en les CASTant correctement
306
    
307
  if Device==0:
308
    print "Enter XPU selector based on ALU type: first selected"
309
    HasXPU=False
310
    # Default Device selection based on ALU Type
311
    for platform in cl.get_platforms():
312
      for device in platform.get_devices():
313
        deviceType=cl.device_type.to_string(device.type)
314
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
315
          XPU=device
316
          print "GPU selected: ",device.name
317
          HasXPU=True
318
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
319
          XPU=device
320
          print "CPU selected: ",device.name
321
          HasXPU=True
322
  else:
323
    print "Enter XPU selector based on device number & ALU type"
324
    Id=1
325
    HasXPU=False
326
    # Primary Device selection based on Device Id
327
    for platform in cl.get_platforms():
328
      for device in platform.get_devices():
329
        deviceType=cl.device_type.to_string(device.type)
330
        if Id==Device and Alu==deviceType and HasXPU==False:
331
          XPU=device
332
          print "CPU/GPU selected: ",device.name
333
          HasXPU=True
334
        Id=Id+1
335
    if HasXPU==False:
336
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
337
          (Device,Alu,Id-1)
338
      return(0,0,0)
339
                                
340
  # Je cree le contexte et la queue pour son execution
341
  #ctx = cl.create_some_context()
342
  ctx = cl.Context([XPU])
343
  queue = cl.CommandQueue(ctx,
344
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
345

    
346
  # Je recupere les flag possibles pour les buffers
347
  mf = cl.mem_flags
348
        
349
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
350

    
351
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
352
    options = "-cl-mad-enable -cl-fast-relaxed-math")
353

    
354
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
355

    
356
  i=0
357

    
358
  MyPi=numpy.zeros(steps)
359
  MyDuration=numpy.zeros(steps)
360
  
361
  if iterations%jobs==0:
362
    iterationsCL=numpy.uint64(iterations/jobs)
363
    iterationsNew=numpy.uint64(iterationsCL*jobs)
364
  else:
365
    iterationsCL=numpy.uint64(iterations/jobs+1)
366
    iterationsNew=numpy.uint64(iterations)
367

    
368
  for i in range(steps):
369
                
370
    if ParaStyle=='Blocks':
371
      # Call OpenCL kernel
372
      # (1,) is Global work size (only 1 work size)
373
      # (1,) is local work size
374
      # circleCL is lattice translated in CL format
375
      # SeedZCL is lattice translated in CL format
376
      # SeedWCL is lattice translated in CL format
377
      # step is number of iterations
378
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
379
                                           circleCL,
380
                                           numpy.uint64(iterationsCL),
381
                                           numpy.uint32(nprnd(2**30/jobs)),
382
                                           numpy.uint32(nprnd(2**30/jobs)))
383
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
384
            (Alu,jobs,1,ParaStyle)
385
    elif ParaStyle=='Hybrid':
386
      threads=BestThreadsNumber(jobs)
387
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
388
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
389
                                          circleCL,
390
                                          numpy.uint64(iterationsCL),
391
                                          numpy.uint32(nprnd(2**30/jobs)),
392
                                          numpy.uint32(nprnd(2**30/jobs)))
393
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
394
            (Alu,jobs/threads,threads,ParaStyle)
395
    else:
396
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
397
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
398
                                          circleCL,
399
                                          numpy.uint64(iterationsCL),
400
                                          numpy.uint32(nprnd(2**30/jobs)),
401
                                          numpy.uint32(nprnd(2**30/jobs)))
402
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
403

    
404
    CLLaunch.wait()
405
    cl.enqueue_copy(queue, circle, circleCL).wait()
406

    
407
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
408

    
409
    MyDuration[i]=elapsed
410
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
411
    MyPi[i]=numpy.median(AllPi)
412
    print MyPi[i],numpy.std(AllPi),MyDuration[i]
413

    
414
  circleCL.release()
415

    
416
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
417
        
418
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
419

    
420

    
421
def FitAndPrint(N,D,Curves):
422

    
423
  try:
424
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
425

    
426
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
427
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
428
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
429
    coeffs_Amdahl[0]=D[0]
430
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
431
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
432
  except:
433
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
434

    
435
  try:
436
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
437

    
438
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
439
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
440
    coeffs_AmdahlR[0]=D[0]
441
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
442
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
443

    
444
  except:
445
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
446

    
447
  try:
448
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
449

    
450
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
451
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
452
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
453
    coeffs_Mylq[0]=D[0]
454
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
455
                                                            coeffs_Mylq[1],
456
                                                            coeffs_Mylq[3],
457
                                                            coeffs_Mylq[2])
458
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
459
                coeffs_Mylq[3])
460
  except:
461
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
462

    
463
  try:
464
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
465

    
466
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
467
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
468
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
469
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
470
    coeffs_Mylq2[0]=D[0]
471
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
472
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
473
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])
474

    
475
  except:
476
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
477

    
478
  if Curves:
479
    plt.xlabel("Number of Threads/work Items")
480
    plt.ylabel("Total Elapsed Time")
481

    
482
    Experience,=plt.plot(N,D,'ro') 
483
    try:
484
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
485
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
486
    except:
487
      print "Fit curves seem not to be available"
488

    
489
    plt.legend()
490
    plt.show()
491

    
492
if __name__=='__main__':
493

    
494
  # Set defaults values
495
  # Alu can be CPU or GPU
496
  Alu='CPU'
497
  # Id of GPU : 1 is for first find !
498
  Device=0
499
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
500
  GpuStyle='OpenCL'
501
  # Parallel distribution can be on Threads or Blocks
502
  ParaStyle='Blocks'
503
  # Iterations is integer
504
  Iterations=100000000
505
  # JobStart in first number of Jobs to explore
506
  JobStart=1
507
  # JobEnd is last number of Jobs to explore
508
  JobEnd=16
509
  # JobStep is the step of Jobs to explore
510
  JobStep=1
511
  # Redo is the times to redo the test to improve metrology
512
  Redo=1
513
  # OutMetrology is method for duration estimation : False is GPU inside
514
  OutMetrology=False
515
  Metrology='InMetro'
516
  # Curves is True to print the curves
517
  Curves=False
518

    
519
  try:
520
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
521
  except getopt.GetoptError:
522
    print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
523
    sys.exit(2)
524
    
525
  for opt, arg in opts:
526
    if opt == '-h':
527
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
528

    
529
      print "\nInformations about devices detected under OpenCL:"
530
      # For PyOpenCL import
531
      import pyopencl as cl
532
      Id=1
533
      for platform in cl.get_platforms():
534
        for device in platform.get_devices():
535
          deviceType=cl.device_type.to_string(device.type)
536
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
537
          Id=Id+1
538

    
539
      sys.exit()
540
    elif opt == '-o':
541
      OutMetrology=True
542
      Metrology='OutMetro'
543
    elif opt == '-c':
544
      Curves=True
545
    elif opt in ("-a", "--alu"):
546
      Alu = arg
547
    elif opt in ("-d", "--device"):
548
      Device = int(arg)
549
    elif opt in ("-g", "--gpustyle"):
550
      GpuStyle = arg
551
    elif opt in ("-p", "--parastyle"):
552
      ParaStyle = arg
553
    elif opt in ("-i", "--iterations"):
554
      Iterations = numpy.uint64(arg)
555
    elif opt in ("-s", "--jobstart"):
556
      JobStart = int(arg)
557
    elif opt in ("-e", "--jobend"):
558
      JobEnd = int(arg)
559
    elif opt in ("-t", "--jobstep"):
560
      JobStep = int(arg)
561
    elif opt in ("-r", "--redo"):
562
      Redo = int(arg)
563

    
564
  if Alu=='CPU' and GpuStyle=='CUDA':
565
    print "Alu can't be CPU for CUDA, set Alu to GPU"
566
    Alu='GPU'
567

    
568
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
569
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
570
    ParaStyle='Threads'
571

    
572
  print "Compute unit : %s" % Alu
573
  print "Device Identification : %s" % Device
574
  print "GpuStyle used : %s" % GpuStyle
575
  print "Parallel Style used : %s" % ParaStyle
576
  print "Iterations : %s" % Iterations
577
  print "Number of threads on start : %s" % JobStart
578
  print "Number of threads on end : %s" % JobEnd
579
  print "Number of redo : %s" % Redo
580
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
581

    
582
  if GpuStyle=='CUDA':
583
    # For PyCUDA import
584
    import pycuda.driver as cuda
585
    import pycuda.gpuarray as gpuarray
586
    import pycuda.autoinit
587
    from pycuda.compiler import SourceModule
588

    
589
  if GpuStyle=='OpenCL':
590
    # For PyOpenCL import
591
    import pyopencl as cl
592
    Id=1
593
    for platform in cl.get_platforms():
594
      for device in platform.get_devices():
595
        deviceType=cl.device_type.to_string(device.type)
596
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
597
        Id=Id+1
598

    
599
  average=numpy.array([]).astype(numpy.float32)
600
  median=numpy.array([]).astype(numpy.float32)
601
  stddev=numpy.array([]).astype(numpy.float32)
602

    
603
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
604

    
605
  Jobs=JobStart
606

    
607
  while Jobs <= JobEnd:
608
    avg,med,std=0,0,0
609
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
610
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
611

    
612
    if OutMetrology:
613
      duration=numpy.array([]).astype(numpy.float32)
614
      for i in range(Redo):
615
        start=time.time()
616
        if GpuStyle=='CUDA':
617
          try:
618
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
619
          except:
620
            print "Problem with %i // computations on Cuda" % Jobs
621
        elif GpuStyle=='OpenCL':
622
          try:
623
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device)
624
          except:
625
            print "Problem with %i // computations on OpenCL" % Jobs            
626
        duration=numpy.append(duration,time.time()-start)
627
      if (a,m,s) != (0,0,0):
628
        avg=numpy.mean(duration)
629
        med=numpy.median(duration)
630
        std=numpy.std(duration)
631
      else:
632
        print "Values seem to be wrong..."
633
    else:
634
      if GpuStyle=='CUDA':
635
        try:
636
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
637
        except:
638
          print "Problem with %i // computations on Cuda" % Jobs
639
      elif GpuStyle=='OpenCL':
640
        # try:
641
        #   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
642
        # except:
643
        #   print "Problem with %i // computations on OpenCL" % Jobs            
644
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
645

    
646
    if (avg,med,std) != (0,0,0):
647
      print "jobs,avg,med,std",Jobs,avg,med,std
648
      average=numpy.append(average,avg)
649
      median=numpy.append(median,med)
650
      stddev=numpy.append(stddev,std)
651
    else:
652
      print "Values seem to be wrong..."
653
    #THREADS*=2
654
    if len(average)!=0:
655
      numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
656
      numpy.savetxt("Pi_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
657
    Jobs+=JobStep
658

    
659
  FitAndPrint(ExploredJobs,median,Curves)
660