Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py @ 35

Historique | Voir | Annoter | Télécharger (19,56 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Pi-by-MC using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import matplotlib.pyplot as plt
23
import math
24
from scipy.optimize import curve_fit
25
from socket import gethostname
26

    
27
# find prime factors of a number
28
# Get for WWW :
29
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
30
def PrimeFactors(x):
31
  factorlist=numpy.array([]).astype('uint32')
32
  loop=2
33
  while loop<=x:
34
    if x%loop==0:
35
      x/=loop
36
      factorlist=numpy.append(factorlist,[loop])
37
    else:
38
      loop+=1
39
  return factorlist
40

    
41
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
42
# output is thread number
43
def BestThreadsNumber(jobs):
44
  factors=PrimeFactors(jobs)
45
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
46
  threads=1
47
  for factor in matrix.transpose().ravel():
48
    threads=threads*factor
49
    if threads*threads>jobs:
50
      break
51
  return(long(threads))
52

    
53
# Predicted Amdahl Law (Reduced with s=1-p)  
54
def AmdahlR(N, T1, p):
55
  return (T1*(1-p+p/N))
56

    
57
# Predicted Amdahl Law
58
def Amdahl(N, T1, s, p):
59
  return (T1*(s+p/N))
60

    
61
# Predicted Mylq Law with first order
62
def Mylq(N, T1,s,c,p):
63
  return (T1*(s+c*N+p/N))
64

    
65
# Predicted Mylq Law with second order
66
def Mylq2(N, T1,s,c1,c2,p):
67
  return (T1*(s+c1*N+c2*N*N+p/N))
68

    
69
KERNEL_CODE_CUDA="""
70

71
// Marsaglia RNG very simple implementation
72

73
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
74
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
75
#define MWC   (znew+wnew)
76
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
77
#define CONG  (jcong=69069*jcong+1234567)
78
#define KISS  ((MWC^CONG)+SHR3)
79

80
#define MWCfp MWC * 2.328306435454494e-10f
81
#define KISSfp KISS * 2.328306435454494e-10f
82

83
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
84
{
85
   uint z=seed_z/(blockIdx.x+1);
86
   uint w=seed_w/(blockIdx.x+1);
87

88
   ulong total=0;
89

90
   for (ulong i=0;i<iterations;i++) {
91

92
      float x=MWCfp ;
93
      float y=MWCfp ;
94

95
      // Matching test
96
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
97
      total+=inside;
98

99
   }
100

101
   s[blockIdx.x]=total;
102
   __syncthreads();
103

104
}
105

106
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
107
{
108
   uint z=seed_z/(threadIdx.x+1);
109
   uint w=seed_w/(threadIdx.x+1);
110

111
   ulong total=0;
112

113
   for (ulong i=0;i<iterations;i++) {
114

115
      float x=MWCfp ;
116
      float y=MWCfp ;
117

118
      // Matching test
119
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
120
      total+=inside;
121

122
   }
123

124
   s[threadIdx.x]=total;
125
   __syncthreads();
126

127
}
128

129
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
130
{
131
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
132
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
133

134
   ulong total=0;
135

136
   for (ulong i=0;i<iterations;i++) {
137

138
      float x=MWCfp ;
139
      float y=MWCfp ;
140

141
      // Matching test
142
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
143
      total+=inside;
144

145
   }
146

147
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
148
   __syncthreads();
149

150
}
151
"""
152

    
153
KERNEL_CODE_OPENCL="""
154

155
// Marsaglia RNG very simple implementation
156
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
157
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
158
#define MWC   (znew+wnew)
159
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
160
#define CONG  (jcong=69069*jcong+1234567)
161
#define KISS  ((MWC^CONG)+SHR3)
162

163
#define MWCfp MWC * 2.328306435454494e-10f
164
#define KISSfp KISS * 2.328306435454494e-10f
165

166
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
167
{
168
   uint z=seed_z/(get_global_id(0)+1);
169
   uint w=seed_w/(get_global_id(0)+1);
170

171
   ulong total=0;
172

173
   for (ulong i=0;i<iterations;i++) {
174

175
      float x=MWCfp ;
176
      float y=MWCfp ;
177

178
      // Matching test
179
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
180
      total+=inside;
181
   }
182
   s[get_global_id(0)]=total;
183
   barrier(CLK_GLOBAL_MEM_FENCE);
184
      
185
}
186

187
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
188
{
189
   uint z=seed_z/(get_local_id(0)+1);
190
   uint w=seed_w/(get_local_id(0)+1);
191

192
   ulong total=0;
193

194
   for (ulong i=0;i<iterations;i++) {
195

196
      float x=MWCfp ;
197
      float y=MWCfp ;
198

199
      // Matching test
200
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
201
      total+=inside;
202
   }
203
   s[get_local_id(0)]=total;
204
   barrier(CLK_LOCAL_MEM_FENCE);
205
      
206
}
207

208
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
209
{
210
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
211
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
212

213
   ulong total=0;
214

215
   for (uint i=0;i<iterations;i++) {
216

217
      float x=MWCfp ;
218
     float y=MWCfp ;
219

220
      // Matching test
221
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
222
      total+=inside;
223
   }
224
   barrier(CLK_LOCAL_MEM_FENCE);
225
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
226
      
227
}
228
"""
229

    
230
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
231

    
232
  # Avec PyCUDA autoinit, rien a faire !
233
  
234
  circleCU = cuda.InOut(circle)
235
  
236
  mod = SourceModule(KERNEL_CODE_CUDA)
237

    
238
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
239
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
240
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
241
  
242
  start = pycuda.driver.Event()
243
  stop = pycuda.driver.Event()
244
  
245
  MyPi=numpy.zeros(steps)
246
  MyDuration=numpy.zeros(steps)
247
  
248
  if iterations%jobs==0:
249
    iterationsCL=numpy.uint64(iterations/jobs)
250
    iterationsNew=iterationsCL*jobs
251
  else:
252
    iterationsCL=numpy.uint64(iterations/jobs+1)
253
    iterationsNew=iterations
254

    
255
  for i in range(steps):
256
    start.record()
257
    start.synchronize()
258
    if ParaStyle=='Blocks':
259
      MetropolisBlocksCU(circleCU,
260
                         numpy.uint64(iterationsCL),
261
                         numpy.uint32(nprnd(2**30/jobs)),
262
                         numpy.uint32(nprnd(2**30/jobs)),
263
                         grid=(jobs,1),
264
                         block=(1,1,1))
265
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
266
            (Alu,jobs,1,ParaStyle)      
267
    elif ParaStyle=='Hybrid':
268
      threads=BestThreadsNumber(jobs)
269
      MetropolisHybridCU(circleCU,
270
                          numpy.uint64(iterationsCL),
271
                          numpy.uint32(nprnd(2**30/jobs)),
272
                          numpy.uint32(nprnd(2**30/jobs)),
273
                          grid=(jobs,1),
274
                          block=(threads,1,1))
275
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
276
            (Alu,jobs/threads,threads,ParaStyle)
277
    else:
278
      MetropolisJobsCU(circleCU,
279
                          numpy.uint64(iterationsCL),
280
                          numpy.uint32(nprnd(2**30/jobs)),
281
                          numpy.uint32(nprnd(2**30/jobs)),
282
                          grid=(1,1),
283
                          block=(jobs,1,1))
284
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
285
            (Alu,jobs,1,ParaStyle)
286
    stop.record()
287
    stop.synchronize()
288
                
289
    #elapsed = stop.time_since(start)*1e-3
290
    elapsed = start.time_till(stop)*1e-3
291

    
292
    #print circle,float(numpy.sum(circle))
293
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
294
    MyDuration[i]=elapsed
295
    #print MyPi[i],MyDuration[i]
296
    #time.sleep(1)
297

    
298
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
299

    
300
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
301

    
302

    
303
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
304
        
305
  # Initialisation des variables en les CASTant correctement
306
    
307
  # Je detecte un peripherique GPU dans la liste des peripheriques
308
  # for platform in cl.get_platforms():
309
  #         for device in platform.get_devices():
310
  #                 if cl.device_type.to_string(device.type)=='GPU':
311
  #                         GPU=device
312
  #print "GPU detected: ",device.name
313

    
314
  HasGPU=False
315
  Id=1
316
  # Primary Device selection based on Device Id
317
  for platform in cl.get_platforms():
318
    for device in platform.get_devices():
319
      deviceType=cl.device_type.to_string(device.type)
320
      if Id==Device and not HasGPU:
321
        GPU=device
322
        print "CPU/GPU selected: ",device.name
323
        HasGPU=True
324
      Id=Id+1
325
  # Default Device selection based on ALU Type
326
  for platform in cl.get_platforms():
327
    for device in platform.get_devices():
328
      deviceType=cl.device_type.to_string(device.type)
329
      if deviceType=="GPU" and Alu=="GPU" and not HasGPU:
330
        GPU=device
331
        print "GPU selected: ",device.name
332
        HasGPU=True
333
      if deviceType=="CPU" and Alu=="CPU" and not HasGPU:        
334
        GPU=device
335
        print "CPU selected: ",device.name
336
        HasGPU=True
337
                                
338
  # Je cree le contexte et la queue pour son execution
339
  #ctx = cl.create_some_context()
340
  ctx = cl.Context([GPU])
341
  queue = cl.CommandQueue(ctx,
342
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
343

    
344
  # Je recupere les flag possibles pour les buffers
345
  mf = cl.mem_flags
346
        
347
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
348

    
349
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
350
    options = "-cl-mad-enable -cl-fast-relaxed-math")
351

    
352
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
353

    
354
  i=0
355

    
356
  MyPi=numpy.zeros(steps)
357
  MyDuration=numpy.zeros(steps)
358
  
359
  if iterations%jobs==0:
360
    iterationsCL=numpy.uint32(iterations/jobs)
361
    iterationsNew=iterationsCL*jobs
362
  else:
363
    iterationsCL=numpy.uint32(iterations/jobs+1)
364
    iterationsNew=iterations
365

    
366
  for i in range(steps):
367
                
368
    if ParaStyle=='Blocks':
369
      # Call OpenCL kernel
370
      # (1,) is Global work size (only 1 work size)
371
      # (1,) is local work size
372
      # circleCL is lattice translated in CL format
373
      # SeedZCL is lattice translated in CL format
374
      # SeedWCL is lattice translated in CL format
375
      # step is number of iterations
376
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
377
                                           circleCL,
378
                                           numpy.uint64(iterationsCL),
379
                                           numpy.uint32(nprnd(2**30/jobs)),
380
                                           numpy.uint32(nprnd(2**30/jobs)))
381
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
382
            (Alu,jobs,1,ParaStyle)
383
    elif ParaStyle=='Hybrid':
384
      threads=BestThreadsNumber(jobs)
385
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
386
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
387
                                          circleCL,
388
                                          numpy.uint64(iterationsCL),
389
                                          numpy.uint32(nprnd(2**30/jobs)),
390
                                          numpy.uint32(nprnd(2**30/jobs)))
391
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
392
            (Alu,jobs/threads,threads,ParaStyle)
393
    else:
394
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
395
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
396
                                          circleCL,
397
                                          numpy.uint64(iterationsCL),
398
                                          numpy.uint32(nprnd(2**30/jobs)),
399
                                          numpy.uint32(nprnd(2**30/jobs)))
400
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
401

    
402
    CLLaunch.wait()
403
    cl.enqueue_copy(queue, circle, circleCL).wait()
404

    
405
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
406

    
407
    #print circle,float(numpy.sum(circle))
408
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
409
    MyDuration[i]=elapsed
410
    #print MyPi[i],MyDuration[i]
411

    
412
  circleCL.release()
413

    
414
  #print jobs,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
415
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
416
        
417
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
418

    
419

    
420
def FitAndPrint(N,D,Curves):
421

    
422
  try:
423
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
424

    
425
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
426
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
427
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
428
    coeffs_Amdahl[0]=D[0]
429
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
430
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
431
  except:
432
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
433

    
434
  try:
435
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
436

    
437
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
438
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
439
    coeffs_AmdahlR[0]=D[0]
440
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
441
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
442

    
443
  except:
444
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
445

    
446
  try:
447
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
448

    
449
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
450
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
451
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
452
    coeffs_Mylq[0]=D[0]
453
    print "Mylq Normalized : T=%.2f(%.6f+%.6f*N+%.6f/N)" % (coeffs_Mylq[0],
454
                                                            coeffs_Mylq[1],
455
                                                            coeffs_Mylq[2],
456
                                                            coeffs_Mylq[3])
457
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
458
                coeffs_Mylq[3])
459
  except:
460
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
461

    
462
  try:
463
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
464

    
465
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
466
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
467
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
468
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
469
    coeffs_Mylq2[0]=D[0]
470
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f*N+%.6f*N^2+%.6f/N)" % \
471
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
472
         coeffs_Mylq2[4])
473

    
474
  except:
475
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
476

    
477
  if Curves:
478
    plt.xlabel("Number of Threads/work Items")
479
    plt.ylabel("Total Elapsed Time")
480

    
481
    Experience,=plt.plot(N,D,'ro') 
482
    try:
483
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
484
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
485
    except:
486
      print "Fit curves seem not to be available"
487

    
488
    plt.legend()
489
    plt.show()
490

    
491
if __name__=='__main__':
492

    
493
  # Set defaults values
494
  # Alu can be CPU or GPU
495
  Alu='CPU'
496
  # Id of GPU : 0 is for first find !
497
  Device=0
498
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
499
  GpuStyle='OpenCL'
500
  # Parallel distribution can be on Threads or Blocks
501
  ParaStyle='Blocks'
502
  # Iterations is integer
503
  Iterations=100000000
504
  # JobStart in first number of Jobs to explore
505
  JobStart=1
506
  # JobEnd is last number of Jobs to explore
507
  JobEnd=16
508
  # Redo is the times to redo the test to improve metrology
509
  Redo=1
510
  # OutMetrology is method for duration estimation : False is GPU inside
511
  OutMetrology=False
512
  # Curves is True to print the curves
513
  Curves=False
514

    
515
  try:
516
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","redo=","device="])
517
  except getopt.GetoptError:
518
    print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
519
    sys.exit(2)
520
    
521
  for opt, arg in opts:
522
    if opt == '-h':
523
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
524
      sys.exit()
525
    elif opt == '-o':
526
      OutMetrology=True
527
    elif opt == '-c':
528
      Curves=True
529
    elif opt in ("-a", "--alu"):
530
      Alu = arg
531
    elif opt in ("-d", "--device"):
532
      Device = int(arg)
533
    elif opt in ("-g", "--gpustyle"):
534
      GpuStyle = arg
535
    elif opt in ("-p", "--parastyle"):
536
      ParaStyle = arg
537
    elif opt in ("-i", "--iterations"):
538
      Iterations = numpy.uint32(arg)
539
    elif opt in ("-s", "--jobstart"):
540
      JobStart = int(arg)
541
    elif opt in ("-e", "--jobend"):
542
      JobEnd = int(arg)
543
    elif opt in ("-r", "--redo"):
544
      Redo = int(arg)
545

    
546
  if Alu=='CPU' and GpuStyle=='CUDA':
547
    print "Alu can't be CPU for CUDA, set Alu to GPU"
548
    Alu='GPU'
549

    
550
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
551
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
552
    ParaStyle='Threads'
553

    
554
  print "Compute unit : %s" % Alu
555
  print "Device Identification : %s" % Device
556
  print "GpuStyle used : %s" % GpuStyle
557
  print "Parallel Style used : %s" % ParaStyle
558
  print "Iterations : %s" % Iterations
559
  print "Number of threads on start : %s" % JobStart
560
  print "Number of threads on end : %s" % JobEnd
561
  print "Number of redo : %s" % Redo
562
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
563

    
564
  if GpuStyle=='CUDA':
565
    # For PyCUDA import
566
    import pycuda.driver as cuda
567
    import pycuda.gpuarray as gpuarray
568
    import pycuda.autoinit
569
    from pycuda.compiler import SourceModule
570

    
571
  if GpuStyle=='OpenCL':
572
    # For PyOpenCL import
573
    import pyopencl as cl
574
    Id=1
575
    for platform in cl.get_platforms():
576
      for device in platform.get_devices():
577
        deviceType=cl.device_type.to_string(device.type)
578
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
579
        Id=Id+1
580

    
581
  average=numpy.array([]).astype(numpy.float32)
582
  median=numpy.array([]).astype(numpy.float32)
583
  stddev=numpy.array([]).astype(numpy.float32)
584

    
585
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
586

    
587
  Jobs=JobStart
588

    
589
  while Jobs <= JobEnd:
590
    avg,med,std=0,0,0
591
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
592
    circle=numpy.zeros(Jobs).astype(numpy.uint64)
593

    
594
    if OutMetrology:
595
      duration=numpy.array([]).astype(numpy.float32)
596
      for i in range(Redo):
597
        start=time.time()
598
        if GpuStyle=='CUDA':
599
          try:
600
            MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
601
          except:
602
            print "Problem with %i // computations on Cuda" % Jobs
603
        elif GpuStyle=='OpenCL':
604
          try:
605
            MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device)
606
          except:
607
            print "Problem with %i // computations on OpenCL" % Jobs            
608
        duration=numpy.append(duration,time.time()-start)
609
      avg=numpy.mean(duration)
610
      med=numpy.median(duration)
611
      std=numpy.std(duration)
612
    else:
613
      if GpuStyle=='CUDA':
614
        try:
615
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
616
        except:
617
          print "Problem with %i // computations on Cuda" % Jobs
618
      elif GpuStyle=='OpenCL':
619
        # try:
620
        #   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
621
        # except:
622
        #   print "Problem with %i // computations on OpenCL" % Jobs            
623
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
624

    
625
    if (avg,med,std) != (0,0,0):
626
      print "jobs,avg,med,std",Jobs,avg,med,std
627
      average=numpy.append(average,avg)
628
      median=numpy.append(median,med)
629
      stddev=numpy.append(stddev,std)
630
    else:
631
      print "Values seem to be wrong..."
632
    #THREADS*=2
633
    numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,gethostname()),(ExploredJobs,average,median,stddev))
634
    Jobs+=1
635

    
636
  FitAndPrint(ExploredJobs,median,Curves)
637