Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py-20130214 @ 7

Historique | Voir | Annoter | Télécharger (18,28 ko)

1
#!/usr/bin/env python
2

    
3
#
4
# Pi-by-MC using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
7
#
8
# Thanks to Andreas Klockner for PyCUDA:
9
# http://mathema.tician.de/software/pycuda
10
# 
11

    
12
# 2013-01-01 : problems with launch timeout
13
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
14
# Option "Interactive" "0" in /etc/X11/xorg.conf
15

    
16
# Common tools
17
import numpy
18
from numpy.random import randint as nprnd
19
import sys
20
import getopt
21
import time
22
import matplotlib.pyplot as plt
23
import math
24
from scipy.optimize import curve_fit
25
from socket import gethostname
26

    
27
# Predicted Amdahl Law (Reduced with s=1-p)  
28
def AmdahlR(N, T1, p):
29
  return (T1*(1-p+p/N))
30

    
31
# Predicted Amdahl Law
32
def Amdahl(N, T1, s, p):
33
  return (T1*(s+p/N))
34

    
35
# Predicted Mylq Law with first order
36
def Mylq(N, T1,s,c,p):
37
  return (T1*(s+c*N+p/N))
38

    
39
# Predicted Mylq Law with second order
40
def Mylq2(N, T1,s,c1,c2,p):
41
  return (T1*(s+c1*N+c2*N*N+p/N))
42

    
43
KERNEL_CODE_CUDA="""
44

    
45
// Marsaglia RNG very simple implementation
46

    
47
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
48
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
49
#define MWC   (znew+wnew)
50
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
51
#define CONG  (jcong=69069*jcong+1234567)
52
#define KISS  ((MWC^CONG)+SHR3)
53

    
54
#define MWCfp MWC * 2.328306435454494e-10f
55
#define KISSfp KISS * 2.328306435454494e-10f
56

    
57
__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
58
{
59
   uint z=seed_z/(blockIdx.x+1);
60
   uint w=seed_w/(blockIdx.x+1);
61

    
62
   int total=0;
63

    
64
   for (uint i=0;i<iterations;i++) {
65

    
66
      float x=MWCfp ;
67
      float y=MWCfp ;
68

    
69
      // Matching test
70
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
71
      total+=inside;
72

    
73
   }
74

    
75
   s[blockIdx.x]=total;
76
   __syncthreads();
77

    
78
}
79

    
80
__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
81
{
82
   uint z=seed_z/(threadIdx.x+1);
83
   uint w=seed_w/(threadIdx.x+1);
84

    
85
   int total=0;
86

    
87
   for (uint i=0;i<iterations;i++) {
88

    
89
      float x=MWCfp ;
90
      float y=MWCfp ;
91

    
92
      // Matching test
93
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
94
      total+=inside;
95

    
96
   }
97

    
98
   s[threadIdx.x]=total;
99
   __syncthreads();
100

    
101
}
102

    
103
__global__ void MainLoopHybrid(uint *s,uint iterations,uint seed_w,uint seed_z)
104
{
105
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
106
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
107

    
108
   int total=0;
109

    
110
   for (uint i=0;i<iterations;i++) {
111

    
112
      float x=MWCfp ;
113
      float y=MWCfp ;
114

    
115
      // Matching test
116
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
117
      total+=inside;
118

    
119
   }
120

    
121
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
122
   __syncthreads();
123

    
124
}
125
"""
126

    
127
KERNEL_CODE_OPENCL="""
128

    
129
// Marsaglia RNG very simple implementation
130
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
131
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
132
#define MWC   (znew+wnew)
133
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
134
#define CONG  (jcong=69069*jcong+1234567)
135
#define KISS  ((MWC^CONG)+SHR3)
136

    
137
#define MWCfp MWC * 2.328306435454494e-10f
138
#define KISSfp KISS * 2.328306435454494e-10f
139

    
140
__kernel void MainLoopGlobal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
141
{
142
   uint z=seed_z/(get_global_id(0)+1);
143
   uint w=seed_w/(get_global_id(0)+1);
144

    
145
   int total=0;
146

    
147
   for (uint i=0;i<iterations;i++) {
148

    
149
      float x=MWCfp ;
150
      float y=MWCfp ;
151

    
152
      // Matching test
153
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
154
      total+=inside;
155
   }
156
   s[get_global_id(0)]=total;
157
   barrier(CLK_GLOBAL_MEM_FENCE);
158
      
159
}
160

    
161
__kernel void MainLoopLocal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
162
{
163
   uint z=seed_z/(get_local_id(0)+1);
164
   uint w=seed_w/(get_local_id(0)+1);
165

    
166
   int total=0;
167

    
168
   for (uint i=0;i<iterations;i++) {
169

    
170
      float x=MWCfp ;
171
      float y=MWCfp ;
172

    
173
      // Matching test
174
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
175
      total+=inside;
176
   }
177
   s[get_local_id(0)]=total;
178
   barrier(CLK_LOCAL_MEM_FENCE);
179
      
180
}
181

    
182
__kernel void MainLoopHybrid(__global uint *s,uint iterations,uint seed_w,uint seed_z)
183
{
184
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
185
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
186
   
187
   // uint jsr=123456789;
188
   // uint jcong=380116160;
189

    
190
   int total=0;
191

    
192
   for (uint i=0;i<iterations;i++) {
193

    
194
      float x=MWCfp ;
195
     float y=MWCfp ;
196

    
197
      // Matching test
198
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
199
      total+=inside;
200
   }
201
   barrier(CLK_LOCAL_MEM_FENCE);
202
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;
203
      
204
}
205
"""
206

    
207
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
208

    
209
  # Avec PyCUDA autoinit, rien a faire !
210
  
211
  circleCU = cuda.InOut(circle)
212
  
213
  mod = SourceModule(KERNEL_CODE_CUDA)
214

    
215
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
216
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
217
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
218
  
219
  start = pycuda.driver.Event()
220
  stop = pycuda.driver.Event()
221
  
222
  MyPi=numpy.zeros(steps)
223
  MyDuration=numpy.zeros(steps)
224
  
225
  if iterations%jobs==0:
226
    iterationsCL=numpy.uint32(iterations/jobs+1)
227
    iterationsNew=iterationsCL*jobs
228
  else:
229
    iterationsCL=numpy.uint32(iterations/jobs)
230
    iterationsNew=iterations
231

    
232
  for i in range(steps):
233
    start.record()
234
    start.synchronize()
235
    if ParaStyle=='Blocks':
236
      MetropolisBlocksCU(circleCU,
237
                         numpy.uint32(iterationsCL),
238
                         numpy.uint32(nprnd(2**32/jobs)),
239
                         numpy.uint32(nprnd(2**32/jobs)),
240
                         grid=(jobs,1),
241
                         block=(1,1,1))
242
      print "GPU with %i %s done" % (jobs,ParaStyle)
243
    elif ParaStyle=='Hybrid':
244
      blocks=jobs/int(math.sqrt(float(jobs)))
245
      MetropolisHybridCU(circleCU,
246
                          numpy.uint32(iterationsCL),
247
                          numpy.uint32(nprnd(2**32/jobs)),
248
                          numpy.uint32(nprnd(2**32/jobs)),
249
                          grid=(blocks,1),
250
                          block=(jobs/blocks,1,1))
251
      print "GPU with (blocks,jobs)=(%i,%i) %s done" % (blocks,jobs/blocks,ParaStyle)
252
    else:
253
      MetropolisJobsCU(circleCU,
254
                          numpy.uint32(iterationsCL),
255
                          numpy.uint32(nprnd(2**32/jobs)),
256
                          numpy.uint32(nprnd(2**32/jobs)),
257
                          grid=(1,1),
258
                          block=(jobs,1,1))
259
      print "GPU with %i %s done" % (jobs,ParaStyle)
260
    stop.record()
261
    stop.synchronize()
262
                
263
    #elapsed = stop.time_since(start)*1e-3
264
    elapsed = start.time_till(stop)*1e-3
265

    
266
    #print circle,float(numpy.sum(circle))
267
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
268
    MyDuration[i]=elapsed
269
    #print MyPi[i],MyDuration[i]
270
    #time.sleep(1)
271

    
272
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
273

    
274
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
275

    
276

    
277
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
278
	
279
  # Initialisation des variables en les CASTant correctement
280
    
281
  # Je detecte un peripherique GPU dans la liste des peripheriques
282
  # for platform in cl.get_platforms():
283
  # 	for device in platform.get_devices():
284
  # 		if cl.device_type.to_string(device.type)=='GPU':
285
  # 			GPU=device
286
  #print "GPU detected: ",device.name
287

    
288
  HasGPU=False
289
  Id=1
290
  # Device selection based on choice (default is GPU)
291
  for platform in cl.get_platforms():
292
    for device in platform.get_devices():
293
      if not HasGPU:
294
        deviceType=cl.device_type.to_string(device.type)
295
        if deviceType=="GPU" and Alu=="GPU" and Id==Device:
296
          GPU=device
297
          print "GPU selected: ",device.name
298
          HasGPU=True
299
        if deviceType=="CPU" and Alu=="CPU":	    
300
          GPU=device
301
          print "CPU selected: ",device.name
302
          HasGPU=True
303
      Id=Id+1
304
				
305
  # Je cree le contexte et la queue pour son execution
306
  #ctx = cl.create_some_context()
307
  ctx = cl.Context([GPU])
308
  queue = cl.CommandQueue(ctx,
309
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
310

    
311
  # Je recupere les flag possibles pour les buffers
312
  mf = cl.mem_flags
313
	
314
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
315

    
316
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
317
    options = "-cl-mad-enable -cl-fast-relaxed-math")
318

    
319
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
320

    
321
  i=0
322

    
323
  MyPi=numpy.zeros(steps)
324
  MyDuration=numpy.zeros(steps)
325
  
326
  if iterations%jobs==0:
327
    iterationsCL=numpy.uint32(iterations/jobs+1)
328
    iterationsNew=iterationsCL*jobs
329
  else:
330
    iterationsCL=numpy.uint32(iterations/jobs)
331
    iterationsNew=iterations
332

    
333
  blocks=int(math.sqrt(jobs))
334

    
335
  for i in range(steps):
336
		
337
    if ParaStyle=='Blocks':
338
      # Call OpenCL kernel
339
      # (1,) is Global work size (only 1 work size)
340
      # (1,) is local work size
341
      # circleCL is lattice translated in CL format
342
      # SeedZCL is lattice translated in CL format
343
      # SeedWCL is lattice translated in CL format
344
      # step is number of iterations
345
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
346
                                           circleCL,
347
                                           numpy.uint32(iterationsCL),
348
                                           numpy.uint32(nprnd(2**32/jobs)),
349
                                           numpy.uint32(nprnd(2**32/jobs)))
350
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
351
    elif ParaStyle=='Hybrid':
352
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
353
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(blocks*blocks,),(blocks,),
354
                                          circleCL,
355
                                          numpy.uint32(iterationsCL),
356
                                          numpy.uint32(nprnd(2**32/jobs)),
357
                                          numpy.uint32(nprnd(2**32/jobs)))
358
      print "%s with (Blocks,Threads)=(%i,%i) %s done" % (Alu,blocks,blocks,ParaStyle)
359
    else:
360
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
361
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
362
                                          circleCL,
363
                                          numpy.uint32(iterationsCL),
364
                                          numpy.uint32(nprnd(2**32/jobs)),
365
                                          numpy.uint32(nprnd(2**32/jobs)))
366
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
367

    
368
    CLLaunch.wait()
369
    cl.enqueue_copy(queue, circle, circleCL).wait()
370

    
371
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
372

    
373
    #print circle,float(numpy.sum(circle))
374
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
375
    MyDuration[i]=elapsed
376
    #print MyPi[i],MyDuration[i]
377

    
378
  circleCL.release()
379

    
380
  #print jobs,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
381
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
382
	
383
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
384

    
385

    
386
def FitAndPrint(N,D,Curves):
387

    
388
  try:
389
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
390

    
391
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
392
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
393
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
394
    coeffs_Amdahl[0]=D[0]
395
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
396
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
397
  except:
398
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
399

    
400
  try:
401
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
402

    
403
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
404
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
405
    coeffs_AmdahlR[0]=D[0]
406
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
407
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
408

    
409
  except:
410
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
411

    
412
  try:
413
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
414

    
415
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
416
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
417
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
418
    coeffs_Mylq[0]=D[0]
419
    print "Mylq Normalized : T=%.2f(%.6f+%.6f*N+%.6f/N)" % (coeffs_Mylq[0],
420
                                                            coeffs_Mylq[1],
421
                                                            coeffs_Mylq[2],
422
                                                            coeffs_Mylq[3])
423
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
424
                coeffs_Mylq[3])
425
  except:
426
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
427

    
428
  try:
429
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
430

    
431
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
432
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
433
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
434
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
435
    coeffs_Mylq2[0]=D[0]
436
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f*N+%.6f*N^2+%.6f/N)" % \
437
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
438
         coeffs_Mylq2[4])
439

    
440
  except:
441
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
442

    
443
  if Curves:
444
    plt.xlabel("Number of Threads/work Items")
445
    plt.ylabel("Total Elapsed Time")
446

    
447
    Experience,=plt.plot(N,D,'ro') 
448
    try:
449
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
450
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
451
    except:
452
      print "Fit curves seem not to be available"
453

    
454
    plt.legend()
455
    plt.show()
456

    
457
if __name__=='__main__':
458

    
459
  # Set defaults values
460
  # Alu can be CPU or GPU
461
  Alu='CPU'
462
  # Id of GPU
463
  Device=1
464
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
465
  GpuStyle='OpenCL'
466
  # Parallel distribution can be on Threads or Blocks
467
  ParaStyle='Threads'
468
  # Iterations is integer
469
  Iterations=1000000
470
  # JobStart in first number of Jobs to explore
471
  JobStart=1
472
  # JobEnd is last number of Jobs to explore
473
  JobEnd=512
474
  # Redo is the times to redo the test to improve metrology
475
  Redo=1
476
  # OutMetrology is method for duration estimation : False is GPU inside
477
  OutMetrology=False
478
  # Curves is True to print the curves
479
  Curves=False
480

    
481
  try:
482
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","redo=","device="])
483
  except getopt.GetoptError:
484
    print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
485
    sys.exit(2)
486
    
487
  for opt, arg in opts:
488
    if opt == '-h':
489
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
490
      sys.exit()
491
    elif opt == '-o':
492
      OutMetrology=True
493
    elif opt == '-c':
494
      Curves=True
495
    elif opt in ("-a", "--alu"):
496
      Alu = arg
497
    elif opt in ("-d", "--device"):
498
      Device = int(arg)
499
    elif opt in ("-g", "--gpustyle"):
500
      GpuStyle = arg
501
    elif opt in ("-p", "--parastyle"):
502
      ParaStyle = arg
503
    elif opt in ("-i", "--iterations"):
504
      Iterations = numpy.uint32(arg)
505
    elif opt in ("-s", "--jobstart"):
506
      JobStart = int(arg)
507
    elif opt in ("-e", "--jobend"):
508
      JobEnd = int(arg)
509
    elif opt in ("-r", "--redo"):
510
      Redo = int(arg)
511

    
512
  if Alu=='CPU' and GpuStyle=='CUDA':
513
    print "Alu can't be CPU for CUDA, set Alu to GPU"
514
    Alu='GPU'
515

    
516
  if ParaStyle not in ('Blocks','Threads','Hybrid'):
517
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
518
    ParaStyle='Threads'
519

    
520
  print "Compute unit : %s" % Alu
521
  print "Device Identification : %s" % Device
522
  print "GpuStyle used : %s" % GpuStyle
523
  print "Parallel Style used : %s" % ParaStyle
524
  print "Iterations : %s" % Iterations
525
  print "Number of threads on start : %s" % JobStart
526
  print "Number of threads on end : %s" % JobEnd
527
  print "Number of redo : %s" % Redo
528
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
529

    
530
  if GpuStyle=='CUDA':
531
    # For PyCUDA import
532
    import pycuda.driver as cuda
533
    import pycuda.gpuarray as gpuarray
534
    import pycuda.autoinit
535
    from pycuda.compiler import SourceModule
536

    
537
  if GpuStyle=='OpenCL':
538
    # For PyOpenCL import
539
    import pyopencl as cl
540
    Id=1
541
    for platform in cl.get_platforms():
542
      for device in platform.get_devices():
543
        deviceType=cl.device_type.to_string(device.type)
544
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
545
        Id=Id+1
546

    
547
  average=numpy.array([]).astype(numpy.float32)
548
  median=numpy.array([]).astype(numpy.float32)
549
  stddev=numpy.array([]).astype(numpy.float32)
550

    
551
  ExploredJobs=numpy.array([]).astype(numpy.uint32)
552

    
553
  Jobs=JobStart
554

    
555
  while Jobs <= JobEnd:
556
    avg,med,std=0,0,0
557
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
558
    circle=numpy.zeros(Jobs).astype(numpy.uint32)
559

    
560
    if OutMetrology:
561
      duration=numpy.array([]).astype(numpy.float32)
562
      for i in range(Redo):
563
        start=time.time()
564
        if GpuStyle=='CUDA':
565
          try:
566
            MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
567
          except:
568
            print "Problem with %i // computations on Cuda" % Jobs
569
        elif GpuStyle=='OpenCL':
570
          try:
571
            MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device)
572
          except:
573
            print "Problem with %i // computations on OpenCL" % Jobs            
574
        duration=numpy.append(duration,time.time()-start)
575
      avg=numpy.mean(duration)
576
      med=numpy.median(duration)
577
      std=numpy.std(duration)
578
    else:
579
      if GpuStyle=='CUDA':
580
        try:
581
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
582
        except:
583
          print "Problem with %i // computations on Cuda" % Jobs
584
      elif GpuStyle=='OpenCL':
585
        try:
586
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
587
        except:
588
          print "Problem with %i // computations on OpenCL" % Jobs            
589

    
590
    if (avg,med,std) != (0,0,0):
591
      print "avg,med,std",avg,med,std
592
      average=numpy.append(average,avg)
593
      median=numpy.append(median,med)
594
      stddev=numpy.append(stddev,std)
595
    else:
596
      print "Values seem to be wrong..."
597
    #THREADS*=2
598
    numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,gethostname()),(ExploredJobs,average,median,stddev))
599
    Jobs+=1
600

    
601
  FitAndPrint(ExploredJobs,median,Curves)
602