#!/usr/bin/env python

#
# Pi-by-MC using PyCUDA
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
#

# 2013-01-01 : problems with launch timeout
# nvidia-smi -c 1 : ko
# nvidia-smi -c 3 : ko
# nvidia-smi --gom=1 : not supported
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import matplotlib.pyplot as plt
import math
from scipy.optimize import curve_fit
from socket import gethostname

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+c*N+p/N))

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+c1*N+c2*N*N+p/N))
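
# Worked example (illustrative values, not from a measurement): with
# T1=1.0, s=0.1 and p=0.9, Amdahl(8,1.0,0.1,0.9) = 0.1+0.9/8 = 0.2125,
# so the speedup T1/T(N) approaches, but never exceeds, 1/s = 10.
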
KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}
"""

KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__kernel void MainLoopGlobal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   // Store the hit count of this work item at its global index
   s[get_global_id(0)]=total;

}
"""

def MetropolisCuda(circle,iterations,steps,threads,ParaStyle):

  # With PyCUDA autoinit, there is nothing to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisThreadsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  # Split iterations between the parallel instances; when the division
  # is not exact, round up so at least `iterations` points are drawn
  if iterations%threads==0:
    iterationsCL=numpy.uint32(iterations/threads)
    iterationsNew=iterations
  else:
    iterationsCL=numpy.uint32(iterations/threads+1)
    iterationsNew=iterationsCL*threads

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(circleCU,
                         numpy.uint32(iterationsCL),
                         numpy.uint32(nprnd(2**32/threads)),
                         numpy.uint32(nprnd(2**32/threads)),
                         grid=(threads,1),
                         block=(1,1,1))
      print "GPU with %i %s done" % (threads,ParaStyle)
    elif ParaStyle=='Hybrid':
      # Note: blocks*(threads/blocks) may be less than threads when the
      # split is not exact
      blocks=int(math.sqrt(float(threads)))
      MetropolisHybridCU(circleCU,
                          numpy.uint32(iterationsCL),
                          numpy.uint32(nprnd(2**32/threads)),
                          numpy.uint32(nprnd(2**32/threads)),
                          grid=(blocks,1),
                          block=(threads/blocks,1,1))
      print "GPU with (blocks,threads)=(%i,%i) %s done" % (blocks,threads/blocks,ParaStyle)
    else:
      MetropolisThreadsCU(circleCU,
                          numpy.uint32(iterationsCL),
                          numpy.uint32(nprnd(2**32/threads)),
                          numpy.uint32(nprnd(2**32/threads)),
                          grid=(1,1),
                          block=(threads,1,1))
      print "GPU with %i %s done" % (threads,ParaStyle)
    stop.record()
    stop.synchronize()

    #elapsed = stop.time_since(start)*1e-3
    elapsed = start.time_till(stop)*1e-3

    #print circle,float(numpy.sum(circle))
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]
    #time.sleep(1)

  print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

def MetropolisOpenCL(circle,iterations,steps,threads,ParaStyle,Alu):

  # Initialize the variables with the proper casts

  # Detect a GPU device in the device list
  # for platform in cl.get_platforms():
  # 	for device in platform.get_devices():
  # 		if cl.device_type.to_string(device.type)=='GPU':
  # 			GPU=device
  #print "GPU detected: ",device.name

  HasGPU=False
  # Device selection based on choice (default is GPU)
  for platform in cl.get_platforms():
    for device in platform.get_devices():
      if not HasGPU:
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU":
          GPU=device
          print "GPU selected: ",device.name
          HasGPU=True
        if deviceType=="CPU" and Alu=="CPU":
          GPU=device
          print "CPU selected: ",device.name
          HasGPU=True

  # Create the context and the queue for its execution
  #ctx = cl.create_some_context()
  ctx = cl.Context([GPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the possible flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math")

  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  # Split iterations between the work items; when the division is not
  # exact, round up so at least `iterations` points are drawn
  if iterations%threads==0:
    iterationsCL=numpy.uint32(iterations/threads)
    iterationsNew=iterations
  else:
    iterationsCL=numpy.uint32(iterations/threads+1)
    iterationsNew=iterationsCL*threads

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call the OpenCL kernel:
      # (threads,) is the global work size
      # None lets the implementation choose the local work size
      # circleCL is the lattice translated into CL format
      # iterationsCL is the number of iterations per work item
      # the two random integers seed the MWC generator
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(threads,),None,
                                           circleCL,
                                           numpy.uint32(iterationsCL),
                                           numpy.uint32(nprnd(2**32/threads)),
                                           numpy.uint32(nprnd(2**32/threads)))
      print "%s with %i %s done" % (Alu,threads,ParaStyle)
    elif ParaStyle=='Hybrid':
      # In OpenCL, the local work size must divide the global work size
      blocks=int(math.sqrt(threads))
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(threads,),(blocks,),
                                          circleCL,
                                          numpy.uint32(iterationsCL),
                                          numpy.uint32(nprnd(2**32/threads)),
                                          numpy.uint32(nprnd(2**32/threads)))
      print "%s with (blocks,threads)=(%i,%i) %s done" % (Alu,blocks,threads/blocks,ParaStyle)
    else:
      # In OpenCL, set the local work size equal to the global work size
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(threads,),(threads,),
                                          circleCL,
                                          numpy.uint32(iterationsCL),
                                          numpy.uint32(nprnd(2**32/threads)),
                                          numpy.uint32(nprnd(2**32/threads)))
      print "%s with %i %s done" % (Alu,threads,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    #print circle,float(numpy.sum(circle))
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]

  circleCL.release()

  #print threads,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
  print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
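
# Example call (sketch with illustrative values): 10**7 points spread
# over 16 work items, redone 3 times; "circle" receives one hit count
# per work item.
#   circle=numpy.zeros(16).astype(numpy.uint32)
#   avg,med,std=MetropolisOpenCL(circle,10000000,3,16,'Threads','GPU')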

def FitAndPrint(N,D,Curves):

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.5f+%.5f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit Amdahl law: only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.5f+%.5f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit Reduced Amdahl law: only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized: T=%.2f(%.5f+%.5f*N+%.5f/N)" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[2],
                                                            coeffs_Mylq[3])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit Mylq law: only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.5f+%.5f*N+%.5f*N^2+%.5f/N)" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
         coeffs_Mylq2[4])

  except:
    print "Impossible to fit 2nd order Mylq law: only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/Work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves do not seem to be available"

    plt.legend()
    plt.show()
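
# Example (sketch): exercising the fit on synthetic Amdahl timings for
# 16 thread counts, with assumed T1=1.0, s=0.1, p=0.9.
#   N=numpy.arange(1,17).astype(numpy.float32)
#   FitAndPrint(N,Amdahl(N,1.0,0.1,0.9),False)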

if __name__=='__main__':

  # Set default values
  # Alu can be CPU or GPU
  Alu='CPU'
  # GPU style can be CUDA (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Threads'
  # Iterations is the total number of points to draw
  Iterations=1000000000
  # ThreadStart is the first number of Threads to explore
  ThreadStart=1
  # ThreadEnd is the last number of Threads to explore
  ThreadEnd=512
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology selects the duration estimation: False means timing inside the GPU
  OutMetrology=False
  # Curves is True to plot the curves
  Curves=False

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:",["alu=","gpustyle=","parastyle=","iterations=","threadstart=","threadend=","redo="])
  except getopt.GetoptError:
    print '%s -o -c -a <CPU/GPU> -g <CUDA/OpenCL> -p <Threads/Blocks> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -g <CUDA/OpenCL> -p <Threads/Blocks> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
      sys.exit()
    elif opt == '-o':
      OutMetrology=True
    elif opt == '-c':
      Curves=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint32(arg)
    elif opt in ("-s", "--threadstart"):
      ThreadStart = int(arg)
    elif opt in ("-e", "--threadend"):
      ThreadEnd = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)
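
  # Typical invocation (illustrative example, matching the options above):
  #   python Pi-GPU.py -g OpenCL -a GPU -p Threads -i 100000000 -s 1 -e 256 -r 5 -c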

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads!" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % ThreadStart
  print "Number of threads on end : %s" % ThreadEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    # For PyCUDA import
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

  if GpuStyle=='OpenCL':
    # For PyOpenCL import
    import pyopencl as cl

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredThreads=numpy.array([]).astype(numpy.uint32)

  Threads=ThreadStart

  while Threads <= ThreadEnd:
    ExploredThreads=numpy.append(ExploredThreads,Threads)
    circle=numpy.zeros(Threads).astype(numpy.uint32)

    if OutMetrology:
      # Time the whole call from the host, including transfers
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            MetropolisCuda(circle,Iterations,1,Threads,ParaStyle)
          except:
            print "Problem with %i // computations on CUDA" % Threads
        elif GpuStyle=='OpenCL':
          try:
            MetropolisOpenCL(circle,Iterations,1,Threads,ParaStyle,Alu)
          except:
            print "Problem with %i // computations on OpenCL" % Threads
        duration=numpy.append(duration,time.time()-start)
      avg=numpy.mean(duration)
      med=numpy.median(duration)
      std=numpy.std(duration)
    else:
      if GpuStyle=='CUDA':
        avg,med,std=MetropolisCuda(circle,Iterations,Redo,Threads,ParaStyle)
      elif GpuStyle=='OpenCL':
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Threads,
                                     ParaStyle,Alu)
    print "avg,med,std",avg,med,std
    average=numpy.append(average,avg)
    median=numpy.append(median,med)
    stddev=numpy.append(stddev,std)
    #THREADS*=2
    # Save partial results after each size so an interrupted run keeps its data
    numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,ThreadStart,ThreadEnd,Iterations,gethostname()),(ExploredThreads,average,median,stddev))
    Threads+=1

  FitAndPrint(ExploredThreads,median,Curves)
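
# To reload a saved run later (sketch; the real file name follows the
# savez pattern above, the values below only show its shape):
#   data=numpy.load("Pi_GPU_OpenCL_Threads_1_512_1000000000_myhost.npz")
#   ExploredThreads,average,median,stddev=data['arr_0']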