Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py-20130206 @ 65

Historique | Voir | Annoter | Télécharger (16,01 ko)

1 7 equemene
#!/usr/bin/env python
2 7 equemene
#
3 7 equemene
# Pi-by-MC using PyCUDA
4 7 equemene
#
5 7 equemene
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
6 7 equemene
#
7 7 equemene
# Thanks to Andreas Klockner for PyCUDA:
8 7 equemene
# http://mathema.tician.de/software/pycuda
9 7 equemene
#
10 7 equemene
11 7 equemene
# 2013-01-01 : problems with launch timeout
12 7 equemene
# nvidia-smi -c 1 : ko
13 7 equemene
# nvidia-smi -c 3 : ko
14 7 equemene
# nvidia-smi --gom=1 : not supported
15 7 equemene
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
16 7 equemene
# Option "Interactive" "0" in /etc/X11/xorg.conf
17 7 equemene
18 7 equemene
# Common tools
19 7 equemene
import numpy
20 7 equemene
from numpy.random import randint as nprnd
21 7 equemene
import sys
22 7 equemene
import getopt
23 7 equemene
import time
24 7 equemene
import matplotlib.pyplot as plt
25 7 equemene
from scipy.optimize import curve_fit
26 7 equemene
from socket import gethostname
27 7 equemene
28 7 equemene
# Predicted Amdahl Law (Reduced with s=1-p)
29 7 equemene
def AmdahlR(N, T1, p):
30 7 equemene
  return (T1*(1-p+p/N))
31 7 equemene
32 7 equemene
# Predicted Amdahl Law
33 7 equemene
def Amdahl(N, T1, s, p):
34 7 equemene
  return (T1*(s+p/N))
35 7 equemene
36 7 equemene
# Predicted Mylq Law with first order
37 7 equemene
def Mylq(N, T1,s,c,p):
38 7 equemene
  return (T1*(s+c*N+p/N))
39 7 equemene
40 7 equemene
# Predicted Mylq Law with second order
41 7 equemene
def Mylq2(N, T1,s,c1,c2,p):
42 7 equemene
  return (T1*(s+c1*N+c2*N*N+p/N))
43 7 equemene
44 7 equemene
KERNEL_CODE_CUDA="""
45 7 equemene
46 7 equemene
// Marsaglia RNG very simple implementation
47 7 equemene
48 7 equemene
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
49 7 equemene
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
50 7 equemene
#define MWC   (znew+wnew)
51 7 equemene
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
52 7 equemene
#define CONG  (jcong=69069*jcong+1234567)
53 7 equemene
#define KISS  ((MWC^CONG)+SHR3)
54 7 equemene
55 7 equemene
//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
56 7 equemene
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
57 7 equemene
#define MWCfp MWC * 2.328306435454494e-10f
58 7 equemene
#define KISSfp KISS * 2.328306435454494e-10f
59 7 equemene
60 7 equemene
__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
61 7 equemene
{
62 7 equemene
   uint z=seed_z/(blockIdx.x+1);
63 7 equemene
   uint w=seed_w/(blockIdx.x+1);
64 7 equemene
   // uint jsr=123456789;
65 7 equemene
   // uint jcong=380116160;
66 7 equemene
67 7 equemene
   int total=0;
68 7 equemene
69 7 equemene
   for (uint i=0;i<iterations;i++) {
70 7 equemene
71 7 equemene
      float x=MWCfp ;
72 7 equemene
      float y=MWCfp ;
73 7 equemene
74 7 equemene
      // Matching test
75 7 equemene
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
76 7 equemene
      total+=inside;
77 7 equemene
78 7 equemene
   }
79 7 equemene
80 7 equemene
   s[blockIdx.x]=total;
81 7 equemene
   __syncthreads();
82 7 equemene
83 7 equemene
}
84 7 equemene
85 7 equemene
__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
86 7 equemene
{
87 7 equemene
   uint z=seed_z/(threadIdx.x+1);
88 7 equemene
   uint w=seed_w/(threadIdx.x+1);
89 7 equemene
   // uint jsr=123456789;
90 7 equemene
   // uint jcong=380116160;
91 7 equemene
92 7 equemene
   int total=0;
93 7 equemene
94 7 equemene
   for (uint i=0;i<iterations;i++) {
95 7 equemene
96 7 equemene
      float x=MWCfp ;
97 7 equemene
      float y=MWCfp ;
98 7 equemene
99 7 equemene
      // Matching test
100 7 equemene
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
101 7 equemene
      total+=inside;
102 7 equemene
103 7 equemene
   }
104 7 equemene
105 7 equemene
   s[threadIdx.x]=total;
106 7 equemene
   __syncthreads();
107 7 equemene
108 7 equemene
}
109 7 equemene
"""
110 7 equemene
111 7 equemene
KERNEL_CODE_OPENCL="""
112 7 equemene
113 7 equemene
// Marsaglia RNG very simple implementation
114 7 equemene
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
115 7 equemene
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
116 7 equemene
#define MWC   (znew+wnew)
117 7 equemene
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
118 7 equemene
#define CONG  (jcong=69069*jcong+1234567)
119 7 equemene
#define KISS  ((MWC^CONG)+SHR3)
120 7 equemene
121 7 equemene
//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
122 7 equemene
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
123 7 equemene
#define MWCfp MWC * 2.328306435454494e-10f
124 7 equemene
#define KISSfp KISS * 2.328306435454494e-10f
125 7 equemene
126 7 equemene
__kernel void MainLoopGlobal(__global int *s,uint iterations,uint seed_w,uint seed_z)
127 7 equemene
{
128 7 equemene
   uint z=seed_z/(get_global_id(0)+1);
129 7 equemene
   uint w=seed_w/(get_global_id(0)+1);
130 7 equemene
   // uint jsr=123456789;
131 7 equemene
   // uint jcong=380116160;
132 7 equemene
133 7 equemene
   int total=0;
134 7 equemene
135 7 equemene
   for (uint i=0;i<iterations;i++) {
136 7 equemene
137 7 equemene
      float x=MWCfp ;
138 7 equemene
      float y=MWCfp ;
139 7 equemene
140 7 equemene
      // Matching test
141 7 equemene
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
142 7 equemene
      total+=inside;
143 7 equemene
   }
144 7 equemene
   s[get_global_id(0)]=total;
145 7 equemene
   barrier(CLK_GLOBAL_MEM_FENCE);
146 7 equemene
147 7 equemene
}
148 7 equemene
149 7 equemene
__kernel void MainLoopLocal(__global int *s,uint iterations,uint seed_w,uint seed_z)
150 7 equemene
{
151 7 equemene
   uint z=seed_z/(get_local_id(0)+1);
152 7 equemene
   uint w=seed_w/(get_local_id(0)+1);
153 7 equemene
   // uint jsr=123456789;
154 7 equemene
   // uint jcong=380116160;
155 7 equemene
156 7 equemene
   int total=0;
157 7 equemene
158 7 equemene
   for (uint i=0;i<iterations;i++) {
159 7 equemene
160 7 equemene
      float x=MWCfp ;
161 7 equemene
      float y=MWCfp ;
162 7 equemene
163 7 equemene
      // Matching test
164 7 equemene
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
165 7 equemene
      total+=inside;
166 7 equemene
   }
167 7 equemene
   s[get_local_id(0)]=total;
168 7 equemene
   barrier(CLK_LOCAL_MEM_FENCE);
169 7 equemene
170 7 equemene
}
171 7 equemene
"""
172 7 equemene
173 7 equemene
def MetropolisCuda(circle,iterations,steps,threads,ParaStyle):
174 7 equemene
175 7 equemene
  # Avec PyCUDA autoinit, rien a faire !
176 7 equemene
177 7 equemene
  circleCU = cuda.InOut(circle)
178 7 equemene
179 7 equemene
  mod = SourceModule(KERNEL_CODE_CUDA)
180 7 equemene
181 7 equemene
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
182 7 equemene
  MetropolisThreadsCU=mod.get_function("MainLoopThreads")
183 7 equemene
184 7 equemene
  start = pycuda.driver.Event()
185 7 equemene
  stop = pycuda.driver.Event()
186 7 equemene
187 7 equemene
  MyPi=numpy.zeros(steps)
188 7 equemene
  MyDuration=numpy.zeros(steps)
189 7 equemene
190 7 equemene
  if iterations%threads==0:
191 7 equemene
    iterationsCL=numpy.uint32(iterations/threads+1)
192 7 equemene
    iterationsNew=iterationsCL*threads
193 7 equemene
  else:
194 7 equemene
    iterationsCL=numpy.uint32(iterations/threads)
195 7 equemene
    iterationsNew=iterations
196 7 equemene
197 7 equemene
  for i in range(steps):
198 7 equemene
    start.record()
199 7 equemene
    start.synchronize()
200 7 equemene
    if ParaStyle=='Blocks':
201 7 equemene
      MetropolisBlocksCU(circleCU,
202 7 equemene
                         numpy.uint32(iterationsCL),
203 7 equemene
                         numpy.uint32(nprnd(2**32/threads)),
204 7 equemene
                         numpy.uint32(nprnd(2**32/threads)),
205 7 equemene
                         grid=(threads,1),
206 7 equemene
                         block=(1,1,1))
207 7 equemene
    else:
208 7 equemene
      MetropolisThreadsCU(circleCU,
209 7 equemene
                          numpy.uint32(iterationsCL),
210 7 equemene
                          numpy.uint32(nprnd(2**32/threads)),
211 7 equemene
                          numpy.uint32(nprnd(2**32/threads)),
212 7 equemene
                          grid=(1,1),
213 7 equemene
                          block=(threads,1,1))
214 7 equemene
    print "GPU done"
215 7 equemene
    stop.record()
216 7 equemene
    stop.synchronize()
217 7 equemene
218 7 equemene
    #elapsed = stop.time_since(start)*1e-3
219 7 equemene
    elapsed = start.time_till(stop)*1e-3
220 7 equemene
221 7 equemene
    #print circle,float(numpy.sum(circle))
222 7 equemene
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
223 7 equemene
    MyDuration[i]=elapsed
224 7 equemene
    #print MyPi[i],MyDuration[i]
225 7 equemene
    #time.sleep(1)
226 7 equemene
227 7 equemene
  print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
228 7 equemene
229 7 equemene
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
230 7 equemene
231 7 equemene
232 7 equemene
def MetropolisOpenCL(circle,iterations,steps,threads,ParaStyle,Alu):
233 7 equemene
234 7 equemene
  # Initialisation des variables en les CASTant correctement
235 7 equemene
236 7 equemene
  # Je detecte un peripherique GPU dans la liste des peripheriques
237 7 equemene
  # for platform in cl.get_platforms():
238 7 equemene
  # 	for device in platform.get_devices():
239 7 equemene
  # 		if cl.device_type.to_string(device.type)=='GPU':
240 7 equemene
  # 			GPU=device
241 7 equemene
  #print "GPU detected: ",device.name
242 7 equemene
243 7 equemene
  HasGPU=False
244 7 equemene
  # Device selection based on choice (default is GPU)
245 7 equemene
  for platform in cl.get_platforms():
246 7 equemene
    for device in platform.get_devices():
247 7 equemene
      if not HasGPU:
248 7 equemene
        deviceType=cl.device_type.to_string(device.type)
249 7 equemene
        if deviceType=="GPU" and Alu=="GPU":
250 7 equemene
          GPU=device
251 7 equemene
          print "GPU selected: ",device.name
252 7 equemene
          HasGPU=True
253 7 equemene
        if deviceType=="CPU" and Alu=="CPU":
254 7 equemene
          GPU=device
255 7 equemene
          print "CPU selected: ",device.name
256 7 equemene
          HasGPU=True
257 7 equemene
258 7 equemene
  # Je cree le contexte et la queue pour son execution
259 7 equemene
  #ctx = cl.create_some_context()
260 7 equemene
  ctx = cl.Context([GPU])
261 7 equemene
  queue = cl.CommandQueue(ctx,
262 7 equemene
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
263 7 equemene
264 7 equemene
  # Je recupere les flag possibles pour les buffers
265 7 equemene
  mf = cl.mem_flags
266 7 equemene
267 7 equemene
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
268 7 equemene
269 7 equemene
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
270 7 equemene
    options = "-cl-mad-enable -cl-fast-relaxed-math")
271 7 equemene
272 7 equemene
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
273 7 equemene
274 7 equemene
  i=0
275 7 equemene
276 7 equemene
  MyPi=numpy.zeros(steps)
277 7 equemene
  MyDuration=numpy.zeros(steps)
278 7 equemene
279 7 equemene
  if iterations%threads==0:
280 7 equemene
    iterationsCL=numpy.uint32(iterations/threads+1)
281 7 equemene
    iterationsNew=iterationsCL*threads
282 7 equemene
  else:
283 7 equemene
    iterationsCL=numpy.uint32(iterations/threads)
284 7 equemene
    iterationsNew=iterations
285 7 equemene
286 7 equemene
  for i in range(steps):
287 7 equemene
288 7 equemene
    if ParaStyle=='Blocks':
289 7 equemene
      # Call OpenCL kernel
290 7 equemene
      # (1,) is Global work size (only 1 work size)
291 7 equemene
      # (1,) is local work size
292 7 equemene
      # circleCL is lattice translated in CL format
293 7 equemene
      # SeedZCL is lattice translated in CL format
294 7 equemene
      # SeedWCL is lattice translated in CL format
295 7 equemene
      # step is number of iterations
296 7 equemene
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(threads,),(1,),
297 7 equemene
                                           circleCL,
298 7 equemene
                                           numpy.uint32(iterationsCL),
299 7 equemene
                                           numpy.uint32(nprnd(2**32/threads)),
300 7 equemene
                                           numpy.uint32(nprnd(2**32/threads)))
301 7 equemene
      print "%s with %s done" % (Alu,ParaStyle)
302 7 equemene
    else:
303 7 equemene
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
304 7 equemene
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(threads,),(threads,),
305 7 equemene
                                          circleCL,
306 7 equemene
                                          numpy.uint32(iterationsCL),
307 7 equemene
                                          numpy.uint32(nprnd(2**32/threads)),
308 7 equemene
                                          numpy.uint32(nprnd(2**32/threads)))
309 7 equemene
      print "%s with %s done" % (Alu,ParaStyle)
310 7 equemene
311 7 equemene
    CLLaunch.wait()
312 7 equemene
    cl.enqueue_copy(queue, circle, circleCL).wait()
313 7 equemene
314 7 equemene
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
315 7 equemene
316 7 equemene
    #print circle,float(numpy.sum(circle))
317 7 equemene
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
318 7 equemene
    MyDuration[i]=elapsed
319 7 equemene
    #print MyPi[i],MyDuration[i]
320 7 equemene
321 7 equemene
  circleCL.release()
322 7 equemene
323 7 equemene
  #print threads,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
324 7 equemene
  print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
325 7 equemene
326 7 equemene
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
327 7 equemene
328 7 equemene
329 7 equemene
def FitAndPrint(N,D,Curves):
330 7 equemene
331 7 equemene
  try:
332 7 equemene
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
333 7 equemene
334 7 equemene
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
335 7 equemene
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
336 7 equemene
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
337 7 equemene
    coeffs_Amdahl[0]=D[0]
338 7 equemene
    print "Amdahl Normalized: T=%.2f(%.5f+%.5f/N)" % \
339 7 equemene
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
340 7 equemene
  except:
341 7 equemene
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)
342 7 equemene
343 7 equemene
  try:
344 7 equemene
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
345 7 equemene
346 7 equemene
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
347 7 equemene
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
348 7 equemene
    coeffs_AmdahlR[0]=D[0]
349 7 equemene
    print "Amdahl Reduced Normalized: T=%.2f(%.5f+%.5f/N)" % \
350 7 equemene
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
351 7 equemene
352 7 equemene
  except:
353 7 equemene
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)
354 7 equemene
355 7 equemene
  try:
356 7 equemene
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
357 7 equemene
358 7 equemene
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
359 7 equemene
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
360 7 equemene
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
361 7 equemene
    coeffs_Mylq[0]=D[0]
362 7 equemene
    print "Mylq Normalized : T=%.2f(%.5f+%.5f*N+%.5f/N)" % (coeffs_Mylq[0],
363 7 equemene
                                                            coeffs_Mylq[1],
364 7 equemene
                                                            coeffs_Mylq[2],
365 7 equemene
                                                            coeffs_Mylq[3])
366 7 equemene
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
367 7 equemene
                coeffs_Mylq[3])
368 7 equemene
  except:
369 7 equemene
    print "Impossible to fit for Mylq law : only %i elements" % len(D)
370 7 equemene
371 7 equemene
  try:
372 7 equemene
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
373 7 equemene
374 7 equemene
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
375 7 equemene
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
376 7 equemene
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
377 7 equemene
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
378 7 equemene
    coeffs_Mylq2[0]=D[0]
379 7 equemene
    print "Mylq 2nd order Normalized: T=%.2f(%.5f+%.5f*N+%.5f*N^2+%.5f/N)" % \
380 7 equemene
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
381 7 equemene
         coeffs_Mylq2[4])
382 7 equemene
383 7 equemene
  except:
384 7 equemene
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)
385 7 equemene
386 7 equemene
  if Curves:
387 7 equemene
    plt.xlabel("Number of Threads/work Items")
388 7 equemene
    plt.ylabel("Total Elapsed Time")
389 7 equemene
390 7 equemene
    Experience,=plt.plot(N,D,'ro')
391 7 equemene
    try:
392 7 equemene
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")
393 7 equemene
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
394 7 equemene
    except:
395 7 equemene
      print "Fit curves seem not to be available"
396 7 equemene
397 7 equemene
    plt.legend()
398 7 equemene
    plt.show()
399 7 equemene
400 7 equemene
if __name__=='__main__':
401 7 equemene
402 7 equemene
  # Set defaults values
403 7 equemene
  # Alu can be CPU or GPU
404 7 equemene
  Alu='CPU'
405 7 equemene
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
406 7 equemene
  GpuStyle='OpenCL'
407 7 equemene
  # Parallel distribution can be on Threads or Blocks
408 7 equemene
  ParaStyle='Threads'
409 7 equemene
  # Iterations is integer
410 7 equemene
  Iterations=1000000000
411 7 equemene
  # ThreadStart in first number of Threads to explore
412 7 equemene
  ThreadStart=1
413 7 equemene
  # ThreadEnd is last number of Threads to explore
414 7 equemene
  ThreadEnd=512
415 7 equemene
  # Redo is the times to redo the test to improve metrology
416 7 equemene
  Redo=1
417 7 equemene
  # OutMetrology is method for duration estimation : False is GPU inside
418 7 equemene
  OutMetrology=False
419 7 equemene
  # Curves is True to print the curves
420 7 equemene
  Curves=False
421 7 equemene
422 7 equemene
  try:
423 7 equemene
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:",["alu=","gpustyle=","parastyle=","iterations=","threadstart=","threadend=","redo="])
424 7 equemene
  except getopt.GetoptError:
425 7 equemene
    print '%s -o -a <CPU/GPU> -g <CUDA/OpenCL> -p <ParaStyle> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
426 7 equemene
    sys.exit(2)
427 7 equemene
428 7 equemene
  for opt, arg in opts:
429 7 equemene
    if opt == '-h':
430 7 equemene
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -g <CUDA/OpenCL> -p <Threads/Blocks> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
431 7 equemene
      sys.exit()
432 7 equemene
    elif opt == '-o':
433 7 equemene
      OutMetrology=True
434 7 equemene
    elif opt == '-c':
435 7 equemene
      Curves=True
436 7 equemene
    elif opt in ("-a", "--alu"):
437 7 equemene
      Alu = arg
438 7 equemene
    elif opt in ("-g", "--gpustyle"):
439 7 equemene
      GpuStyle = arg
440 7 equemene
    elif opt in ("-p", "--parastyle"):
441 7 equemene
      ParaStyle = arg
442 7 equemene
    elif opt in ("-i", "--iterations"):
443 7 equemene
      Iterations = numpy.uint32(arg)
444 7 equemene
    elif opt in ("-s", "--threadstart"):
445 7 equemene
      ThreadStart = int(arg)
446 7 equemene
    elif opt in ("-e", "--threadend"):
447 7 equemene
      ThreadEnd = int(arg)
448 7 equemene
    elif opt in ("-r", "--redo"):
449 7 equemene
      Redo = int(arg)
450 7 equemene
451 7 equemene
  if Alu=='CPU' and GpuStyle=='CUDA':
452 7 equemene
    print "Alu can't be CPU for CUDA, set Alu to GPU"
453 7 equemene
    Alu='GPU'
454 7 equemene
455 7 equemene
  if ParaStyle not in ('Blocks','Threads'):
456 7 equemene
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
457 7 equemene
    ParaStyle='Threads'
458 7 equemene
459 7 equemene
  print "Compute unit : %s" % Alu
460 7 equemene
  print "GpuStyle used : %s" % GpuStyle
461 7 equemene
  print "Parallel Style used : %s" % ParaStyle
462 7 equemene
  print "Iterations : %s" % Iterations
463 7 equemene
  print "Number of threads on start : %s" % ThreadStart
464 7 equemene
  print "Number of threads on end : %s" % ThreadEnd
465 7 equemene
  print "Number of redo : %s" % Redo
466 7 equemene
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
467 7 equemene
468 7 equemene
  if GpuStyle=='CUDA':
469 7 equemene
    # For PyCUDA import
470 7 equemene
    import pycuda.driver as cuda
471 7 equemene
    import pycuda.gpuarray as gpuarray
472 7 equemene
    import pycuda.autoinit
473 7 equemene
    from pycuda.compiler import SourceModule
474 7 equemene
475 7 equemene
  if GpuStyle=='OpenCL':
476 7 equemene
    # For PyOpenCL import
477 7 equemene
    import pyopencl as cl
478 7 equemene
479 7 equemene
  average=numpy.array([]).astype(numpy.float32)
480 7 equemene
  median=numpy.array([]).astype(numpy.float32)
481 7 equemene
  stddev=numpy.array([]).astype(numpy.float32)
482 7 equemene
483 7 equemene
  ExploredThreads=numpy.array([]).astype(numpy.uint32)
484 7 equemene
485 7 equemene
  Threads=ThreadStart
486 7 equemene
487 7 equemene
  while Threads <= ThreadEnd:
488 7 equemene
    ExploredThreads=numpy.append(ExploredThreads,Threads)
489 7 equemene
    circle=numpy.zeros(Threads).astype(numpy.uint32)
490 7 equemene
491 7 equemene
    if OutMetrology:
492 7 equemene
      duration=numpy.array([]).astype(numpy.float32)
493 7 equemene
      for i in range(Redo):
494 7 equemene
        start=time.time()
495 7 equemene
        if GpuStyle=='CUDA':
496 7 equemene
          try:
497 7 equemene
            MetropolisCuda(circle,Iterations,1,Threads,ParaStyle)
498 7 equemene
          except:
499 7 equemene
            print "Problem with %i // computations on Cuda" % Threads
500 7 equemene
        elif GpuStyle=='OpenCL':
501 7 equemene
          try:
502 7 equemene
            MetropolisOpenCL(circle,Iterations,1,Threads,ParaStyle,Alu)
503 7 equemene
          except:
504 7 equemene
            print "Problem with %i // computations on OpenCL" % Threads
505 7 equemene
        duration=numpy.append(duration,time.time()-start)
506 7 equemene
      avg=numpy.mean(duration)
507 7 equemene
      med=numpy.median(duration)
508 7 equemene
      std=numpy.std(duration)
509 7 equemene
    else:
510 7 equemene
      if GpuStyle=='CUDA':
511 7 equemene
        avg,med,std=MetropolisCuda(circle,Iterations,Redo,Threads,ParaStyle)
512 7 equemene
      elif GpuStyle=='OpenCL':
513 7 equemene
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Threads,
514 7 equemene
                                     ParaStyle,Alu)
515 7 equemene
    print "avg,med,std",avg,med,std
516 7 equemene
    average=numpy.append(average,avg)
517 7 equemene
    median=numpy.append(median,med)
518 7 equemene
    stddev=numpy.append(stddev,std)
519 7 equemene
    #THREADS*=2
520 7 equemene
    numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,ThreadStart,ThreadEnd,Iterations,gethostname()),(ExploredThreads,average,median,stddev))
521 7 equemene
    Threads+=1
522 7 equemene
523 7 equemene
  FitAndPrint(ExploredThreads,median,Curves)