#!/usr/bin/env python

#
# Pi-by-MC using PyCUDA
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
#

# 2013-01-01 : problems with launch timeout
# nvidia-smi -c 1 : ko
# nvidia-smi -c 3 : ko
# nvidia-smi --gom=1 : not supported
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import matplotlib.pyplot as plt
import math
from scipy.optimize import curve_fit
from socket import gethostname

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1, s, c, p):
  return (T1*(s+c*N+p/N))

# Predicted Mylq Law with second order
def Mylq2(N, T1, s, c1, c2, p):
  return (T1*(s+c1*N+c2*N*N+p/N))

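# Minimal usage sketch (not in the original script): the models above are
# meant to be fitted to measured durations D over numbers of parallel units N
# with scipy's curve_fit, exactly as FitAndPrint does further below. The
# helper name DemoFitModels and its sample values are illustrative only;
# nothing in this script calls it.
def DemoFitModels():
  N=numpy.array([1.,2.,4.,8.,16.,32.])
  # Hypothetical durations following Amdahl's model with T1=10, s=0.1, p=0.9
  D=10.*(0.1+0.9/N)
  coeffs_Amdahl,matcov_Amdahl=curve_fit(Amdahl,N,D)
  coeffs_Mylq,matcov_Mylq=curve_fit(Mylq,N,D)
  # Return the fitted (T1,s,p) and (T1,s,c,p) parameter sets
  return coeffs_Amdahl,coeffs_Mylq
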
KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}
"""

KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__kernel void MainLoopGlobal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);
   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);

   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;

}
"""

def MetropolisCuda(circle,iterations,steps,threads,ParaStyle):

  # With PyCUDA autoinit, nothing else to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisThreadsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%threads==0:
    iterationsCL=numpy.uint32(iterations/threads)
    iterationsNew=iterations
  else:
    # Round up so that every thread gets the same share of the workload
    iterationsCL=numpy.uint32(iterations/threads+1)
    iterationsNew=iterationsCL*threads

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(circleCU,
                         numpy.uint32(iterationsCL),
                         numpy.uint32(nprnd(2**32/threads)),
                         numpy.uint32(nprnd(2**32/threads)),
                         grid=(threads,1),
                         block=(1,1,1))
      print "GPU with %i %s done" % (threads,ParaStyle)
    elif ParaStyle=='Hybrid':
      blocks=threads/int(math.sqrt(float(threads)))
      MetropolisHybridCU(circleCU,
                          numpy.uint32(iterationsCL),
                          numpy.uint32(nprnd(2**32/threads)),
                          numpy.uint32(nprnd(2**32/threads)),
                          grid=(blocks,1),
                          block=(threads/blocks,1,1))
      print "GPU with (blocks,threads)=(%i,%i) %s done" % (blocks,threads/blocks,ParaStyle)
    else:
      MetropolisThreadsCU(circleCU,
                          numpy.uint32(iterationsCL),
                          numpy.uint32(nprnd(2**32/threads)),
                          numpy.uint32(nprnd(2**32/threads)),
                          grid=(1,1),
                          block=(threads,1,1))
      print "GPU with %i %s done" % (threads,ParaStyle)
    stop.record()
    stop.synchronize()

    #elapsed = stop.time_since(start)*1e-3
    elapsed = start.time_till(stop)*1e-3

    #print circle,float(numpy.sum(circle))
    # Estimate Pi from the total number of samples actually drawn
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]
    #time.sleep(1)

  print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))


def MetropolisOpenCL(circle,iterations,steps,threads,ParaStyle,Alu):

  # Initialize the variables by casting them properly

  # Detect a GPU device in the list of devices
  # for platform in cl.get_platforms():
  # 	for device in platform.get_devices():
  # 		if cl.device_type.to_string(device.type)=='GPU':
  # 			GPU=device
  #print "GPU detected: ",device.name

  HasGPU=False
  # Device selection based on choice (default is GPU)
  for platform in cl.get_platforms():
    for device in platform.get_devices():
      if not HasGPU:
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU":
          GPU=device
          print "GPU selected: ",device.name
          HasGPU=True
        if deviceType=="CPU" and Alu=="CPU":
          GPU=device
          print "CPU selected: ",device.name
          HasGPU=True

  # Create the context and the command queue for its execution
  #ctx = cl.create_some_context()
  ctx = cl.Context([GPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the available flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math")

  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%threads==0:
    iterationsCL=numpy.uint32(iterations/threads)
    iterationsNew=iterations
  else:
    # Round up so that every work item gets the same share of the workload
    iterationsCL=numpy.uint32(iterations/threads+1)
    iterationsNew=iterationsCL*threads

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call the OpenCL kernel
      # (threads,) is the global work size
      # None lets the runtime choose the local work size
      # circleCL is the circle lattice translated into CL format
      # iterationsCL is the number of iterations per work item
      # the last two arguments are the random seeds
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(threads,),None,
                                           circleCL,
                                           numpy.uint32(iterationsCL),
                                           numpy.uint32(nprnd(2**32/threads)),
                                           numpy.uint32(nprnd(2**32/threads)))
      print "%s with %i %s done" % (Alu,threads,ParaStyle)
    elif ParaStyle=='Hybrid':
      # In OpenCL, the local work size must divide the global work size
      blocks=int(math.sqrt(threads))
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(threads,),(blocks,),
                                          circleCL,
                                          numpy.uint32(iterationsCL),
                                          numpy.uint32(nprnd(2**32/threads)),
                                          numpy.uint32(nprnd(2**32/threads)))
      print "%s with (blocks,threads)=(%i,%i) %s done" % (Alu,blocks,threads/blocks,ParaStyle)
    else:
      # In OpenCL, the global work size is set identical to the local work size
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(threads,),(threads,),
                                          circleCL,
                                          numpy.uint32(iterationsCL),
                                          numpy.uint32(nprnd(2**32/threads)),
                                          numpy.uint32(nprnd(2**32/threads)))
      print "%s with %i %s done" % (Alu,threads,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    print circle,float(numpy.sum(circle))
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]

  circleCL.release()

  #print threads,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
  print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))


def FitAndPrint(N,D,Curves):

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f*N+%.6f/N)" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[2],
                                                            coeffs_Mylq[3])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f*N+%.6f*N^2+%.6f/N)" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
         coeffs_Mylq2[4])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set default values
  # Alu can be CPU or GPU
  Alu='CPU'
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Threads'
  # Iterations is an integer
  Iterations=1000000000
  # ThreadStart is the first number of Threads to explore
  ThreadStart=1
  # ThreadEnd is the last number of Threads to explore
  ThreadEnd=512
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology is the method for duration estimation : False means timing inside the GPU
  OutMetrology=False
  # Curves is True to plot the curves
  Curves=False

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:",["alu=","gpustyle=","parastyle=","iterations=","threadstart=","threadend=","redo="])
  except getopt.GetoptError:
    print '%s -o -a <CPU/GPU> -g <CUDA/OpenCL> -p <ParaStyle> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -g <CUDA/OpenCL> -p <Threads/Blocks/Hybrid> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
      sys.exit()
    elif opt == '-o':
      OutMetrology=True
    elif opt == '-c':
      Curves=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint32(arg)
    elif opt in ("-s", "--threadstart"):
      ThreadStart = int(arg)
    elif opt in ("-e", "--threadend"):
      ThreadEnd = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

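  # Example invocation (illustrative values only, not from the original source):
  #   python Pi-GPU.py -g OpenCL -a GPU -p Threads -i 100000000 -s 1 -e 256 -r 3 -c
  # i.e. OpenCL on a GPU with the Threads distribution, 1e8 iterations,
  # thread counts explored from 1 to 256, each point redone 3 times,
  # and the fitted curves plotted at the end.
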
  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, setting Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads !" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % ThreadStart
  print "Number of threads on end : %s" % ThreadEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    # For PyCUDA import
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

  if GpuStyle=='OpenCL':
    # For PyOpenCL import
    import pyopencl as cl

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredThreads=numpy.array([]).astype(numpy.uint32)

  Threads=ThreadStart

  while Threads <= ThreadEnd:
    ExploredThreads=numpy.append(ExploredThreads,Threads)
    circle=numpy.zeros(Threads).astype(numpy.uint32)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            MetropolisCuda(circle,Iterations,1,Threads,ParaStyle)
          except:
            print "Problem with %i // computations on Cuda" % Threads
        elif GpuStyle=='OpenCL':
          try:
            MetropolisOpenCL(circle,Iterations,1,Threads,ParaStyle,Alu)
          except:
            print "Problem with %i // computations on OpenCL" % Threads
        duration=numpy.append(duration,time.time()-start)
      avg=numpy.mean(duration)
      med=numpy.median(duration)
      std=numpy.std(duration)
    else:
      if GpuStyle=='CUDA':
        avg,med,std=MetropolisCuda(circle,Iterations,Redo,Threads,ParaStyle)
      elif GpuStyle=='OpenCL':
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Threads,
                                     ParaStyle,Alu)
    print "avg,med,std",avg,med,std
    average=numpy.append(average,avg)
    median=numpy.append(median,med)
    stddev=numpy.append(stddev,std)
    #THREADS*=2
    numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,ThreadStart,ThreadEnd,Iterations,gethostname()),(ExploredThreads,average,median,stddev))
    Threads+=1

  FitAndPrint(ExploredThreads,median,Curves)
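
# Note (hedged, not in the original script): numpy.savez above stores the
# tuple (ExploredThreads,average,median,stddev) as a single positional array,
# so a saved run can be reloaded later along these lines, where the file name
# is an illustrative stand-in for whatever savez actually produced:
#
#   Results=numpy.load("Pi_GPU_OpenCL_Threads_1_512_1000000000_myhost.npz")['arr_0']
#   ExploredThreads,average,median,stddev=Results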