
root / Pi / GPU / Pi-GPU.py-20130213-2 @ 7


#!/usr/bin/env python

#
# Pi-by-MC using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

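# A minimal illustration of where that option typically lives (based on the
# NVIDIA X driver documentation; the Identifier below is only a placeholder,
# not part of this script):
#
#   Section "Device"
#       Identifier "Device0"
#       Driver     "nvidia"
#       Option     "Interactive" "0"
#   EndSection
#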
# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import matplotlib.pyplot as plt
import math
from scipy.optimize import curve_fit
from socket import gethostname

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+c*N+p/N))

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+c1*N+c2*N*N+p/N))

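# A short sketch (in comments only), mirroring what FitAndPrint does further
# down, of how these scaling laws are fitted to measured durations D over the
# explored numbers of jobs N with scipy's curve_fit:
#
#   coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
#   D_Amdahl = Amdahl(N, coeffs_Amdahl[0], coeffs_Amdahl[1], coeffs_Amdahl[2])
#   plt.plot(N, D, 'ro')
#   plt.plot(N, D_Amdahl)
#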
KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}
"""

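# Host-side note on the RNG above: MWCfp rescales the 32-bit MWC output by
# about 2^-32 to obtain a float in [0,1).  The function below is a rough
# pure-Python equivalent of the znew/wnew/MWC macros, added only as a
# sanity-check reference; it is never called by this script and its name is
# arbitrary.
def MWCReference(z,w,n):
  # Draw n floats in [0,1) from the Marsaglia MWC generator seeded with (z,w)
  samples=[]
  for i in range(n):
    z=(36969*(z&65535)+(z>>16))&0xFFFFFFFF
    w=(18000*(w&65535)+(w>>16))&0xFFFFFFFF
    mwc=(((z<<16)&0xFFFFFFFF)+(w&65535))&0xFFFFFFFF
    samples.append(mwc*2.328306435454494e-10)
  return samples
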
KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__kernel void MainLoopGlobal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid(__global uint *s,uint iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
   uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);

   // uint jsr=123456789;
   // uint jcong=380116160;

   int total=0;

   for (uint i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_group_id(0)*get_num_groups(0)+get_local_id(0)]=total;

}
"""

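# Note on MainLoopHybrid above: the get_group_id(0)*get_num_groups(0)+get_local_id(0)
# indexing only addresses distinct slots of s when the number of work-groups equals
# the work-group size, which matches the (blocks*blocks,),(blocks,) launch used in
# MetropolisOpenCL below.
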
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):

  # With PyCUDA autoinit, nothing to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs!=0:
    # Round up so that the total number of samples is at least 'iterations'
    iterationsCL=numpy.uint32(iterations/jobs+1)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint32(iterations/jobs)
    iterationsNew=iterations

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(circleCU,
                         numpy.uint32(iterationsCL),
                         numpy.uint32(nprnd(2**32/jobs)),
                         numpy.uint32(nprnd(2**32/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
      print "GPU with %i %s done" % (jobs,ParaStyle)
    elif ParaStyle=='Hybrid':
      blocks=jobs/int(math.sqrt(float(jobs)))
      MetropolisHybridCU(circleCU,
                          numpy.uint32(iterationsCL),
                          numpy.uint32(nprnd(2**32/jobs)),
                          numpy.uint32(nprnd(2**32/jobs)),
                          grid=(blocks,1),
                          block=(jobs/blocks,1,1))
      print "GPU with (blocks,threads)=(%i,%i) %s done" % (blocks,jobs/blocks,ParaStyle)
    else:
      MetropolisJobsCU(circleCU,
                          numpy.uint32(iterationsCL),
                          numpy.uint32(nprnd(2**32/jobs)),
                          numpy.uint32(nprnd(2**32/jobs)),
                          grid=(1,1),
                          block=(jobs,1,1))
      print "GPU with %i %s done" % (jobs,ParaStyle)
    stop.record()
    stop.synchronize()

    #elapsed = stop.time_since(start)*1e-3
    elapsed = start.time_till(stop)*1e-3

    #print circle,float(numpy.sum(circle))
    # Normalize by the total number of samples over all jobs, as in the OpenCL version
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]
    #time.sleep(1)

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))


def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):

  # Initialize the variables with the proper casts

  # Detect a GPU device in the list of devices
  # for platform in cl.get_platforms():
  #     for device in platform.get_devices():
  #         if cl.device_type.to_string(device.type)=='GPU':
  #             GPU=device
  #print "GPU detected: ",device.name

  HasGPU=False
  Id=1
  # Device selection based on choice (default is GPU)
  for platform in cl.get_platforms():
    for device in platform.get_devices():
      if not HasGPU:
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU" and Id==Device:
          GPU=device
          print "GPU selected: ",device.name
          HasGPU=True
        if deviceType=="CPU" and Alu=="CPU":
          GPU=device
          print "CPU selected: ",device.name
          HasGPU=True
      Id=Id+1

  # Create the context and the queue for its execution
  #ctx = cl.create_some_context()
  ctx = cl.Context([GPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the available flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math")

  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs!=0:
    # Round up so that the total number of samples is at least 'iterations'
    iterationsCL=numpy.uint32(iterations/jobs+1)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint32(iterations/jobs)
    iterationsNew=iterations

  blocks=int(math.sqrt(jobs))

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call OpenCL kernel
      # (jobs,) is the global work size
      # None lets the runtime pick the local work size
      # circleCL is the counter buffer in CL format
      # iterationsCL is the number of iterations per work item
      # the two numpy.uint32(nprnd(...)) values are the RNG seeds (seed_w, seed_z)
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                           circleCL,
                                           numpy.uint32(iterationsCL),
                                           numpy.uint32(nprnd(2**32/jobs)),
                                           numpy.uint32(nprnd(2**32/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
    elif ParaStyle=='Hybrid':
      # In OpenCL, it is necessary to set a Global_id consistent with the local_id
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(blocks*blocks,),(blocks,),
                                          circleCL,
                                          numpy.uint32(iterationsCL),
                                          numpy.uint32(nprnd(2**32/jobs)),
                                          numpy.uint32(nprnd(2**32/jobs)))
      print "%s with (Blocks,Threads)=(%i,%i) %s done" % (Alu,blocks,blocks,ParaStyle)
    else:
      # In OpenCL, it is necessary to set a Global_id identical to the local_id
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                          circleCL,
                                          numpy.uint32(iterationsCL),
                                          numpy.uint32(nprnd(2**32/jobs)),
                                          numpy.uint32(nprnd(2**32/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    #print circle,float(numpy.sum(circle))
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]

  circleCL.release()

  #print jobs,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))


def FitAndPrint(N,D,Curves):

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f*N+%.6f/N)" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[2],
                                                            coeffs_Mylq[3])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f*N+%.6f*N^2+%.6f/N)" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
         coeffs_Mylq2[4])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/Work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set default values
  # Alu can be CPU or GPU
  Alu='CPU'
  # Id of GPU
  Device=1
  # GPU style can be CUDA (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads, Blocks or Hybrid
  ParaStyle='Threads'
  # Iterations is an integer
  Iterations=1000000
  # JobStart is the first number of Jobs to explore
  JobStart=1
  # JobEnd is the last number of Jobs to explore
  JobEnd=512
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology is the method for duration estimation : False means timing inside the GPU code
  OutMetrology=False
  # Curves is True to plot the curves
  Curves=False

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
      sys.exit()
    elif opt == '-o':
      OutMetrology=True
    elif opt == '-c':
      Curves=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint32(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, setting Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads!" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    # For PyCUDA import
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

  if GpuStyle=='OpenCL':
    # For PyOpenCL import
    import pyopencl as cl
    Id=1
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
        Id=Id+1

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint32)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        duration=numpy.append(duration,time.time()-start)
      avg=numpy.mean(duration)
      med=numpy.median(duration)
      std=numpy.std(duration)
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs

    if (avg,med,std) != (0,0,0):
      print "avg,med,std",avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,gethostname()),(ExploredJobs,average,median,stddev))
    Jobs+=1

  FitAndPrint(ExploredJobs,median,Curves)
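
For quick testing, the intended entry point is the command line described in the help string above, for example: python Pi-GPU.py -g OpenCL -a GPU -d 1 -p Threads -i 1000000 -s 1 -e 128 -r 3 -c. The sketch below is only a rough illustration of how the embedded OpenCL kernel could be exercised directly from another Python session; it assumes the file has been saved under an importable name such as pi_gpu.py (an assumed name, since the repository filename is not importable as-is), that PyOpenCL and a usable device are available, and it uses arbitrary fixed seeds and sizes.

# Minimal sketch (assumptions noted above): one launch of MainLoopGlobal
import numpy
import pyopencl as cl
import pi_gpu                                  # assumed module name for this file

jobs=64                                        # number of work items (arbitrary)
iterations=1000000                             # total Monte Carlo samples
circle=numpy.zeros(jobs).astype(numpy.uint32)

ctx=cl.create_some_context()                   # let PyOpenCL pick a device
queue=cl.CommandQueue(ctx)
program=cl.Program(ctx,pi_gpu.KERNEL_CODE_OPENCL).build()

mf=cl.mem_flags
circleCL=cl.Buffer(ctx,mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

# Each work item draws iterations/jobs points and counts those inside the circle
program.MainLoopGlobal(queue,(jobs,),None,
                       circleCL,
                       numpy.uint32(iterations/jobs),
                       numpy.uint32(110271),   # arbitrary seed_w
                       numpy.uint32(101008))   # arbitrary seed_z
cl.enqueue_copy(queue,circle,circleCL).wait()

print "Pi is roughly",4.*float(numpy.sum(circle))/float((iterations/jobs)*jobs)

Each benchmark run of the script itself writes its measurements to a NumPy archive whose name is built from the run parameters and hostname (see the numpy.savez call above); the stored arrays can be read back later with numpy.load on that file.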