#!/usr/bin/env python

#
# Pi-by-MC using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf
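#
# For instance, with the NVIDIA driver that option goes in the "Device"
# section of /etc/X11/xorg.conf (identifier and driver name depend on the
# local setup):
#
#   Section "Device"
#     Identifier "Device0"
#     Driver     "nvidia"
#     Option     "Interactive" "0"
#   EndSection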

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import matplotlib.pyplot as plt
import math
from scipy.optimize import curve_fit
from socket import gethostname

# Find the prime factors of a number
# Taken from the Web:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      x/=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
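
# Example: PrimeFactors(28) yields the factors [2, 2, 7]
# (with multiplicity, in non-decreasing order)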

# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# Output is the number of threads per block (work items per group)
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  return(long(threads))
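
# Example: BestThreadsNumber(16) returns 8, so the Hybrid launch uses
# 2 blocks/groups of 8 threads; the result always divides jobs since it
# is a product of a subset of its prime factors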

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+c*N+p/N))

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+c1*N+c2*N*N+p/N))
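
# Example: with T1=10, s=0.1 and p=0.9, Amdahl(4,10,0.1,0.9) predicts
# 10*(0.1+0.9/4)=3.25, i.e. a speedup of about 3.1 on 4 jobs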

KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}
"""

KERNEL_CODE_OPENCL="""

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   // get_global_id(0)=get_group_id(0)*get_local_size(0)+get_local_id(0)
   // indexes both the per-work-item seeds and the output slot
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;

}
"""

def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):

  # With PyCUDA autoinit, nothing to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=iterations

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # threads divides jobs, so jobs/threads blocks of threads threads
      # cover exactly the jobs result slots of circle
      MetropolisHybridCU(circleCU,
                          numpy.uint64(iterationsCL),
                          numpy.uint32(nprnd(2**30/jobs)),
                          numpy.uint32(nprnd(2**30/jobs)),
                          grid=(jobs/threads,1),
                          block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      MetropolisJobsCU(circleCU,
                          numpy.uint64(iterationsCL),
                          numpy.uint32(nprnd(2**30/jobs)),
                          numpy.uint32(nprnd(2**30/jobs)),
                          grid=(1,1),
                          block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()

    #elapsed = stop.time_since(start)*1e-3
    elapsed = start.time_till(stop)*1e-3

    #print circle,float(numpy.sum(circle))
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]
    #time.sleep(1)

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))


def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):

  # Initialize the variables with the proper casts

  # Detect a GPU device in the device list
  # for platform in cl.get_platforms():
  #         for device in platform.get_devices():
  #                 if cl.device_type.to_string(device.type)=='GPU':
  #                         GPU=device
  #print "GPU detected: ",device.name

  HasGPU=False
  Id=1
  # Primary Device selection based on Device Id
  for platform in cl.get_platforms():
    for device in platform.get_devices():
      deviceType=cl.device_type.to_string(device.type)
      if Id==Device and not HasGPU:
        GPU=device
        print "CPU/GPU selected: ",device.name
        HasGPU=True
      Id=Id+1
  # Default Device selection based on ALU Type
  for platform in cl.get_platforms():
    for device in platform.get_devices():
      deviceType=cl.device_type.to_string(device.type)
      if deviceType=="GPU" and Alu=="GPU" and not HasGPU:
        GPU=device
        print "GPU selected: ",device.name
        HasGPU=True
      if deviceType=="CPU" and Alu=="CPU" and not HasGPU:
        GPU=device
        print "CPU selected: ",device.name
        HasGPU=True

  # Create the context and the command queue for execution
  #ctx = cl.create_some_context()
  ctx = cl.Context([GPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the available flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math")

  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint32(iterations/jobs)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint32(iterations/jobs+1)
    iterationsNew=iterations

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call the OpenCL kernel
      # (jobs,) is the global work size
      # None lets the runtime choose the local work size
      # circleCL is the output buffer, one counter per work item
      # iterationsCL is the number of iterations per work item
      # the two uint arguments are the RNG seeds
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                           circleCL,
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # jobs work items split into groups of threads work items each
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, the global work size must match the local work size here
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    #print circle,float(numpy.sum(circle))
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
    MyDuration[i]=elapsed
    #print MyPi[i],MyDuration[i]

  circleCL.release()

  #print jobs,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))


def FitAndPrint(N,D,Curves):

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f*N+%.6f/N)" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[2],
                                                            coeffs_Mylq[3])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f*N+%.6f*N^2+%.6f/N)" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
         coeffs_Mylq2[4])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set default values
  # Alu can be CPU or GPU
  Alu='CPU'
  # Device Id : 0 means take the first device found for the chosen Alu
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads, Blocks or Hybrid
  ParaStyle='Blocks'
  # Iterations is the total number of Monte Carlo samples
  Iterations=100000000
  # JobStart is the first number of Jobs to explore
  JobStart=1
  # JobEnd is the last number of Jobs to explore
  JobEnd=16
  # Redo is the number of times the test is repeated to improve metrology
  Redo=1
  # OutMetrology selects the timing method : False means timing inside GPU
  OutMetrology=False
  # Curves set to True to plot the curves
  Curves=False
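
  # Example invocation (assuming the script is saved as Pi-GPU.py):
  #   python Pi-GPU.py -g OpenCL -a GPU -p Hybrid -i 100000000 -s 1 -e 16 -r 3 -c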

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -r <RedoToImproveStats>' % sys.argv[0]
      sys.exit()
    elif opt == '-o':
      OutMetrology=True
    elif opt == '-c':
      Curves=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint32(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads!" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology

  if GpuStyle=='CUDA':
    # For PyCUDA import
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

  if GpuStyle=='OpenCL':
    # For PyOpenCL import
    import pyopencl as cl
    Id=1
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
        Id=Id+1

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        duration=numpy.append(duration,time.time()-start)
      avg=numpy.mean(duration)
      med=numpy.median(duration)
      std=numpy.std(duration)
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        # try:
        #   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
        # except:
        #   print "Problem with %i // computations on OpenCL" % Jobs
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,gethostname()),(ExploredJobs,average,median,stddev))
    Jobs+=1

  FitAndPrint(ExploredJobs,median,Curves)