#!/usr/bin/env python

#
# Pi-by-MC using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import matplotlib.pyplot as plt
import math
from scipy.optimize import curve_fit
from socket import gethostname

# Find the prime factors of a number
# Got from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      x/=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist

# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# Output is the thread number
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  return(long(threads))
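
# For illustration: BestThreadsNumber(16) walks the prime factors [2,2,2,2],
# multiplies them until threads*threads exceeds the number of jobs and
# returns 8, so a Hybrid run of 16 jobs would use 2 blocks of 8 threads.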

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+p/N)+c1*N+c2*N*N)

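# In the models above, T1 is the single-job runtime, s the serial fraction,
# p the parallel fraction and c (or c1, c2) overhead terms growing with the
# number of jobs N; FitAndPrint below fits the measured durations to these laws.
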
KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
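
// The macros above implement Marsaglia's generators: znew and wnew are two
// 16-bit multiply-with-carry steps combined into MWC, SHR3 is a 3-shift
// register and CONG a linear congruential generator; KISS combines all three.
// The factor 2.328306435454494e-10f is roughly 2^-32 and maps the 32-bit
// integers onto floats in [0,1).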

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}
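
// The three kernels above only differ in how the independent Monte Carlo
// streams are distributed: MainLoopBlocks uses one single-thread block per
// stream (blockIdx.x), MainLoopThreads a single block of threads (threadIdx.x)
// and MainLoopHybrid a grid of blocks times threads. Each stream derives its
// seeds from seed_w and seed_z and stores its hit count in s; the *64 kernels
// below repeat the same test in double precision.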

__global__ void MainLoopBlocks64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}
"""

KERNEL_CODE_OPENCL="""
#pragma OPENCL EXTENSION cl_khr_fp64: enable

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);
   uint w=seed_w/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_group_id(0)*get_local_size(0)+get_local_id(0)]=total;

}

__kernel void MainLoopGlobal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);
   uint w=seed_w/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_group_id(0)*get_local_size(0)+get_local_id(0)]=total;

}
"""

def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,DoublePrecision):

  # With PyCUDA autoinit, nothing more to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
  MetropolisBlocks64CU=mod.get_function("MainLoopBlocks64")
  MetropolisJobs64CU=mod.get_function("MainLoopThreads64")
  MetropolisHybrid64CU=mod.get_function("MainLoopHybrid64")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=iterations

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      if DoublePrecision:
        MetropolisBlocks64CU(circleCU,
                             numpy.uint64(iterationsCL),
                             numpy.uint32(nprnd(2**30/jobs)),
                             numpy.uint32(nprnd(2**30/jobs)),
                             grid=(jobs,1),
                             block=(1,1,1))
      else:
        MetropolisBlocksCU(circleCU,
                           numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
                           grid=(jobs,1),
                           block=(1,1,1))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      if DoublePrecision:
        MetropolisHybrid64CU(circleCU,
                             numpy.uint64(iterationsCL),
                             numpy.uint32(nprnd(2**30/jobs)),
                             numpy.uint32(nprnd(2**30/jobs)),
                             grid=(jobs/threads,1),
                             block=(threads,1,1))
      else:
        MetropolisHybridCU(circleCU,
                           numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
                           grid=(jobs/threads,1),
                           block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      if DoublePrecision:
        MetropolisJobs64CU(circleCU,
                           numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
                           grid=(1,1),
                           block=(jobs,1,1))
      else:
        MetropolisJobsCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(1,1),
                         block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()

    elapsed = start.time_till(stop)*1e-3

    MyDuration[i]=elapsed
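    # Each entry of circle holds the hits inside the quarter circle for one
    # stream of iterationsCL draws, so 4*hits/iterationsCL estimates Pi;
    # the median over all streams is kept for this step.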
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
                     DoublePrecision):

  # Initialize the variables by casting them correctly

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected: ",device.name
          HasXPU=True
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:
          XPU=device
          print "CPU selected: ",device.name
          HasXPU=True
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected: ",device.name
          HasXPU=True
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  # Create the context and the queue for its execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the possible flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math")
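
  # About the build options: -cl-mad-enable allows multiply-add fusion and
  # -cl-fast-relaxed-math relaxes IEEE compliance for speed, which is
  # acceptable here since the kernels only compare x*x+y*y against 1.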

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=numpy.uint64(iterationsCL*jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=numpy.uint64(iterations)

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call the OpenCL kernel:
      # (jobs,) is the global work size
      # None lets the runtime choose the local work size
      # circleCL is the hit-count buffer in CL format
      # iterationsCL is the number of iterations per work item
      # the last two arguments are the seed_w and seed_z RNG seeds
      if DoublePrecision:
        CLLaunch=MetropolisCL.MainLoopGlobal64(queue,(jobs,),None,
                                               circleCL,
                                               numpy.uint64(iterationsCL),
                                               numpy.uint32(nprnd(2**30/jobs)),
                                               numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                             circleCL,
                                             numpy.uint64(iterationsCL),
                                             numpy.uint32(nprnd(2**30/jobs)),
                                             numpy.uint32(nprnd(2**30/jobs)))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL, it is necessary to set a Global_id consistent with the local_id
      if DoublePrecision:
        CLLaunch=MetropolisCL.MainLoopHybrid64(queue,(jobs,),(threads,),
                                               circleCL,
                                               numpy.uint64(iterationsCL),
                                               numpy.uint32(nprnd(2**30/jobs)),
                                               numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                             circleCL,
                                             numpy.uint64(iterationsCL),
                                             numpy.uint32(nprnd(2**30/jobs)),
                                             numpy.uint32(nprnd(2**30/jobs)))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, it is necessary to set a Global_id identical to the local_id
      if DoublePrecision:
        CLLaunch=MetropolisCL.MainLoopLocal64(queue,(jobs,),(jobs,),
                                              circleCL,
                                              numpy.uint64(iterationsCL),
                                              numpy.uint32(nprnd(2**30/jobs)),
                                              numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                            circleCL,
                                            numpy.uint64(iterationsCL),
                                            numpy.uint32(nprnd(2**30/jobs)),
                                            numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

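    # OpenCL event profiling returns timestamps in nanoseconds, hence the
    # 1e-9 factor below to get the kernel time in seconds.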
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]

  circleCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

def FitAndPrint(N,D,Curves):
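  # After each fit the coefficients are renormalized so that the leading
  # factor equals the first measured duration D[0]; s, p (and the relevant
  # overhead term) are rescaled by T1/D[0], which leaves the predicted curve
  # unchanged.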

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl's law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq's law")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set default values
  # Alu can be CPU or GPU
  Alu='CPU'
  # Id of the device: 1 is the first device found; 0 selects the first device
  # matching the requested ALU type
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Blocks'
  # Iterations is an integer
  Iterations=100000000
  # JobStart is the first number of Jobs to explore
  JobStart=1
  # JobEnd is the last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology is the method for duration estimation : False times inside the GPU
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to plot the curves
  Curves=False
  # DoublePrecision for the FP calculus
  DoublePrecision=False
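
  # Hypothetical example of invocation (adapt devices and sizes to the local setup):
  #   python Pi-GPU.py -g OpenCL -a GPU -p Hybrid -i 100000000 -s 1 -e 64 -r 5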

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hocla:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]

      print "\nInformation about devices detected under OpenCL:"
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
          Id=Id+1

      sys.exit()
    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-l':
      DoublePrecision=True
    elif opt == '-c':
      Curves=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads!" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
  print "Double Precision in Kernels : %r" % DoublePrecision

  if GpuStyle=='CUDA':
    # For PyCUDA import
    import pycuda.driver as cuda
    import pycuda.gpuarray as gpuarray
    import pycuda.autoinit
    from pycuda.compiler import SourceModule

  if GpuStyle=='OpenCL':
    # For PyOpenCL import
    import pyopencl as cl
    Id=1
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
        if Id == Device:
          # Set the Alu as the detected Device Type
          Alu=deviceType
        Id=Id+1

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
                                 DoublePrecision)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device,DoublePrecision)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        duration=numpy.append(duration,time.time()-start)
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print "Values seem to be wrong..."
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,
                                     DoublePrecision)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        # try:
        #   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
        # except:
        #   print "Problem with %i // computations on OpenCL" % Jobs
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,DoublePrecision)

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    if DoublePrecision:
      Precision='DP'
    else:
      Precision='SP'
    if len(average)!=0:
      numpy.savez("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  FitAndPrint(ExploredJobs,median,Curves)