root / Pi / GPU / Pi-GPU.py @ 65
#!/usr/bin/env python

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
from socket import gethostname

# Find the prime factors of a number
# Found on the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      x/=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist

# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  return(long(threads))
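
# Worked example (illustration only): for jobs=60, PrimeFactors(60) returns
# [2,2,3,5]; walking the interleaved factors multiplies 2 then 5 and stops as
# soon as threads*threads exceeds jobs, so BestThreadsNumber(60) returns 10L.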

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+p/N)+c1*N+c2*N*N)
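
# Fitting sketch (assumption: N is an array of job counts and D the matching
# measured durations, as collected by the __main__ loop below):
#
#   from scipy.optimize import curve_fit
#   coeffs,covariance=curve_fit(Amdahl,N,D)   # coeffs ~ (T1,s,p)
#   predicted=Amdahl(N,*coeffs)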

KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
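// Note: 2.328306435454494e-10f is ~1/2^32, rescaling the 32-bit RNG output
// to a float in [0,1).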

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopBlocks64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockIdx.x+1);
   uint w=seed_w/(blockIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(threadIdx.x+1);
   uint w=seed_w/(threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid64(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
   uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;

   }

   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();

}
"""

KERNEL_CODE_OPENCL="""
#pragma OPENCL EXTENSION cl_khr_fp64: enable

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   // Unique work-item index: group_id*local_size+local_id (== get_global_id(0))
   uint z=seed_z/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);
   uint w=seed_w/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      float x=MWCfp ;
      float y=MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_group_id(0)*get_local_size(0)+get_local_id(0)]=total;

}

__kernel void MainLoopGlobal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_global_id(0)+1);
   uint w=seed_w/(get_global_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_global_id(0)]=total;
   barrier(CLK_GLOBAL_MEM_FENCE);

}

__kernel void MainLoopLocal64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   uint z=seed_z/(get_local_id(0)+1);
   uint w=seed_w/(get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   s[get_local_id(0)]=total;
   barrier(CLK_LOCAL_MEM_FENCE);

}

__kernel void MainLoopHybrid64(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   // Unique work-item index: group_id*local_size+local_id (== get_global_id(0))
   uint z=seed_z/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);
   uint w=seed_w/(get_group_id(0)*get_local_size(0)+get_local_id(0)+1);

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

      double x=(double)MWCfp ;
      double y=(double)MWCfp ;

      // Matching test
      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
      total+=inside;
   }
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_group_id(0)*get_local_size(0)+get_local_id(0)]=total;

}
"""

def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,DoublePrecision):

  # With PyCUDA autoinit, there is nothing to do!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
  MetropolisBlocks64CU=mod.get_function("MainLoopBlocks64")
  MetropolisJobs64CU=mod.get_function("MainLoopThreads64")
  MetropolisHybrid64CU=mod.get_function("MainLoopHybrid64")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=iterations

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      if DoublePrecision:
        MetropolisBlocks64CU(circleCU,
                             numpy.uint64(iterationsCL),
                             numpy.uint32(nprnd(2**30/jobs)),
                             numpy.uint32(nprnd(2**30/jobs)),
                             grid=(jobs,1),
                             block=(1,1,1))
      else:
        MetropolisBlocksCU(circleCU,
                           numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
                           grid=(jobs,1),
                           block=(1,1,1))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # grid*block must cover exactly 'jobs' work items
      if DoublePrecision:
        MetropolisHybrid64CU(circleCU,
                             numpy.uint64(iterationsCL),
                             numpy.uint32(nprnd(2**30/jobs)),
                             numpy.uint32(nprnd(2**30/jobs)),
                             grid=(jobs/threads,1),
                             block=(threads,1,1))
      else:
        MetropolisHybridCU(circleCU,
                           numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
                           grid=(jobs/threads,1),
                           block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      if DoublePrecision:
        MetropolisJobs64CU(circleCU,
                           numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
                           grid=(1,1),
                           block=(jobs,1,1))
      else:
        MetropolisJobsCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(1,1),
                         block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()

    elapsed = start.time_till(stop)*1e-3

    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]


  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
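
# Typical call, as issued from the __main__ section below (sketch):
#   circle=numpy.zeros(Jobs).astype(numpy.uint64)
#   avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,DoublePrecision)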


def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
                     DoublePrecision):

  # Initialize the variables with the proper casts

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected: ",device.name
          HasXPU=True
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:
          XPU=device
          print "CPU selected: ",device.name
          HasXPU=True
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected: ",device.name
          HasXPU=True
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)

  # Create the context and the command queue for its execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the available flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math")

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=numpy.uint64(iterationsCL*jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=numpy.uint64(iterations)

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call the OpenCL kernel
      # (jobs,) is the global work size
      # None lets the runtime choose the local work size
      # circleCL is the results buffer in CL format
      # iterationsCL is the number of iterations per work item
      # the two random uint32 are the seeds seed_w and seed_z
      if DoublePrecision:
        CLLaunch=MetropolisCL.MainLoopGlobal64(queue,(jobs,),None,
                                               circleCL,
                                               numpy.uint64(iterationsCL),
                                               numpy.uint32(nprnd(2**30/jobs)),
                                               numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                             circleCL,
                                             numpy.uint64(iterationsCL),
                                             numpy.uint32(nprnd(2**30/jobs)),
                                             numpy.uint32(nprnd(2**30/jobs)))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL, the global id must be set identical to the local id
      if DoublePrecision:
        CLLaunch=MetropolisCL.MainLoopHybrid64(queue,(jobs,),(threads,),
                                               circleCL,
                                               numpy.uint64(iterationsCL),
                                               numpy.uint32(nprnd(2**30/jobs)),
                                               numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                             circleCL,
                                             numpy.uint64(iterationsCL),
                                             numpy.uint32(nprnd(2**30/jobs)),
                                             numpy.uint32(nprnd(2**30/jobs)))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, the global id must be set identical to the local id
      if DoublePrecision:
        CLLaunch=MetropolisCL.MainLoopLocal64(queue,(jobs,),(jobs,),
                                              circleCL,
                                              numpy.uint64(iterationsCL),
                                              numpy.uint32(nprnd(2**30/jobs)),
                                              numpy.uint32(nprnd(2**30/jobs)))
      else:
        CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                            circleCL,
                                            numpy.uint64(iterationsCL),
                                            numpy.uint32(nprnd(2**30/jobs)),
                                            numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]

  circleCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
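
# Device selection recap (see the loops above): Device=0 picks the first OpenCL
# device whose type matches Alu, while Device>=1 picks the device with that rank
# in the enumeration, provided its type matches Alu. For example (the device id
# is purely illustrative):
#   avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,2,DoublePrecision)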


def FitAndPrint(N,D,Curves):

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit Amdahl law: only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit Reduced Amdahl law: only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit Mylq law: only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit 2nd order Mylq law: only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl's law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq's law")
    except:
      print "Fit curves do not seem to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set default values

  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Device Id : 0 selects the first device matching the ALU type, >=1 selects by rank
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Blocks'
  # Iterations is integer
  Iterations=100000000
  # JobStart is the first number of Jobs to explore
  JobStart=1
  # JobEnd is the last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology selects the duration estimation method : False measures inside the GPU
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to print the curves
  Curves=False
  # Fit is True to fit the measurements to the scaling laws
  Fit=False
  # DoublePrecision enables double precision in the kernels
  DoublePrecision=False
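
  # Example invocation (illustrative only; flags are documented in the usage
  # string below):
  #   python Pi-GPU.py -g OpenCL -a GPU -d 1 -p Hybrid -i 100000000 -s 1 -e 16 -r 4 -c -f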

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -l (Double Precision) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]

      print "\nInformation about devices detected under OpenCL:"
      # For PyOpenCL import
      try:
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
          for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
            Id=Id+1

        print
        sys.exit()
      except ImportError:
        print "Your platform does not seem to support OpenCL"

    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-l':
      DoublePrecision=True
    elif opt == '-c':
      Curves=True
    elif opt == '-f':
      Fit=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, setting Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads!" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
  print "Double Precision in Kernels : %r" % DoublePrecision

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i of type %s : %s" % (Id,deviceType,device.name)
          if Id == Device:
            # Set the Alu as detected Device Type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      print "Platform does not seem to support OpenCL"

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
                                 DoublePrecision)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device,DoublePrecision)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        duration=numpy.append(duration,time.time()-start)
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print "Values seem to be wrong..."
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,
                                     DoublePrecision)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,DoublePrecision)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    if DoublePrecision:
      Precision='DP'
    else:
      Precision='SP'
    if len(average)!=0:
      numpy.savez("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Pi%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (Precision,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)