#!/usr/bin/env python3
#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# performs an estimation of Pi using the Monte Carlo method
# a large number of iterations is divided and distributed to compute units
# a lot of options are provided to perform scalability tests
#
# use -h for complete set of options
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@ens-lyon.fr>
#
# Part of matrix programs from: https://forge.cbp.ens-lyon.fr/svn/bench4gpu/

# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import itertools
from socket import gethostname

class PenStacle:
    """Pentacle of Statistics from data"""
    Avg=0
    Med=0
    Std=0
    Min=0
    Max=0
    def __init__(self,Data):
        self.Avg=numpy.average(Data)
        self.Med=numpy.median(Data)
        self.Std=numpy.std(Data)
        self.Max=numpy.max(Data)
        self.Min=numpy.min(Data)
    def display(self):
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))

class Experience:
    """Metrology for experiments"""
    DeviceStyle=''
    DeviceId=0
    AvgD=0
    MedD=0
    StdD=0
    MinD=0
    MaxD=0
    AvgR=0
    MedR=0
    StdR=0
    MinR=0
    MaxR=0
    def __init__(self,DeviceStyle,DeviceId,Iterations):
        self.DeviceStyle=DeviceStyle
        self.DeviceId=DeviceId
        self.Iterations=Iterations

    def Metrology(self,Data):
        Duration=PenStacle(Data)
        Rate=PenStacle(self.Iterations/Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)



def DictionariesAPI():
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
    Test={True:1,False:0}
    return(Marsaglia,Computing,Test)

# find prime factors of a number
# Taken from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):

    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            x/=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist
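# Illustrative example (not executed here): 12 = 2*2*3, so
#   >>> PrimeFactors(12)
#   array([2, 2, 3])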

# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
    factors=PrimeFactors(jobs)
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs or threads>512:
            break
    return(int(threads))
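# Illustrative example (not executed here): for jobs=1024 the prime factors are
# ten 2s, threads grows 2,4,...,64 and the loop stops once threads*threads
# exceeds jobs (or threads exceeds 512), so BestThreadsNumber(1024) returns 64.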

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
    return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
    return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
    return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
    return (T1*(s+p/N)+c1*N+c2*N*N)
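# Worked example (illustrative): with T1=10, s=0.1 and p=0.9, the Amdahl
# prediction on N=4 compute units is Amdahl(4,10,0.1,0.9)=10*(0.1+0.9/4)=3.25.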

def KernelCodeCuda():
    KERNEL_CODE_CUDA="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <=THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}

"""
    return(KERNEL_CODE_CUDA)
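# The kernel source above is parametrised at compile time: TRNG selects the
# Marsaglia generator, TYPE the arithmetic of the inside-the-circle test and
# TEST the if/then versus ternary accumulation (values from DictionariesAPI).
# Minimal sketch of the compilation, mirroring the SourceModule call used in
# MetropolisCuda below (the RNG and ValueType here are chosen for illustration):
#   mod = SourceModule(KernelCodeCuda(),
#                      options=['--compiler-options',
#                               '-DTRNG=%i -DTYPE=%s' % (Marsaglia['MWC'],Computing['FP32'])])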

def KernelCodeOpenCL():
    KERNEL_CODE_OPENCL="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <= THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

"""
    return(KERNEL_CODE_OPENCL)

def MetropolisCuda(InputCU):

    print("Inside ",InputCU)

    iterations=InputCU['Iterations']
    steps=InputCU['Steps']
    blocks=InputCU['Blocks']
    threads=InputCU['Threads']
    Device=InputCU['Device']
    RNG=InputCU['RNG']
    ValueType=InputCU['ValueType']
    TestType=InputCU['IfThen']
    Seeds=InputCU['Seeds']

    Marsaglia,Computing,Test=DictionariesAPI()

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        print()

    except ImportError:
        print("Platform does not seem to support CUDA")

    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCU = cuda.InOut(circle)
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
    #cuda.memcpy_htod(circleCU, circle)

    Context=XPU.make_context()

    try:
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
        # Needed to set the compiler via ccbin for CUDA9 implementation
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
    except:
        print("Compilation seems to break")

    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads

    iterationsCU=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCU+=numpy.uint64(1)

    for i in range(steps):
        start_time=time.time()

        try:
            MetropolisHybridCU(circleCU,
                               numpy.uint64(iterationsCU),
                               numpy.uint32(Seeds[0]),
                               numpy.uint32(Seeds[1]),
                               grid=(blocks,1),block=(threads,1,1))
        except:
            print("Crash during CUDA call")

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        MyDuration[i]=elapsed

    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
    print(OutputCU)
    Context.pop()

    Context.detach()

    return(OutputCU)

def MetropolisOpenCL(InputCL):

    import pyopencl as cl

    iterations=InputCL['Iterations']
    steps=InputCL['Steps']
    blocks=InputCL['Blocks']
    threads=InputCL['Threads']
    Device=InputCL['Device']
    RNG=InputCL['RNG']
    ValueType=InputCL['ValueType']
    TestType=InputCL['IfThen']
    Seeds=InputCL['Seeds']

    Marsaglia,Computing,Test=DictionariesAPI()

    # Initialise the variables by casting them correctly
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
        sys.exit()

    # Create the context and the queue for its execution
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    # Get the possible flags for the buffers
    mf = cl.mem_flags

    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    #    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY,circle.nbytes)

    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads

    iterationsCL=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCL+=1

    for i in range(steps):
        start_time=time.time()
        if threads == 1:
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(Seeds[0]),
                                                 numpy.uint32(Seeds[1]))
        else:
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(Seeds[0]),
                                                 numpy.uint32(Seeds[1]))

        CLLaunch.wait()
        cl.enqueue_copy(queue, circle, circleCL).wait()

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
        MyDuration[i]=elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]

    circleCL.release()

    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
    # print(OutputCL)
    return(OutputCL)
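# Minimal sketch of a direct call to MetropolisOpenCL (mirroring the
# dictionaries built in the __main__ section below; values are illustrative):
#   InputCL={'Iterations':1000000,'Steps':1,'Blocks':16,'Threads':1,
#            'Device':0,'RNG':'MWC','ValueType':'FP32','IfThen':False,
#            'Seeds':(110271,101008)}
#   OutputCL=MetropolisOpenCL(InputCL)
#   # Pi is then estimated as 4*Inside/NewIterations, as in the __main__ loop:
#   print(4.*OutputCL['Inside']/OutputCL['NewIterations'])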


def FitAndPrint(N,D,Curves):

    from scipy.optimize import curve_fit
    import matplotlib.pyplot as plt

    try:
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[0]=D[0]
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
    except:
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))

    try:
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
        coeffs_AmdahlR[0]=D[0]
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))

    except:
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))

    try:
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[0]=D[0]
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                                coeffs_Mylq[1],
                                                                coeffs_Mylq[3],
                                                                coeffs_Mylq[2]))
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                    coeffs_Mylq[3])
    except:
        print("Impossible to fit for Mylq law : only %i elements" % len(D))

    try:
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[0]=D[0]
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))

    except:
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))

    if Curves:
        plt.xlabel("Number of Threads/work Items")
        plt.ylabel("Total Elapsed Time")

        Experience,=plt.plot(N,D,'ro')
    try:
        pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
        pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
        print("Fit curves seem not to be available")

    plt.legend()
    plt.show()
if __name__=='__main__':

    # Set default values

    # GPU style can be Cuda (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Iterations is integer
    Iterations=1000000000
    # BlocksBegin is the first number of Blocks to explore
    BlocksBegin=1024
    # BlocksEnd is the last number of Blocks to explore
    BlocksEnd=1024
    # BlocksStep is the step of Blocks to explore
    BlocksStep=1
    # ThreadsBegin is the first number of Threads to explore
    ThreadsBegin=1
    # ThreadsEnd is the last number of Threads to explore
    ThreadsEnd=1
    # ThreadsStep is the step of Threads to explore
    ThreadsStep=1
    # Redo is the number of times the test is redone to improve metrology
    Redo=1
    # OutMetrology is the method for duration estimation : False means timing inside the XPU
    OutMetrology=False
    Metrology='InMetro'
    # Curves is True to print the curves
    Curves=False
    # Fit is True to fit the measured durations against the scaling laws
    Fit=False
    # Inside test based on If/Then rather than on a ternary expression
    IfThen=False
    # Marsaglia RNG
    RNG='MWC'
    # Value type : INT32, INT64, FP32, FP64
    ValueType='FP32'
    # Seeds for RNG
    Seeds=110271,101008

    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt == '-o':
            OutMetrology=True
            Metrology='OutMetro'
        elif opt == '-c':
            Curves=True
        elif opt == '-k':
            IfThen=True
        elif opt in ("-d", "--device"):
            Devices.append(int(arg))
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-m", "--marsaglia"):
            RNG = arg
        elif opt in ("-v", "--valuetype"):
            ValueType = arg
        elif opt in ("-i", "--iterations"):
            Iterations = numpy.uint64(arg)
        elif opt in ("-b", "--blocksbegin"):
            BlocksBegin = int(arg)
            BlocksEnd = BlocksBegin
        elif opt in ("-e", "--blocksend"):
            BlocksEnd = int(arg)
        elif opt in ("-s", "--blocksstep"):
            BlocksStep = int(arg)
        elif opt in ("-f", "--threadsfirst"):
            ThreadsBegin = int(arg)
            ThreadsEnd = ThreadsBegin
        elif opt in ("-l", "--threadslast"):
            ThreadsEnd = int(arg)
        elif opt in ("-t", "--threadsstep"):
            ThreadsStep = int(arg)
        elif opt in ("-r", "--redo"):
            Redo = int(arg)

    # If no device has been specified, take the first one!
    if len(Devices)==0:
        Devices.append(0)

    print("Devices Identification : %s" % Devices)
793 127 equemene
    print("GpuStyle used : %s" % GpuStyle)
794 127 equemene
    print("Iterations : %s" % Iterations)
795 127 equemene
    print("Number of Blocks on begin : %s" % BlocksBegin)
796 127 equemene
    print("Number of Blocks on end : %s" % BlocksEnd)
797 127 equemene
    print("Step on Blocks : %s" % BlocksStep)
798 127 equemene
    print("Number of Threads on begin : %s" % ThreadsBegin)
799 127 equemene
    print("Number of Threads on end : %s" % ThreadsEnd)
800 127 equemene
    print("Step on Threads : %s" % ThreadsStep)
801 127 equemene
    print("Number of redo : %s" % Redo)
802 127 equemene
    print("Metrology done out of XPU : %r" % OutMetrology)
803 127 equemene
    print("Type of Marsaglia RNG used : %s" % RNG)
804 127 equemene
    print("Type of variable : %s" % ValueType)
805 102 equemene
806 102 equemene
    if GpuStyle=='CUDA':
807 102 equemene
        try:
808 102 equemene
            # For PyCUDA import
809 102 equemene
            import pycuda.driver as cuda
810 122 equemene
811 122 equemene
            cuda.init()
812 122 equemene
            for Id in range(cuda.Device.count()):
813 122 equemene
                device=cuda.Device(Id)
814 127 equemene
                print("Device #%i of type GPU : %s" % (Id,device.name()))
815 122 equemene
                if Id in Devices:
816 122 equemene
                    Alu[Id]='GPU'
817 122 equemene
818 102 equemene
        except ImportError:
819 127 equemene
            print("Platform does not seem to support CUDA")
820 102 equemene
821 102 equemene
    if GpuStyle=='OpenCL':
822 102 equemene
        try:
823 102 equemene
            # For PyOpenCL import
824 102 equemene
            import pyopencl as cl
825 122 equemene
            Id=0
826 102 equemene
            for platform in cl.get_platforms():
827 102 equemene
                for device in platform.get_devices():
828 138 equemene
                    #deviceType=cl.device_type.to_string(device.type)
829 152 equemene
                    deviceType="xPU"
830 127 equemene
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
831 102 equemene
832 104 equemene
                    if Id in Devices:
833 102 equemene
                    # Set the Alu as detected Device Type
834 104 equemene
                        Alu[Id]=deviceType
835 102 equemene
                    Id=Id+1
836 102 equemene
        except ImportError:
837 127 equemene
            print("Platform does not seem to support OpenCL")
838 104 equemene
    #print(Devices,Alu)

    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

    ExploredJobs=numpy.array([]).astype(numpy.uint32)
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
    avgD=numpy.array([]).astype(numpy.float32)
    medD=numpy.array([]).astype(numpy.float32)
    stdD=numpy.array([]).astype(numpy.float32)
    minD=numpy.array([]).astype(numpy.float32)
    maxD=numpy.array([]).astype(numpy.float32)
    avgR=numpy.array([]).astype(numpy.float32)
    medR=numpy.array([]).astype(numpy.float32)
    stdR=numpy.array([]).astype(numpy.float32)
    minR=numpy.array([]).astype(numpy.float32)
    maxR=numpy.array([]).astype(numpy.float32)

    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

        # print Blocks,Threads
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
        ExploredThreads=numpy.append(ExploredThreads,Threads)

        if OutMetrology:
            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                start=time.time()
                if GpuStyle=='CUDA':
                    try:
                        InputCU={}
                        InputCU['Iterations']=Iterations
                        InputCU['Steps']=1
                        InputCU['Blocks']=Blocks
                        InputCU['Threads']=Threads
                        InputCU['Device']=Devices[0]
                        InputCU['RNG']=RNG
                        InputCU['Seeds']=Seeds
                        InputCU['ValueType']=ValueType
                        InputCU['IfThen']=IfThen
                        OutputCU=MetropolisCuda(InputCU)
                        Inside=OutputCU['Inside']
                        NewIterations=OutputCU['NewIterations']
                        Duration=OutputCU['Duration']
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        InputCL={}
                        InputCL['Iterations']=Iterations
                        InputCL['Steps']=1
                        InputCL['Blocks']=Blocks
                        InputCL['Threads']=Threads
                        InputCL['Device']=Devices[0]
                        InputCL['RNG']=RNG
                        InputCL['Seeds']=Seeds
                        InputCL['ValueType']=ValueType
                        InputCL['IfThen']=IfThen
                        OutputCL=MetropolisOpenCL(InputCL)
                        Inside=OutputCL['Inside']
                        NewIterations=OutputCL['NewIterations']
                        Duration=OutputCL['Duration']
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                Duration=numpy.append(Duration,time.time()-start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
        else:
            if GpuStyle=='CUDA':
                try:
                    InputCU={}
                    InputCU['Iterations']=Iterations
                    InputCU['Steps']=Redo
                    InputCU['Blocks']=Blocks
                    InputCU['Threads']=Threads
                    InputCU['Device']=Devices[0]
                    InputCU['RNG']=RNG
                    InputCU['Seeds']=Seeds
                    InputCU['ValueType']=ValueType
                    InputCU['IfThen']=IfThen
                    OutputCU=MetropolisCuda(InputCU)
                    Inside=OutputCU['Inside']
                    NewIterations=OutputCU['NewIterations']
                    Duration=OutputCU['Duration']
                    # The CUDA context is already released inside MetropolisCuda
                except:
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
            elif GpuStyle=='OpenCL':
                try:
                    InputCL={}
                    InputCL['Iterations']=Iterations
                    InputCL['Steps']=Redo
                    InputCL['Blocks']=Blocks
                    InputCL['Threads']=Threads
                    InputCL['Device']=Devices[0]
                    InputCL['RNG']=RNG
                    InputCL['Seeds']=Seeds
                    InputCL['ValueType']=ValueType
                    InputCL['IfThen']=IfThen
                    OutputCL=MetropolisOpenCL(InputCL)
                    Inside=OutputCL['Inside']
                    NewIterations=OutputCL['NewIterations']
                    Duration=OutputCL['Duration']
                except:
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
            Rate=NewIterations/Duration[-1]
            print("Itops %i\nLogItops %.2f " % (int(Rate),numpy.log(Rate)/numpy.log(10)))
            print("Pi estimation %.8f" % (4./NewIterations*Inside))

        avgD=numpy.append(avgD,numpy.average(Duration))
        medD=numpy.append(medD,numpy.median(Duration))
        stdD=numpy.append(stdD,numpy.std(Duration))
        minD=numpy.append(minD,numpy.min(Duration))
        maxD=numpy.append(maxD,numpy.max(Duration))
        avgR=numpy.append(avgR,numpy.average(Rate))
        medR=numpy.append(medR,numpy.median(Rate))
        stdR=numpy.append(stdR,numpy.std(Rate))
        minR=numpy.append(minR,numpy.min(Rate))
        maxR=numpy.append(maxR,numpy.max(Rate))

        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
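        # Columns saved above, one row per explored configuration:
        # Blocks Threads avgD medD stdD minD maxD avgR medR stdR minR maxR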

    if Fit:
        FitAndPrint(ExploredJobs,medD,Curves)