Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 185

Historique | Voir | Annoter | Télécharger (31,02 ko)

1 127 equemene
#!/usr/bin/env python3
2 102 equemene
3 102 equemene
#
4 102 equemene
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5 102 equemene
#
6 102 equemene
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
7 102 equemene
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8 102 equemene
#
9 102 equemene
# Thanks to Andreas Klockner for PyCUDA:
10 102 equemene
# http://mathema.tician.de/software/pycuda
11 102 equemene
# Thanks to Andreas Klockner for PyOpenCL:
12 102 equemene
# http://mathema.tician.de/software/pyopencl
13 102 equemene
#
14 102 equemene
15 102 equemene
# 2013-01-01 : problems with launch timeout
16 102 equemene
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17 102 equemene
# Option "Interactive" "0" in /etc/X11/xorg.conf
18 102 equemene
19 102 equemene
# Common tools
20 102 equemene
import numpy
21 102 equemene
from numpy.random import randint as nprnd
22 102 equemene
import sys
23 102 equemene
import getopt
24 102 equemene
import time
25 102 equemene
import itertools
26 102 equemene
from socket import gethostname
27 102 equemene
28 104 equemene
class PenStacle:
29 104 equemene
    """Pentacle of Statistics from data"""
30 104 equemene
    Avg=0
31 104 equemene
    Med=0
32 104 equemene
    Std=0
33 104 equemene
    Min=0
34 104 equemene
    Max=0
35 104 equemene
    def __init__(self,Data):
36 104 equemene
        self.Avg=numpy.average(Data)
37 104 equemene
        self.Med=numpy.median(Data)
38 104 equemene
        self.Std=numpy.std(Data)
39 104 equemene
        self.Max=numpy.max(Data)
40 104 equemene
        self.Min=numpy.min(Data)
41 104 equemene
    def display(self):
42 127 equemene
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
43 104 equemene
44 104 equemene
class Experience:
45 104 equemene
    """Metrology for experiences"""
46 104 equemene
    DeviceStyle=''
47 104 equemene
    DeviceId=0
48 104 equemene
    AvgD=0
49 104 equemene
    MedD=0
50 104 equemene
    StdD=0
51 104 equemene
    MinD=0
52 104 equemene
    MaxD=0
53 104 equemene
    AvgR=0
54 104 equemene
    MedR=0
55 104 equemene
    StdR=0
56 104 equemene
    MinR=0
57 104 equemene
    MaxR=0
58 104 equemene
    def __init__(self,DeviceStyle,DeviceId,Iterations):
59 104 equemene
        self.DeviceStyle=DeviceStyle
60 104 equemene
        self.DeviceId=DeviceId
61 104 equemene
        self.Iterations
62 104 equemene
63 104 equemene
    def Metrology(self,Data):
64 104 equemene
        Duration=PenStacle(Data)
65 104 equemene
        Rate=PenStacle(Iterations/Data)
66 127 equemene
        print("Duration %s" % Duration)
67 127 equemene
        print("Rate %s" % Rate)
68 104 equemene
69 105 equemene
70 105 equemene
71 103 equemene
def DictionariesAPI():
72 103 equemene
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
73 103 equemene
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
74 181 equemene
    Test={True:1,False:0}
75 181 equemene
    return(Marsaglia,Computing,Test)
76 103 equemene
77 102 equemene
# find prime factors of a number
78 102 equemene
# Get for WWW :
79 102 equemene
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
80 102 equemene
def PrimeFactors(x):
81 102 equemene
82 102 equemene
    factorlist=numpy.array([]).astype('uint32')
83 102 equemene
    loop=2
84 102 equemene
    while loop<=x:
85 102 equemene
        if x%loop==0:
86 102 equemene
            x/=loop
87 102 equemene
            factorlist=numpy.append(factorlist,[loop])
88 102 equemene
        else:
89 102 equemene
            loop+=1
90 102 equemene
    return factorlist
91 102 equemene
92 102 equemene
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
93 102 equemene
# output is thread number
94 102 equemene
def BestThreadsNumber(jobs):
95 102 equemene
    factors=PrimeFactors(jobs)
96 102 equemene
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
97 102 equemene
    threads=1
98 102 equemene
    for factor in matrix.transpose().ravel():
99 102 equemene
        threads=threads*factor
100 102 equemene
        if threads*threads>jobs or threads>512:
101 102 equemene
            break
102 102 equemene
    return(long(threads))
103 102 equemene
104 102 equemene
# Predicted Amdahl Law (Reduced with s=1-p)
105 102 equemene
def AmdahlR(N, T1, p):
106 102 equemene
    return (T1*(1-p+p/N))
107 102 equemene
108 102 equemene
# Predicted Amdahl Law
109 102 equemene
def Amdahl(N, T1, s, p):
110 102 equemene
    return (T1*(s+p/N))
111 102 equemene
112 102 equemene
# Predicted Mylq Law with first order
113 102 equemene
def Mylq(N, T1,s,c,p):
114 102 equemene
    return (T1*(s+p/N)+c*N)
115 102 equemene
116 102 equemene
# Predicted Mylq Law with second order
117 102 equemene
def Mylq2(N, T1,s,c1,c2,p):
118 102 equemene
    return (T1*(s+p/N)+c1*N+c2*N*N)
119 102 equemene
120 103 equemene
def KernelCodeCuda():
121 103 equemene
    KERNEL_CODE_CUDA="""
122 102 equemene
#define TCONG 0
123 102 equemene
#define TSHR3 1
124 102 equemene
#define TMWC 2
125 102 equemene
#define TKISS 3
126 102 equemene

127 102 equemene
#define TINT32 0
128 102 equemene
#define TINT64 1
129 102 equemene
#define TFP32 2
130 102 equemene
#define TFP64 3
131 102 equemene

132 181 equemene
#define IFTHEN 1
133 181 equemene

134 102 equemene
// Marsaglia RNG very simple implementation
135 102 equemene

136 102 equemene
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
137 102 equemene
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
138 102 equemene
#define MWC   (znew+wnew)
139 102 equemene
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
140 102 equemene
#define CONG  (jcong=69069*jcong+1234567)
141 102 equemene
#define KISS  ((MWC^CONG)+SHR3)
142 102 equemene

143 102 equemene
#define MWCfp MWC * 2.328306435454494e-10f
144 102 equemene
#define KISSfp KISS * 2.328306435454494e-10f
145 102 equemene
#define SHR3fp SHR3 * 2.328306435454494e-10f
146 102 equemene
#define CONGfp CONG * 2.328306435454494e-10f
147 102 equemene

148 102 equemene
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
149 102 equemene
{
150 102 equemene

151 102 equemene
#if TRNG == TCONG
152 102 equemene
   uint jcong=seed_z+work;
153 102 equemene
#elif TRNG == TSHR3
154 102 equemene
   uint jsr=seed_w+work;
155 102 equemene
#elif TRNG == TMWC
156 102 equemene
   uint z=seed_z+work;
157 102 equemene
   uint w=seed_w+work;
158 102 equemene
#elif TRNG == TKISS
159 102 equemene
   uint jcong=seed_z+work;
160 102 equemene
   uint jsr=seed_w+work;
161 102 equemene
   uint z=seed_z-work;
162 102 equemene
   uint w=seed_w-work;
163 102 equemene
#endif
164 102 equemene

165 102 equemene
   ulong total=0;
166 102 equemene

167 102 equemene
   for (ulong i=0;i<iterations;i++) {
168 102 equemene

169 102 equemene
#if TYPE == TINT32
170 102 equemene
    #define THEONE 1073741824
171 102 equemene
    #if TRNG == TCONG
172 102 equemene
        uint x=CONG>>17 ;
173 102 equemene
        uint y=CONG>>17 ;
174 102 equemene
    #elif TRNG == TSHR3
175 102 equemene
        uint x=SHR3>>17 ;
176 102 equemene
        uint y=SHR3>>17 ;
177 102 equemene
    #elif TRNG == TMWC
178 102 equemene
        uint x=MWC>>17 ;
179 102 equemene
        uint y=MWC>>17 ;
180 102 equemene
    #elif TRNG == TKISS
181 102 equemene
        uint x=KISS>>17 ;
182 102 equemene
        uint y=KISS>>17 ;
183 102 equemene
    #endif
184 102 equemene
#elif TYPE == TINT64
185 102 equemene
    #define THEONE 4611686018427387904
186 102 equemene
    #if TRNG == TCONG
187 102 equemene
        ulong x=(ulong)(CONG>>1) ;
188 102 equemene
        ulong y=(ulong)(CONG>>1) ;
189 102 equemene
    #elif TRNG == TSHR3
190 102 equemene
        ulong x=(ulong)(SHR3>>1) ;
191 102 equemene
        ulong y=(ulong)(SHR3>>1) ;
192 102 equemene
    #elif TRNG == TMWC
193 102 equemene
        ulong x=(ulong)(MWC>>1) ;
194 102 equemene
        ulong y=(ulong)(MWC>>1) ;
195 102 equemene
    #elif TRNG == TKISS
196 102 equemene
        ulong x=(ulong)(KISS>>1) ;
197 102 equemene
        ulong y=(ulong)(KISS>>1) ;
198 102 equemene
    #endif
199 102 equemene
#elif TYPE == TFP32
200 102 equemene
    #define THEONE 1.0f
201 102 equemene
    #if TRNG == TCONG
202 102 equemene
        float x=CONGfp ;
203 102 equemene
        float y=CONGfp ;
204 102 equemene
    #elif TRNG == TSHR3
205 102 equemene
        float x=SHR3fp ;
206 102 equemene
        float y=SHR3fp ;
207 102 equemene
    #elif TRNG == TMWC
208 102 equemene
        float x=MWCfp ;
209 102 equemene
        float y=MWCfp ;
210 102 equemene
    #elif TRNG == TKISS
211 102 equemene
      float x=KISSfp ;
212 102 equemene
      float y=KISSfp ;
213 102 equemene
    #endif
214 102 equemene
#elif TYPE == TFP64
215 102 equemene
    #define THEONE 1.0f
216 102 equemene
    #if TRNG == TCONG
217 102 equemene
        double x=(double)CONGfp ;
218 102 equemene
        double y=(double)CONGfp ;
219 102 equemene
    #elif TRNG == TSHR3
220 102 equemene
        double x=(double)SHR3fp ;
221 102 equemene
        double y=(double)SHR3fp ;
222 102 equemene
    #elif TRNG == TMWC
223 102 equemene
        double x=(double)MWCfp ;
224 102 equemene
        double y=(double)MWCfp ;
225 102 equemene
    #elif TRNG == TKISS
226 102 equemene
        double x=(double)KISSfp ;
227 102 equemene
        double y=(double)KISSfp ;
228 102 equemene
    #endif
229 102 equemene
#endif
230 102 equemene

231 181 equemene
#if TEST == IFTHEN
232 181 equemene
      if ((x*x+y*y) <=THEONE) {
233 181 equemene
         total+=1;
234 181 equemene
      }
235 181 equemene
#else
236 102 equemene
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
237 102 equemene
      total+=inside;
238 181 equemene
#endif
239 102 equemene
   }
240 102 equemene

241 102 equemene
   return(total);
242 102 equemene
}
243 102 equemene

244 102 equemene
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
245 102 equemene
{
246 102 equemene
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
247 102 equemene
   s[blockIdx.x]=total;
248 102 equemene
   __syncthreads();
249 102 equemene

250 102 equemene
}
251 102 equemene

252 102 equemene
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
253 102 equemene
{
254 102 equemene
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
255 102 equemene
   s[threadIdx.x]=total;
256 102 equemene
   __syncthreads();
257 102 equemene

258 102 equemene
}
259 102 equemene

260 102 equemene
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
261 102 equemene
{
262 102 equemene
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
263 102 equemene
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
264 102 equemene
   __syncthreads();
265 102 equemene
}
266 102 equemene

267 102 equemene
"""
268 103 equemene
    return(KERNEL_CODE_CUDA)
269 102 equemene
270 103 equemene
def KernelCodeOpenCL():
271 103 equemene
    KERNEL_CODE_OPENCL="""
272 102 equemene
#define TCONG 0
273 102 equemene
#define TSHR3 1
274 102 equemene
#define TMWC 2
275 102 equemene
#define TKISS 3
276 102 equemene

277 102 equemene
#define TINT32 0
278 102 equemene
#define TINT64 1
279 102 equemene
#define TFP32 2
280 102 equemene
#define TFP64 3
281 102 equemene

282 181 equemene
#define IFTHEN 1
283 181 equemene

284 102 equemene
// Marsaglia RNG very simple implementation
285 102 equemene
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
286 102 equemene
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
287 102 equemene

288 102 equemene
#define MWC   (znew+wnew)
289 102 equemene
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
290 102 equemene
#define CONG  (jcong=69069*jcong+1234567)
291 102 equemene
#define KISS  ((MWC^CONG)+SHR3)
292 102 equemene

293 102 equemene
#define MWCfp MWC * 2.328306435454494e-10f
294 102 equemene
#define KISSfp KISS * 2.328306435454494e-10f
295 102 equemene
#define CONGfp CONG * 2.328306435454494e-10f
296 102 equemene
#define SHR3fp SHR3 * 2.328306435454494e-10f
297 102 equemene

298 102 equemene
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
299 102 equemene
{
300 102 equemene

301 102 equemene
#if TRNG == TCONG
302 102 equemene
   uint jcong=seed_z+work;
303 102 equemene
#elif TRNG == TSHR3
304 102 equemene
   uint jsr=seed_w+work;
305 102 equemene
#elif TRNG == TMWC
306 102 equemene
   uint z=seed_z+work;
307 102 equemene
   uint w=seed_w+work;
308 102 equemene
#elif TRNG == TKISS
309 102 equemene
   uint jcong=seed_z+work;
310 102 equemene
   uint jsr=seed_w+work;
311 102 equemene
   uint z=seed_z-work;
312 102 equemene
   uint w=seed_w-work;
313 102 equemene
#endif
314 102 equemene

315 102 equemene
   ulong total=0;
316 102 equemene

317 102 equemene
   for (ulong i=0;i<iterations;i++) {
318 102 equemene

319 102 equemene
#if TYPE == TINT32
320 102 equemene
    #define THEONE 1073741824
321 102 equemene
    #if TRNG == TCONG
322 102 equemene
        uint x=CONG>>17 ;
323 102 equemene
        uint y=CONG>>17 ;
324 102 equemene
    #elif TRNG == TSHR3
325 102 equemene
        uint x=SHR3>>17 ;
326 102 equemene
        uint y=SHR3>>17 ;
327 102 equemene
    #elif TRNG == TMWC
328 102 equemene
        uint x=MWC>>17 ;
329 102 equemene
        uint y=MWC>>17 ;
330 102 equemene
    #elif TRNG == TKISS
331 102 equemene
        uint x=KISS>>17 ;
332 102 equemene
        uint y=KISS>>17 ;
333 102 equemene
    #endif
334 102 equemene
#elif TYPE == TINT64
335 102 equemene
    #define THEONE 4611686018427387904
336 102 equemene
    #if TRNG == TCONG
337 102 equemene
        ulong x=(ulong)(CONG>>1) ;
338 102 equemene
        ulong y=(ulong)(CONG>>1) ;
339 102 equemene
    #elif TRNG == TSHR3
340 102 equemene
        ulong x=(ulong)(SHR3>>1) ;
341 102 equemene
        ulong y=(ulong)(SHR3>>1) ;
342 102 equemene
    #elif TRNG == TMWC
343 102 equemene
        ulong x=(ulong)(MWC>>1) ;
344 102 equemene
        ulong y=(ulong)(MWC>>1) ;
345 102 equemene
    #elif TRNG == TKISS
346 102 equemene
        ulong x=(ulong)(KISS>>1) ;
347 102 equemene
        ulong y=(ulong)(KISS>>1) ;
348 102 equemene
    #endif
349 102 equemene
#elif TYPE == TFP32
350 102 equemene
    #define THEONE 1.0f
351 102 equemene
    #if TRNG == TCONG
352 102 equemene
        float x=CONGfp ;
353 102 equemene
        float y=CONGfp ;
354 102 equemene
    #elif TRNG == TSHR3
355 102 equemene
        float x=SHR3fp ;
356 102 equemene
        float y=SHR3fp ;
357 102 equemene
    #elif TRNG == TMWC
358 102 equemene
        float x=MWCfp ;
359 102 equemene
        float y=MWCfp ;
360 102 equemene
    #elif TRNG == TKISS
361 102 equemene
      float x=KISSfp ;
362 102 equemene
      float y=KISSfp ;
363 102 equemene
    #endif
364 102 equemene
#elif TYPE == TFP64
365 102 equemene
#pragma OPENCL EXTENSION cl_khr_fp64: enable
366 102 equemene
    #define THEONE 1.0f
367 102 equemene
    #if TRNG == TCONG
368 102 equemene
        double x=(double)CONGfp ;
369 102 equemene
        double y=(double)CONGfp ;
370 102 equemene
    #elif TRNG == TSHR3
371 102 equemene
        double x=(double)SHR3fp ;
372 102 equemene
        double y=(double)SHR3fp ;
373 102 equemene
    #elif TRNG == TMWC
374 102 equemene
        double x=(double)MWCfp ;
375 102 equemene
        double y=(double)MWCfp ;
376 102 equemene
    #elif TRNG == TKISS
377 102 equemene
        double x=(double)KISSfp ;
378 102 equemene
        double y=(double)KISSfp ;
379 102 equemene
    #endif
380 102 equemene
#endif
381 102 equemene

382 181 equemene
#if TEST == IFTHEN
383 181 equemene
      if ((x*x+y*y) <= THEONE) {
384 181 equemene
         total+=1;
385 181 equemene
      }
386 181 equemene
#else
387 102 equemene
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
388 102 equemene
      total+=inside;
389 181 equemene
#endif
390 102 equemene
   }
391 102 equemene

392 102 equemene
   return(total);
393 102 equemene
}
394 102 equemene

395 102 equemene
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
396 102 equemene
{
397 102 equemene
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
398 102 equemene
   barrier(CLK_GLOBAL_MEM_FENCE);
399 102 equemene
   s[get_global_id(0)]=total;
400 102 equemene
}
401 102 equemene

402 102 equemene
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
403 102 equemene
{
404 102 equemene
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
405 102 equemene
   barrier(CLK_LOCAL_MEM_FENCE);
406 102 equemene
   s[get_local_id(0)]=total;
407 102 equemene
}
408 102 equemene

409 102 equemene
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
410 102 equemene
{
411 102 equemene
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
412 102 equemene
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
413 102 equemene
   s[get_global_id(0)]=total;
414 102 equemene
}
415 102 equemene

416 102 equemene
"""
417 103 equemene
    return(KERNEL_CODE_OPENCL)
418 103 equemene
419 122 equemene
def MetropolisCuda(InputCU):
420 102 equemene
421 127 equemene
    print("Inside ",InputCU)
422 122 equemene
423 122 equemene
    iterations=InputCU['Iterations']
424 122 equemene
    steps=InputCU['Steps']
425 122 equemene
    blocks=InputCU['Blocks']
426 122 equemene
    threads=InputCU['Threads']
427 122 equemene
    Device=InputCU['Device']
428 122 equemene
    RNG=InputCU['RNG']
429 122 equemene
    ValueType=InputCU['ValueType']
430 181 equemene
    TestType=InputCU['IfThen']
431 122 equemene
432 181 equemene
    Marsaglia,Computing,Test=DictionariesAPI()
433 103 equemene
434 122 equemene
    try:
435 122 equemene
        # For PyCUDA import
436 122 equemene
        import pycuda.driver as cuda
437 122 equemene
        from pycuda.compiler import SourceModule
438 122 equemene
439 122 equemene
        cuda.init()
440 122 equemene
        for Id in range(cuda.Device.count()):
441 122 equemene
            if Id==Device:
442 122 equemene
                XPU=cuda.Device(Id)
443 127 equemene
                print("GPU selected %s" % XPU.name())
444 122 equemene
        print
445 122 equemene
446 122 equemene
    except ImportError:
447 127 equemene
        print("Platform does not seem to support CUDA")
448 123 equemene
449 122 equemene
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
450 102 equemene
    circleCU = cuda.InOut(circle)
451 123 equemene
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
452 123 equemene
    #cuda.memcpy_htod(circleCU, circle)
453 102 equemene
454 152 equemene
    Context=XPU.make_context()
455 152 equemene
456 102 equemene
    try:
457 152 equemene
        #mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
458 177 equemene
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
459 177 equemene
        # Needed to set the compiler via ccbin for CUDA9 implementation
460 181 equemene
        mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.8','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
461 102 equemene
    except:
462 177 equemene
        print("Compilation seems to break")
463 152 equemene
464 102 equemene
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
465 123 equemene
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
466 102 equemene
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")
467 123 equemene
468 102 equemene
    MyDuration=numpy.zeros(steps)
469 102 equemene
470 122 equemene
    jobs=blocks*threads;
471 102 equemene
472 122 equemene
    iterationsCU=numpy.uint64(iterations/jobs)
473 122 equemene
    if iterations%jobs!=0:
474 123 equemene
        iterationsCU+=numpy.uint64(1)
475 122 equemene
476 102 equemene
    for i in range(steps):
477 122 equemene
        start_time=time.time()
478 122 equemene
479 123 equemene
        try:
480 123 equemene
            MetropolisHybridCU(circleCU,
481 123 equemene
                               numpy.uint64(iterationsCU),
482 123 equemene
                               numpy.uint32(nprnd(2**32)),
483 123 equemene
                               numpy.uint32(nprnd(2**32)),
484 123 equemene
                               grid=(blocks,1),block=(threads,1,1))
485 123 equemene
        except:
486 127 equemene
            print("Crash during CUDA call")
487 102 equemene
488 122 equemene
        elapsed = time.time()-start_time
489 127 equemene
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
490 122 equemene
491 102 equemene
        MyDuration[i]=elapsed
492 102 equemene
493 122 equemene
    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
494 127 equemene
    print(OutputCU)
495 152 equemene
    Context.pop()
496 122 equemene
497 177 equemene
    Context.detach()
498 122 equemene
    return(OutputCU)
499 102 equemene
500 106 equemene
def MetropolisOpenCL(InputCL):
501 103 equemene
502 103 equemene
    import pyopencl as cl
503 105 equemene
504 127 equemene
    print("Inside ",InputCL)
505 107 equemene
506 106 equemene
    iterations=InputCL['Iterations']
507 106 equemene
    steps=InputCL['Steps']
508 106 equemene
    blocks=InputCL['Blocks']
509 106 equemene
    threads=InputCL['Threads']
510 106 equemene
    Device=InputCL['Device']
511 181 equemene
    RNG=InputCL['RNG']
512 106 equemene
    ValueType=InputCL['ValueType']
513 181 equemene
    TestType=InputCL['IfThen']
514 181 equemene
515 181 equemene
    Marsaglia,Computing,Test=DictionariesAPI()
516 103 equemene
517 102 equemene
    # Initialisation des variables en les CASTant correctement
518 122 equemene
    Id=0
519 102 equemene
    HasXPU=False
520 102 equemene
    for platform in cl.get_platforms():
521 102 equemene
        for device in platform.get_devices():
522 102 equemene
            if Id==Device:
523 102 equemene
                XPU=device
524 127 equemene
                print("CPU/GPU selected: ",device.name.lstrip())
525 102 equemene
                HasXPU=True
526 102 equemene
            Id+=1
527 102 equemene
528 102 equemene
    if HasXPU==False:
529 127 equemene
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
530 102 equemene
        sys.exit()
531 102 equemene
532 102 equemene
    # Je cree le contexte et la queue pour son execution
533 106 equemene
    try:
534 106 equemene
        ctx = cl.Context([XPU])
535 106 equemene
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
536 106 equemene
    except:
537 127 equemene
        print("Crash during context creation")
538 102 equemene
539 102 equemene
    # Je recupere les flag possibles pour les buffers
540 102 equemene
    mf = cl.mem_flags
541 102 equemene
542 106 equemene
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
543 102 equemene
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
544 106 equemene
545 181 equemene
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))
546 102 equemene
547 102 equemene
    MyDuration=numpy.zeros(steps)
548 102 equemene
549 102 equemene
    jobs=blocks*threads;
550 102 equemene
551 106 equemene
    iterationsCL=numpy.uint64(iterations/jobs)
552 106 equemene
    if iterations%jobs!=0:
553 106 equemene
        iterationsCL+=1
554 102 equemene
555 102 equemene
    for i in range(steps):
556 102 equemene
        start_time=time.time()
557 102 equemene
        if threads == 1:
558 102 equemene
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
559 102 equemene
                                                 circleCL,
560 102 equemene
                                                 numpy.uint64(iterationsCL),
561 102 equemene
                                                 numpy.uint32(nprnd(2**32)),
562 102 equemene
                                                 numpy.uint32(nprnd(2**32)))
563 102 equemene
        else:
564 102 equemene
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
565 102 equemene
                                                 circleCL,
566 102 equemene
                                                 numpy.uint64(iterationsCL),
567 102 equemene
                                                 numpy.uint32(nprnd(2**32)),
568 102 equemene
                                                 numpy.uint32(nprnd(2**32)))
569 102 equemene
570 102 equemene
        CLLaunch.wait()
571 102 equemene
        cl.enqueue_copy(queue, circle, circleCL).wait()
572 102 equemene
573 102 equemene
        elapsed = time.time()-start_time
574 127 equemene
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
575 102 equemene
576 104 equemene
        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
577 102 equemene
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
578 102 equemene
579 102 equemene
        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
580 102 equemene
        MyDuration[i]=elapsed
581 104 equemene
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
582 104 equemene
        # MyPi[i]=numpy.median(AllPi)
583 102 equemene
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
584 102 equemene
585 102 equemene
    circleCL.release()
586 102 equemene
587 106 equemene
    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
588 127 equemene
    print(OutputCL)
589 106 equemene
    return(OutputCL)
590 102 equemene
591 102 equemene
592 102 equemene
def FitAndPrint(N,D,Curves):
593 102 equemene
594 102 equemene
    from scipy.optimize import curve_fit
595 102 equemene
    import matplotlib.pyplot as plt
596 102 equemene
597 102 equemene
    try:
598 102 equemene
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
599 102 equemene
600 102 equemene
        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
601 102 equemene
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
602 102 equemene
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
603 102 equemene
        coeffs_Amdahl[0]=D[0]
604 127 equemene
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
605 102 equemene
    except:
606 127 equemene
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))
607 102 equemene
608 102 equemene
    try:
609 102 equemene
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
610 102 equemene
611 102 equemene
        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
612 102 equemene
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
613 102 equemene
        coeffs_AmdahlR[0]=D[0]
614 127 equemene
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
615 102 equemene
616 102 equemene
    except:
617 127 equemene
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
618 102 equemene
619 102 equemene
    try:
620 102 equemene
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
621 102 equemene
622 102 equemene
        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
623 102 equemene
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
624 102 equemene
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
625 102 equemene
        coeffs_Mylq[0]=D[0]
626 127 equemene
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
627 102 equemene
                                                                coeffs_Mylq[1],
628 102 equemene
                                                                coeffs_Mylq[3],
629 127 equemene
                                                                coeffs_Mylq[2]))
630 102 equemene
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
631 102 equemene
                    coeffs_Mylq[3])
632 102 equemene
    except:
633 127 equemene
        print("Impossible to fit for Mylq law : only %i elements" % len(D))
634 102 equemene
635 102 equemene
    try:
636 102 equemene
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
637 102 equemene
638 102 equemene
        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
639 102 equemene
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
640 102 equemene
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
641 102 equemene
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
642 102 equemene
        coeffs_Mylq2[0]=D[0]
643 127 equemene
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
644 102 equemene
645 102 equemene
    except:
646 127 equemene
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
647 102 equemene
648 102 equemene
    if Curves:
649 102 equemene
        plt.xlabel("Number of Threads/work Items")
650 102 equemene
        plt.ylabel("Total Elapsed Time")
651 102 equemene
652 102 equemene
        Experience,=plt.plot(N,D,'ro')
653 102 equemene
    try:
654 102 equemene
        pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")
655 102 equemene
        pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
656 102 equemene
    except:
657 127 equemene
        print("Fit curves seem not to be available")
658 102 equemene
659 102 equemene
    plt.legend()
660 102 equemene
    plt.show()
661 102 equemene
662 102 equemene
if __name__=='__main__':
663 103 equemene
664 102 equemene
    # Set defaults values
665 102 equemene
666 102 equemene
    # Id of Device : 1 is for first find !
667 102 equemene
    Device=1
668 102 equemene
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
669 102 equemene
    GpuStyle='OpenCL'
670 102 equemene
    # Iterations is integer
671 104 equemene
    Iterations=10000000
672 102 equemene
    # BlocksBlocks in first number of Blocks to explore
673 102 equemene
    BlocksBegin=1
674 102 equemene
    # BlocksEnd is last number of Blocks to explore
675 152 equemene
    BlocksEnd=1
676 102 equemene
    # BlocksStep is the step of Blocks to explore
677 102 equemene
    BlocksStep=1
678 102 equemene
    # ThreadsBlocks in first number of Blocks to explore
679 102 equemene
    ThreadsBegin=1
680 102 equemene
    # ThreadsEnd is last number of Blocks to explore
681 102 equemene
    ThreadsEnd=1
682 102 equemene
    # ThreadsStep is the step of Blocks to explore
683 102 equemene
    ThreadsStep=1
684 102 equemene
    # Redo is the times to redo the test to improve metrology
685 102 equemene
    Redo=1
686 102 equemene
    # OutMetrology is method for duration estimation : False is GPU inside
687 102 equemene
    OutMetrology=False
688 102 equemene
    Metrology='InMetro'
689 102 equemene
    # Curves is True to print the curves
690 102 equemene
    Curves=False
691 102 equemene
    # Fit is True to print the curves
692 102 equemene
    Fit=False
693 181 equemene
    # Inside based on If
694 181 equemene
    IfThen=False
695 102 equemene
    # Marsaglia RNG
696 102 equemene
    RNG='MWC'
697 102 equemene
    # Value type : INT32, INT64, FP32, FP64
698 102 equemene
    ValueType='FP32'
699 102 equemene
700 181 equemene
    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
701 102 equemene
702 102 equemene
    try:
703 181 equemene
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
704 102 equemene
    except getopt.GetoptError:
705 127 equemene
        print(HowToUse % sys.argv[0])
706 102 equemene
        sys.exit(2)
707 104 equemene
708 104 equemene
    # List of Devices
709 104 equemene
    Devices=[]
710 104 equemene
    Alu={}
711 104 equemene
712 102 equemene
    for opt, arg in opts:
713 102 equemene
        if opt == '-h':
714 127 equemene
            print(HowToUse % sys.argv[0])
715 102 equemene
716 127 equemene
            print("\nInformations about devices detected under OpenCL API:")
717 102 equemene
            # For PyOpenCL import
718 102 equemene
            try:
719 102 equemene
                import pyopencl as cl
720 122 equemene
                Id=0
721 102 equemene
                for platform in cl.get_platforms():
722 102 equemene
                    for device in platform.get_devices():
723 138 equemene
                        #deviceType=cl.device_type.to_string(device.type)
724 157 equemene
                        deviceType="xPU"
725 127 equemene
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
726 102 equemene
                        Id=Id+1
727 102 equemene
728 123 equemene
            except:
729 127 equemene
                print("Your platform does not seem to support OpenCL")
730 122 equemene
731 127 equemene
            print("\nInformations about devices detected under CUDA API:")
732 122 equemene
            # For PyCUDA import
733 122 equemene
            try:
734 122 equemene
                import pycuda.driver as cuda
735 122 equemene
                cuda.init()
736 122 equemene
                for Id in range(cuda.Device.count()):
737 122 equemene
                    device=cuda.Device(Id)
738 127 equemene
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
739 102 equemene
                print
740 123 equemene
            except:
741 127 equemene
                print("Your platform does not seem to support CUDA")
742 102 equemene
743 122 equemene
            sys.exit()
744 122 equemene
745 122 equemene
746 102 equemene
        elif opt == '-o':
747 102 equemene
            OutMetrology=True
748 102 equemene
            Metrology='OutMetro'
749 102 equemene
        elif opt == '-c':
750 102 equemene
            Curves=True
751 181 equemene
        elif opt == '-k':
752 181 equemene
            IfThen=True
753 102 equemene
        elif opt in ("-d", "--device"):
754 104 equemene
            Devices.append(int(arg))
755 102 equemene
        elif opt in ("-g", "--gpustyle"):
756 102 equemene
            GpuStyle = arg
757 102 equemene
        elif opt in ("-m", "--marsaglia"):
758 102 equemene
            RNG = arg
759 102 equemene
        elif opt in ("-v", "--valuetype"):
760 102 equemene
            ValueType = arg
761 102 equemene
        elif opt in ("-i", "--iterations"):
762 102 equemene
            Iterations = numpy.uint64(arg)
763 102 equemene
        elif opt in ("-b", "--blocksbegin"):
764 102 equemene
            BlocksBegin = int(arg)
765 102 equemene
        elif opt in ("-e", "--blocksend"):
766 102 equemene
            BlocksEnd = int(arg)
767 102 equemene
        elif opt in ("-s", "--blocksstep"):
768 102 equemene
            BlocksStep = int(arg)
769 102 equemene
        elif opt in ("-f", "--threadsfirst"):
770 102 equemene
            ThreadsBegin = int(arg)
771 102 equemene
        elif opt in ("-l", "--threadslast"):
772 102 equemene
            ThreadsEnd = int(arg)
773 102 equemene
        elif opt in ("-t", "--threadsstep"):
774 102 equemene
            ThreadsStep = int(arg)
775 102 equemene
        elif opt in ("-r", "--redo"):
776 102 equemene
            Redo = int(arg)
777 102 equemene
778 127 equemene
    print("Devices Identification : %s" % Devices)
779 127 equemene
    print("GpuStyle used : %s" % GpuStyle)
780 127 equemene
    print("Iterations : %s" % Iterations)
781 127 equemene
    print("Number of Blocks on begin : %s" % BlocksBegin)
782 127 equemene
    print("Number of Blocks on end : %s" % BlocksEnd)
783 127 equemene
    print("Step on Blocks : %s" % BlocksStep)
784 127 equemene
    print("Number of Threads on begin : %s" % ThreadsBegin)
785 127 equemene
    print("Number of Threads on end : %s" % ThreadsEnd)
786 127 equemene
    print("Step on Threads : %s" % ThreadsStep)
787 127 equemene
    print("Number of redo : %s" % Redo)
788 127 equemene
    print("Metrology done out of XPU : %r" % OutMetrology)
789 127 equemene
    print("Type of Marsaglia RNG used : %s" % RNG)
790 127 equemene
    print("Type of variable : %s" % ValueType)
791 102 equemene
792 102 equemene
    if GpuStyle=='CUDA':
793 102 equemene
        try:
794 102 equemene
            # For PyCUDA import
795 102 equemene
            import pycuda.driver as cuda
796 122 equemene
797 122 equemene
            cuda.init()
798 122 equemene
            for Id in range(cuda.Device.count()):
799 122 equemene
                device=cuda.Device(Id)
800 127 equemene
                print("Device #%i of type GPU : %s" % (Id,device.name()))
801 122 equemene
                if Id in Devices:
802 122 equemene
                    Alu[Id]='GPU'
803 122 equemene
804 102 equemene
        except ImportError:
805 127 equemene
            print("Platform does not seem to support CUDA")
806 102 equemene
807 102 equemene
    if GpuStyle=='OpenCL':
808 102 equemene
        try:
809 102 equemene
            # For PyOpenCL import
810 102 equemene
            import pyopencl as cl
811 122 equemene
            Id=0
812 102 equemene
            for platform in cl.get_platforms():
813 102 equemene
                for device in platform.get_devices():
814 138 equemene
                    #deviceType=cl.device_type.to_string(device.type)
815 152 equemene
                    deviceType="xPU"
816 127 equemene
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
817 102 equemene
818 104 equemene
                    if Id in Devices:
819 102 equemene
                    # Set the Alu as detected Device Type
820 104 equemene
                        Alu[Id]=deviceType
821 102 equemene
                    Id=Id+1
822 102 equemene
        except ImportError:
823 127 equemene
            print("Platform does not seem to support OpenCL")
824 104 equemene
825 127 equemene
    print(Devices,Alu)
826 104 equemene
827 127 equemene
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
828 127 equemene
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
829 102 equemene
830 102 equemene
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
831 102 equemene
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
832 102 equemene
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
833 104 equemene
    avgD=numpy.array([]).astype(numpy.float32)
834 104 equemene
    medD=numpy.array([]).astype(numpy.float32)
835 104 equemene
    stdD=numpy.array([]).astype(numpy.float32)
836 104 equemene
    minD=numpy.array([]).astype(numpy.float32)
837 104 equemene
    maxD=numpy.array([]).astype(numpy.float32)
838 104 equemene
    avgR=numpy.array([]).astype(numpy.float32)
839 104 equemene
    medR=numpy.array([]).astype(numpy.float32)
840 104 equemene
    stdR=numpy.array([]).astype(numpy.float32)
841 104 equemene
    minR=numpy.array([]).astype(numpy.float32)
842 104 equemene
    maxR=numpy.array([]).astype(numpy.float32)
843 102 equemene
844 102 equemene
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
845 102 equemene
846 102 equemene
        # print Blocks,Threads
847 102 equemene
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
848 102 equemene
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
849 102 equemene
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
850 102 equemene
        ExploredThreads=numpy.append(ExploredThreads,Threads)
851 102 equemene
852 102 equemene
        if OutMetrology:
853 104 equemene
            DurationItem=numpy.array([]).astype(numpy.float32)
854 104 equemene
            Duration=numpy.array([]).astype(numpy.float32)
855 104 equemene
            Rate=numpy.array([]).astype(numpy.float32)
856 102 equemene
            for i in range(Redo):
857 102 equemene
                start=time.time()
858 102 equemene
                if GpuStyle=='CUDA':
859 102 equemene
                    try:
860 122 equemene
                        InputCU={}
861 122 equemene
                        InputCU['Iterations']=Iterations
862 122 equemene
                        InputCU['Steps']=1
863 122 equemene
                        InputCU['Blocks']=Blocks
864 122 equemene
                        InputCU['Threads']=Threads
865 122 equemene
                        InputCU['Device']=Devices[0]
866 122 equemene
                        InputCU['RNG']=RNG
867 122 equemene
                        InputCU['ValueType']=ValueType
868 181 equemene
                        InputCU['IfThen']=IfThen
869 122 equemene
                        OutputCU=MetropolisCuda(InputCU)
870 122 equemene
                        Inside=OutputCU['Circle']
871 122 equemene
                        NewIterations=OutputCU['NewIterations']
872 122 equemene
                        Duration=OutputCU['Duration']
873 102 equemene
                    except:
874 127 equemene
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
875 102 equemene
                elif GpuStyle=='OpenCL':
876 102 equemene
                    try:
877 106 equemene
                        InputCL={}
878 106 equemene
                        InputCL['Iterations']=Iterations
879 106 equemene
                        InputCL['Steps']=1
880 106 equemene
                        InputCL['Blocks']=Blocks
881 106 equemene
                        InputCL['Threads']=Threads
882 106 equemene
                        InputCL['Device']=Devices[0]
883 106 equemene
                        InputCL['RNG']=RNG
884 181 equemene
                        InputCL['ValueType']=ValueType
885 181 equemene
                        InputCL['IfThen']=IfThen
886 106 equemene
                        OutputCL=MetropolisOpenCL(InputCL)
887 106 equemene
                        Inside=OutputCL['Circle']
888 106 equemene
                        NewIterations=OutputCL['NewIterations']
889 106 equemene
                        Duration=OutputCL['Duration']
890 102 equemene
                    except:
891 127 equemene
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
892 104 equemene
                Duration=numpy.append(Duration,time.time()-start)
893 104 equemene
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
894 102 equemene
        else:
895 102 equemene
            if GpuStyle=='CUDA':
896 102 equemene
                try:
897 122 equemene
                    InputCU={}
898 122 equemene
                    InputCU['Iterations']=Iterations
899 122 equemene
                    InputCU['Steps']=Redo
900 122 equemene
                    InputCU['Blocks']=Blocks
901 122 equemene
                    InputCU['Threads']=Threads
902 122 equemene
                    InputCU['Device']=Devices[0]
903 122 equemene
                    InputCU['RNG']=RNG
904 122 equemene
                    InputCU['ValueType']=ValueType
905 181 equemene
                    InputCU['IfThen']=IfThen
906 122 equemene
                    OutputCU=MetropolisCuda(InputCU)
907 122 equemene
                    Inside=OutputCU['Inside']
908 122 equemene
                    NewIterations=OutputCU['NewIterations']
909 122 equemene
                    Duration=OutputCU['Duration']
910 102 equemene
                except:
911 127 equemene
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
912 152 equemene
                try:
913 152 equemene
                    pycuda.context.pop()
914 152 equemene
                except:
915 152 equemene
                    pass
916 102 equemene
            elif GpuStyle=='OpenCL':
917 102 equemene
                try:
918 106 equemene
                    InputCL={}
919 106 equemene
                    InputCL['Iterations']=Iterations
920 106 equemene
                    InputCL['Steps']=Redo
921 106 equemene
                    InputCL['Blocks']=Blocks
922 106 equemene
                    InputCL['Threads']=Threads
923 106 equemene
                    InputCL['Device']=Devices[0]
924 106 equemene
                    InputCL['RNG']=RNG
925 106 equemene
                    InputCL['ValueType']=ValueType
926 181 equemene
                    InputCL['IfThen']=IfThen
927 106 equemene
                    OutputCL=MetropolisOpenCL(InputCL)
928 106 equemene
                    Inside=OutputCL['Inside']
929 106 equemene
                    NewIterations=OutputCL['NewIterations']
930 106 equemene
                    Duration=OutputCL['Duration']
931 102 equemene
                except:
932 127 equemene
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
933 104 equemene
            Rate=NewIterations/Duration
934 130 equemene
            print("Pi estimation %.8f" % (4./NewIterations*Inside))
935 130 equemene
936 104 equemene
937 104 equemene
        avgD=numpy.append(avgD,numpy.average(Duration))
938 104 equemene
        medD=numpy.append(medD,numpy.median(Duration))
939 104 equemene
        stdD=numpy.append(stdD,numpy.std(Duration))
940 104 equemene
        minD=numpy.append(minD,numpy.min(Duration))
941 104 equemene
        maxD=numpy.append(maxD,numpy.max(Duration))
942 104 equemene
        avgR=numpy.append(avgR,numpy.average(Rate))
943 104 equemene
        medR=numpy.append(medR,numpy.median(Rate))
944 104 equemene
        stdR=numpy.append(stdR,numpy.std(Rate))
945 104 equemene
        minR=numpy.append(minR,numpy.min(Rate))
946 104 equemene
        maxR=numpy.append(maxR,numpy.max(Rate))
947 102 equemene
948 127 equemene
        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
949 104 equemene
950 104 equemene
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
951 104 equemene
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
952 104 equemene
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
953 102 equemene
954 102 equemene
    if Fit:
955 102 equemene
        FitAndPrint(ExploredJobs,median,Curves)