Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 177

Historique | Voir | Annoter | Télécharger (30,32 ko)

1 127 equemene
#!/usr/bin/env python3
2 102 equemene
3 102 equemene
#
4 102 equemene
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5 102 equemene
#
6 102 equemene
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
7 102 equemene
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8 102 equemene
#
9 102 equemene
# Thanks to Andreas Klockner for PyCUDA:
10 102 equemene
# http://mathema.tician.de/software/pycuda
11 102 equemene
# Thanks to Andreas Klockner for PyOpenCL:
12 102 equemene
# http://mathema.tician.de/software/pyopencl
13 102 equemene
#
14 102 equemene
15 102 equemene
# 2013-01-01 : problems with launch timeout
16 102 equemene
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17 102 equemene
# Option "Interactive" "0" in /etc/X11/xorg.conf
18 102 equemene
19 102 equemene
# Common tools
20 102 equemene
import numpy
21 102 equemene
from numpy.random import randint as nprnd
22 102 equemene
import sys
23 102 equemene
import getopt
24 102 equemene
import time
25 102 equemene
import itertools
26 102 equemene
from socket import gethostname
27 102 equemene
28 104 equemene
class PenStacle:
    """Five-number statistical summary ("pentacle") of a dataset:
    average, median, standard deviation, minimum and maximum."""
    # Class-level defaults, overwritten per instance by __init__.
    Avg=0
    Med=0
    Std=0
    Min=0
    Max=0

    def __init__(self,Data):
        """Compute the five statistics of Data (any numpy-compatible sequence)."""
        self.Avg=numpy.average(Data)
        self.Med=numpy.median(Data)
        self.Std=numpy.std(Data)
        self.Min=numpy.min(Data)
        self.Max=numpy.max(Data)

    def display(self):
        """Print the five statistics on a single space-separated line."""
        summary=(self.Avg,self.Med,self.Std,self.Min,self.Max)
        print("%s %s %s %s %s" % summary)
43 104 equemene
44 104 equemene
class Experience:
    """Metrology for experiences: records which device ran how many
    iterations and summarises measured durations and rates."""
    DeviceStyle=''
    DeviceId=0
    AvgD=0
    MedD=0
    StdD=0
    MinD=0
    MaxD=0
    AvgR=0
    MedR=0
    StdR=0
    MinR=0
    MaxR=0

    def __init__(self,DeviceStyle,DeviceId,Iterations):
        """Store the device kind ('CUDA'/'OpenCL'-style string), its id,
        and the iteration count of the experience."""
        self.DeviceStyle=DeviceStyle
        self.DeviceId=DeviceId
        # BUG FIX: was a bare `self.Iterations` expression, which raised
        # AttributeError and never stored the value.
        self.Iterations=Iterations

    def Metrology(self,Data):
        """Summarise Data (durations) and the derived rates, printing both.

        Data is expected to be a numpy array of durations so that
        self.Iterations/Data broadcasts element-wise.
        """
        Duration=PenStacle(Data)
        # BUG FIX: previously read an undefined global `Iterations`.
        Rate=PenStacle(self.Iterations/Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)
68 104 equemene
69 105 equemene
70 105 equemene
71 103 equemene
def DictionariesAPI():
    """Return the lookup tables mapping RNG names and value-type names
    to the integer flags understood by the GPU kernel preprocessor."""
    rng_flags = {'CONG': 0, 'SHR3': 1, 'MWC': 2, 'KISS': 3}
    type_flags = {'INT32': 0, 'INT64': 1, 'FP32': 2, 'FP64': 3}
    return (rng_flags, type_flags)
75 103 equemene
76 102 equemene
# find prime factors of a number
77 102 equemene
# Get for WWW :
78 102 equemene
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
79 102 equemene
def PrimeFactors(x):
    """Return the prime factorisation of integer x as a numpy array
    (smallest factor first, with multiplicity; empty for x < 2).

    Trial division, from:
    http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
    """
    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            # BUG FIX: was `x/=loop` — Python 3 true division turned x
            # into a float, losing exactness for large inputs.
            x//=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist
90 102 equemene
91 102 equemene
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
92 102 equemene
# output is thread number
93 102 equemene
def BestThreadsNumber(jobs):
    """Heuristic thread count for the Hybrid (Blocks&Threads) approach.

    Multiplies the prime factors of `jobs` (interleaving smallest and
    largest) until the product exceeds sqrt(jobs) or 512, so the returned
    thread count always divides `jobs`.
    """
    factors=PrimeFactors(jobs)
    # Two rows: factors ascending and descending; transposing and
    # ravelling interleaves smallest with largest factors.
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs or threads>512:
            break
    # BUG FIX: `long` does not exist in Python 3 (file shebang is python3);
    # plain int has arbitrary precision.
    return(int(threads))
102 102 equemene
103 102 equemene
# Predicted Amdahl Law (Reduced with s=1-p)
104 102 equemene
def AmdahlR(N, T1, p):
    """Reduced Amdahl law: T(N) = T1*((1-p) + p/N), serial fraction s = 1-p."""
    serial_fraction = 1 - p
    return T1 * (serial_fraction + p / N)
106 102 equemene
107 102 equemene
# Predicted Amdahl Law
108 102 equemene
def Amdahl(N, T1, s, p):
    """Amdahl law: T(N) = T1*(s + p/N) with explicit serial fraction s."""
    scaled_parallel = p / N
    return T1 * (s + scaled_parallel)
110 102 equemene
111 102 equemene
# Predicted Mylq Law with first order
112 102 equemene
def Mylq(N, T1, s, c, p):
    """Mylq law, first order: Amdahl scaling plus a linear overhead c*N."""
    amdahl_term = T1 * (s + p / N)
    linear_overhead = c * N
    return amdahl_term + linear_overhead
114 102 equemene
115 102 equemene
# Predicted Mylq Law with second order
116 102 equemene
def Mylq2(N, T1, s, c1, c2, p):
    """Mylq law, second order: Amdahl scaling plus linear and quadratic overheads."""
    overhead = c1 * N + c2 * N * N
    return T1 * (s + p / N) + overhead
118 102 equemene
119 103 equemene
def KernelCodeCuda():
    """Return the CUDA kernel source as a string.

    The kernel is parameterised at compile time via preprocessor symbols
    passed to nvcc: TRNG selects the Marsaglia RNG (CONG/SHR3/MWC/KISS)
    and TYPE selects the arithmetic (INT32/INT64/FP32/FP64); the values
    come from DictionariesAPI().

    NOTE(review): MainLoop is declared (…, seed_w, seed_z, …) but every
    kernel below calls MainLoop(iterations, seed_z, seed_w, …), so the two
    seeds are effectively swapped — harmless for a Monte Carlo estimate,
    but worth confirming against the author's intent.
    NOTE(review): under TYPE==TFP64, THEONE is 1.0f (a float literal)
    compared against doubles; 1.0f == 1.0 exactly, so no precision issue.
    """
    KERNEL_CODE_CUDA="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}

"""
    return(KERNEL_CODE_CUDA)
260 102 equemene
261 103 equemene
def KernelCodeOpenCL():
    """Return the OpenCL kernel source as a string.

    Compile-time parameters (passed via build options in MetropolisOpenCL):
    TRNG selects the Marsaglia RNG (CONG/SHR3/MWC/KISS) and TYPE the
    arithmetic (INT32/INT64/FP32/FP64); values come from DictionariesAPI().

    NOTE(review): MainLoopHybrid's barrier uses the logical-OR operator
    (CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE); combining fence flags
    is conventionally done with bitwise `|` — confirm intended semantics.
    NOTE(review): as in the CUDA variant, the __kernel wrappers pass
    (seed_z, seed_w) into MainLoop; here MainLoop is declared
    (…, seed_z, seed_w, …) so the order matches the declaration.
    """
    KERNEL_CODE_OPENCL="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

"""
    return(KERNEL_CODE_OPENCL)
401 103 equemene
402 122 equemene
def MetropolisCuda(InputCU):
    """Run the Monte Carlo Pi estimation on a CUDA device via PyCUDA.

    InputCU is a dict with keys 'Iterations', 'Steps', 'Blocks',
    'Threads', 'Device', 'RNG', 'ValueType'.  Returns a dict with
    'Inside' (points inside the quarter circle), 'NewIterations'
    (iterations rounded up to a multiple of Blocks*Threads) and
    'Duration' (per-step wall-clock times).
    """

    print("Inside ",InputCU)

    # Unpack the experiment parameters.
    iterations=InputCU['Iterations']
    steps=InputCU['Steps']
    blocks=InputCU['Blocks']
    threads=InputCU['Threads']
    Device=InputCU['Device']
    RNG=InputCU['RNG']
    ValueType=InputCU['ValueType']

    Marsaglia,Computing=DictionariesAPI()

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        # Scan devices; keep the one whose index matches the request.
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        # NOTE(review): bare `print` is a no-op in Python 3 (it only
        # references the function); probably a leftover `print` statement.
        print

    except ImportError:
        # NOTE(review): execution continues after this message, and the
        # later use of `cuda`/`XPU` will then raise NameError.
        print("Platform does not seem to support CUDA")

    # One accumulator slot per work item; InOut wraps it for automatic
    # host<->device transfer around the kernel call.
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCU = cuda.InOut(circle)
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
    #cuda.memcpy_htod(circleCU, circle)

    Context=XPU.make_context()

    try:
        #mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
        # Needed to set the compiler via ccbin for CUDA9 implementation
        mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.8','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType]],keep=True)
    except:
        # NOTE(review): bare except hides the compiler diagnostics and the
        # following mod.get_function will fail with NameError.
        print("Compilation seems to break")

    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads;

    # Split iterations across work items, rounding up so the total is
    # at least the requested amount.
    iterationsCU=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCU+=numpy.uint64(1)

    for i in range(steps):
        start_time=time.time()

        try:
            # Only the Hybrid kernel is actually launched; Blocks/Threads
            # variants above are fetched but unused here.
            MetropolisHybridCU(circleCU,
                               numpy.uint64(iterationsCU),
                               numpy.uint32(nprnd(2**32)),
                               numpy.uint32(nprnd(2**32)),
                               grid=(blocks,1),block=(threads,1,1))
        except:
            print("Crash during CUDA call")

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        MyDuration[i]=elapsed

    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
    print(OutputCU)
    Context.pop()

    Context.detach()
    return(OutputCU)
481 102 equemene
482 106 equemene
def MetropolisOpenCL(InputCL):
    """Run the Monte Carlo Pi estimation on an OpenCL device via PyOpenCL.

    InputCL is a dict with keys 'Iterations', 'Steps', 'Blocks',
    'Threads', 'Device', 'RNG', 'ValueType'.  Returns a dict with
    'Inside', 'NewIterations' and 'Duration' (see MetropolisCuda).
    """

    import pyopencl as cl

    print("Inside ",InputCL)

    # Unpack the experiment parameters.
    iterations=InputCL['Iterations']
    steps=InputCL['Steps']
    blocks=InputCL['Blocks']
    threads=InputCL['Threads']
    Device=InputCL['Device']
    RNG=InputCL['RNG']
    ValueType=InputCL['ValueType']

    Marsaglia,Computing=DictionariesAPI()

    # Enumerate every device of every platform; the requested Device is a
    # flat index into that enumeration.
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
        sys.exit()

    # Create the context and the command queue for execution.
    try:
        ctx = cl.Context([XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        # NOTE(review): bare except; on failure `ctx`/`queue` stay unbound
        # and the buffer creation below raises NameError.
        print("Crash during context creation")

    # Buffer-flag constants.
    mf = cl.mem_flags

    # One accumulator slot per work item, mirrored into a device buffer.
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

    # Compile the kernel with the chosen RNG/type baked in as -D flags.
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG],Computing[ValueType]))

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads;

    # Split iterations across work items, rounding up.
    iterationsCL=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCL+=1

    for i in range(steps):
        start_time=time.time()
        if threads == 1:
            # No local work-group size: let the implementation choose.
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(nprnd(2**32)),
                                                 numpy.uint32(nprnd(2**32)))
        else:
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(nprnd(2**32)),
                                                 numpy.uint32(nprnd(2**32)))

        CLLaunch.wait()
        cl.enqueue_copy(queue, circle, circleCL).wait()

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
        MyDuration[i]=elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]

    circleCL.release()

    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
    print(OutputCL)
    return(OutputCL)
571 102 equemene
572 102 equemene
573 102 equemene
def FitAndPrint(N,D,Curves):
    """Fit measured durations against several scaling laws and print them.

    Parameters:
        N      : sequence of parallelism degrees (threads/work items)
        D      : sequence of elapsed durations, same length as N
        Curves : when True, plot the experimental points and the fitted
                 curves with matplotlib

    Each fit is renormalized so the leading coefficient equals the first
    measured duration D[0]; a failed fit only prints a diagnostic.
    Relies on the module-level model functions Amdahl, AmdahlR, Mylq, Mylq2.
    """
    from scipy.optimize import curve_fit
    import matplotlib.pyplot as plt

    # Classic Amdahl law: T(N) = T0*(s + p/N)
    try:
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
        # Normalize serial/parallel fractions against the first duration
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[0]=D[0]
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
    except Exception:
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))

    # Reduced Amdahl law: single parallel fraction parameter
    try:
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
        coeffs_AmdahlR[0]=D[0]
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))

    except Exception:
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))

    # Mylq law: Amdahl plus a linear overhead term in N
    try:
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[0]=D[0]
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                                coeffs_Mylq[1],
                                                                coeffs_Mylq[3],
                                                                coeffs_Mylq[2]))
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                    coeffs_Mylq[3])
    except Exception:
        print("Impossible to fit for Mylq law : only %i elements" % len(D))

    # Mylq second order: adds a quadratic overhead term in N
    try:
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[0]=D[0]
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))

    except Exception:
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))

    if Curves:
        plt.xlabel("Number of Threads/work Items")
        plt.ylabel("Total Elapsed Time")

        Experience,=plt.plot(N,D,'ro')
        # BUGFIX: the fitted-curve plotting, legend and show below were
        # previously executed even when Curves was False, so plt.show()
        # blocked on a figure the caller never asked for. They now run
        # only in the Curves branch.
        try:
            pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")
            pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
        except Exception:
            # D_Amdahl / D_Mylq are unbound when their fits failed above
            print("Fit curves seem not to be available")

        plt.legend()
        plt.show()
642 102 equemene
643 102 equemene
if __name__=='__main__':

    # Set defaults values

    # Id of Device : 1 is for first find !
    Device=1
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Iterations is integer
    Iterations=10000000
    # BlocksBegin is first number of Blocks to explore
    BlocksBegin=1
    # BlocksEnd is last number of Blocks to explore
    BlocksEnd=1
    # BlocksStep is the step of Blocks to explore
    BlocksStep=1
    # ThreadsBegin is first number of Threads to explore
    ThreadsBegin=1
    # ThreadsEnd is last number of Threads to explore
    ThreadsEnd=1
    # ThreadsStep is the step of Threads to explore
    ThreadsStep=1
    # Redo is the times to redo the test to improve metrology
    Redo=1
    # OutMetrology is method for duration estimation : False is GPU inside
    OutMetrology=False
    Metrology='InMetro'
    # Curves is True to print the curves
    Curves=False
    # Fit is True to fit measurements against scaling laws
    Fit=False
    # Marsaglia RNG
    RNG='MWC'
    # Value type : INT32, INT64, FP32, FP64
    ValueType='FP32'

    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

    # BUGFIX: long options were declared in camelCase but the dispatch
    # below tested lowercase names that getopt never produced, so the
    # long spellings were silently ignored. Both spellings are now
    # declared and matched, which keeps the historical CLI working.
    try:
        opts, args = getopt.getopt(sys.argv[1:],"hocg:i:b:e:s:f:l:t:r:d:m:v:",
                                   ["gpustyle=","iterations=",
                                    "blocksBegin=","blocksbegin=",
                                    "blocksEnd=","blocksend=",
                                    "blocksStep=","blocksstep=",
                                    "threadsFirst=","threadsfirst=",
                                    "threadsLast=","threadslast=",
                                    "threadssTep=","threadsstep=",
                                    "redo=","device=","marsaglia=","valuetype="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    # Device Id -> detected ALU type ('GPU' for CUDA, 'xPU' for OpenCL)
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                # BUGFIX: was a bare Python-2 'print' statement, a no-op
                # expression under Python 3; now emits the blank line.
                print()
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt == '-o':
            OutMetrology=True
            Metrology='OutMetro'
        elif opt == '-c':
            Curves=True
        elif opt in ("-d", "--device"):
            Devices.append(int(arg))
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-m", "--marsaglia"):
            RNG = arg
        elif opt in ("-v", "--valuetype"):
            ValueType = arg
        elif opt in ("-i", "--iterations"):
            Iterations = numpy.uint64(arg)
        elif opt in ("-b", "--blocksBegin", "--blocksbegin"):
            BlocksBegin = int(arg)
        elif opt in ("-e", "--blocksEnd", "--blocksend"):
            BlocksEnd = int(arg)
        elif opt in ("-s", "--blocksStep", "--blocksstep"):
            BlocksStep = int(arg)
        elif opt in ("-f", "--threadsFirst", "--threadsfirst"):
            ThreadsBegin = int(arg)
        elif opt in ("-l", "--threadsLast", "--threadslast"):
            ThreadsEnd = int(arg)
        elif opt in ("-t", "--threadssTep", "--threadsstep"):
            ThreadsStep = int(arg)
        elif opt in ("-r", "--redo"):
            Redo = int(arg)

    print("Devices Identification : %s" % Devices)
    print("GpuStyle used : %s" % GpuStyle)
    print("Iterations : %s" % Iterations)
    print("Number of Blocks on begin : %s" % BlocksBegin)
    print("Number of Blocks on end : %s" % BlocksEnd)
    print("Step on Blocks : %s" % BlocksStep)
    print("Number of Threads on begin : %s" % ThreadsBegin)
    print("Number of Threads on end : %s" % ThreadsEnd)
    print("Step on Threads : %s" % ThreadsStep)
    print("Number of redo : %s" % Redo)
    print("Metrology done out of XPU : %r" % OutMetrology)
    print("Type of Marsaglia RNG used : %s" % RNG)
    print("Type of variable : %s" % ValueType)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                    # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    print(Devices,Alu)

    # End bounds are inclusive: pad the stop value by one step
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

    # Per-combination statistics of Duration (D) and Rate (R)
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
    avgD=numpy.array([]).astype(numpy.float32)
    medD=numpy.array([]).astype(numpy.float32)
    stdD=numpy.array([]).astype(numpy.float32)
    minD=numpy.array([]).astype(numpy.float32)
    maxD=numpy.array([]).astype(numpy.float32)
    avgR=numpy.array([]).astype(numpy.float32)
    medR=numpy.array([]).astype(numpy.float32)
    stdR=numpy.array([]).astype(numpy.float32)
    minR=numpy.array([]).astype(numpy.float32)
    maxR=numpy.array([]).astype(numpy.float32)

    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
        ExploredThreads=numpy.append(ExploredThreads,Threads)

        if OutMetrology:
            # Durations measured outside the XPU: one launch per redo
            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                start=time.time()
                if GpuStyle=='CUDA':
                    try:
                        InputCU={}
                        InputCU['Iterations']=Iterations
                        InputCU['Steps']=1
                        InputCU['Blocks']=Blocks
                        InputCU['Threads']=Threads
                        InputCU['Device']=Devices[0]
                        InputCU['RNG']=RNG
                        InputCU['ValueType']=ValueType
                        OutputCU=MetropolisCuda(InputCU)
                        # BUGFIX: result key is 'Inside' (as consumed in the
                        # in-metrology branch below); 'Circle' raised KeyError
                        Inside=OutputCU['Inside']
                        NewIterations=OutputCU['NewIterations']
                        Duration=OutputCU['Duration']
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        InputCL={}
                        InputCL['Iterations']=Iterations
                        InputCL['Steps']=1
                        InputCL['Blocks']=Blocks
                        InputCL['Threads']=Threads
                        InputCL['Device']=Devices[0]
                        InputCL['RNG']=RNG
                        InputCL['ValueType']=ValueType
                        OutputCL=MetropolisOpenCL(InputCL)
                        # BUGFIX: MetropolisOpenCL returns key 'Inside',
                        # not 'Circle' (the old key always raised KeyError)
                        Inside=OutputCL['Inside']
                        NewIterations=OutputCL['NewIterations']
                        Duration=OutputCL['Duration']
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                Duration=numpy.append(Duration,time.time()-start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
        else:
            # Durations measured inside the XPU call: Redo steps per launch
            if GpuStyle=='CUDA':
                try:
                    InputCU={}
                    InputCU['Iterations']=Iterations
                    InputCU['Steps']=Redo
                    InputCU['Blocks']=Blocks
                    InputCU['Threads']=Threads
                    InputCU['Device']=Devices[0]
                    InputCU['RNG']=RNG
                    InputCU['ValueType']=ValueType
                    OutputCU=MetropolisCuda(InputCU)
                    Inside=OutputCU['Inside']
                    NewIterations=OutputCU['NewIterations']
                    Duration=OutputCU['Duration']
                except Exception:
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                try:
                    # BUGFIX: was 'pycuda.context.pop()' but the name
                    # 'pycuda' is never bound (import is 'pycuda.driver as
                    # cuda'), so the swallowed NameError meant the context
                    # was never released between combinations.
                    cuda.Context.pop()
                except Exception:
                    pass
            elif GpuStyle=='OpenCL':
                try:
                    InputCL={}
                    InputCL['Iterations']=Iterations
                    InputCL['Steps']=Redo
                    InputCL['Blocks']=Blocks
                    InputCL['Threads']=Threads
                    InputCL['Device']=Devices[0]
                    InputCL['RNG']=RNG
                    InputCL['ValueType']=ValueType
                    OutputCL=MetropolisOpenCL(InputCL)
                    Inside=OutputCL['Inside']
                    NewIterations=OutputCL['NewIterations']
                    Duration=OutputCL['Duration']
                except Exception:
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
            Rate=NewIterations/Duration
            print("Pi estimation %.8f" % (4./NewIterations*Inside))

        avgD=numpy.append(avgD,numpy.average(Duration))
        medD=numpy.append(medD,numpy.median(Duration))
        stdD=numpy.append(stdD,numpy.std(Duration))
        minD=numpy.append(minD,numpy.min(Duration))
        maxD=numpy.append(maxD,numpy.max(Duration))
        avgR=numpy.append(avgR,numpy.average(Rate))
        medR=numpy.append(medR,numpy.median(Rate))
        stdR=numpy.append(stdR,numpy.std(Rate))
        minR=numpy.append(minR,numpy.min(Rate))
        maxR=numpy.append(maxR,numpy.max(Rate))

        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

        # Checkpoint results after every combination so a crash loses nothing
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

    if Fit:
        # BUGFIX: was 'median', an undefined name (NameError when -f/Fit
        # enabled); the per-combination median durations are in medD.
        FitAndPrint(ExploredJobs,medD,Curves)