Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 158

Historique | Voir | Annoter | Télécharger (30,15 ko)

1 127 equemene
#!/usr/bin/env python3
2 102 equemene
3 102 equemene
#
4 102 equemene
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5 102 equemene
#
6 102 equemene
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
7 102 equemene
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8 102 equemene
#
9 102 equemene
# Thanks to Andreas Klockner for PyCUDA:
10 102 equemene
# http://mathema.tician.de/software/pycuda
11 102 equemene
# Thanks to Andreas Klockner for PyOpenCL:
12 102 equemene
# http://mathema.tician.de/software/pyopencl
13 102 equemene
#
14 102 equemene
15 102 equemene
# 2013-01-01 : problems with launch timeout
16 102 equemene
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17 102 equemene
# Option "Interactive" "0" in /etc/X11/xorg.conf
18 102 equemene
19 102 equemene
# Common tools
20 102 equemene
import numpy
21 102 equemene
from numpy.random import randint as nprnd
22 102 equemene
import sys
23 102 equemene
import getopt
24 102 equemene
import time
25 102 equemene
import itertools
26 102 equemene
from socket import gethostname
27 102 equemene
28 104 equemene
class PenStacle:
    """Five-number summary ("pentacle") of a dataset: average, median,
    standard deviation, minimum and maximum, computed once at construction."""
    # Class-level defaults kept so the attributes exist even before __init__.
    Avg=0
    Med=0
    Std=0
    Min=0
    Max=0

    def __init__(self, Data):
        # Compute every statistic in one pass over the (name, reducer) table.
        for attribute, reducer in (("Avg", numpy.average),
                                   ("Med", numpy.median),
                                   ("Std", numpy.std),
                                   ("Max", numpy.max),
                                   ("Min", numpy.min)):
            setattr(self, attribute, reducer(Data))

    def display(self):
        # One-line dump, field order: average, median, std, min, max.
        fields = (self.Avg, self.Med, self.Std, self.Min, self.Max)
        print("%s %s %s %s %s" % fields)
43 104 equemene
44 104 equemene
class Experience:
    """Metrology for experiences: records which device ran an experiment and
    how many iterations it performed, and derives duration/rate statistics
    (as PenStacle summaries) from a vector of timing samples.

    Parameters of __init__:
        DeviceStyle : str, e.g. 'CUDA' or 'OpenCL'
        DeviceId    : int, device index
        Iterations  : number of iterations performed per timing sample
    """
    DeviceStyle=''
    DeviceId=0
    AvgD=0
    MedD=0
    StdD=0
    MinD=0
    MaxD=0
    AvgR=0
    MedR=0
    StdR=0
    MinR=0
    MaxR=0
    def __init__(self,DeviceStyle,DeviceId,Iterations):
        self.DeviceStyle=DeviceStyle
        self.DeviceId=DeviceId
        # BUG FIX: the original read 'self.Iterations' as a bare expression
        # (a no-op) and never stored the argument; assign it properly.
        self.Iterations=Iterations

    def Metrology(self,Data):
        """Summarize raw durations (Data) and derived rates (Iterations/Data)."""
        Duration=PenStacle(Data)
        # BUG FIX: 'Iterations' was an undefined global here (NameError at
        # runtime); use the value stored on the instance instead.
        Rate=PenStacle(self.Iterations/Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)
68 104 equemene
69 105 equemene
70 105 equemene
71 103 equemene
def DictionariesAPI():
    """Return the (RNG, numeric-type) name-to-code maps.

    The integer codes match the TCONG/TSHR3/TMWC/TKISS and
    TINT32/TINT64/TFP32/TFP64 macros of the GPU kernel sources, and are
    injected at kernel compile time via -DTRNG / -DTYPE.
    """
    rng_codes = {'CONG': 0, 'SHR3': 1, 'MWC': 2, 'KISS': 3}
    type_codes = {'INT32': 0, 'INT64': 1, 'FP32': 2, 'FP64': 3}
    return (rng_codes, type_codes)
75 103 equemene
76 102 equemene
# find prime factors of a number
# Get for WWW :
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
    """Return the prime factorisation of x as a uint32 numpy array, ascending.

    Trial division: divide out each factor while it divides x, otherwise
    advance the candidate.

    BUG FIX: the original used 'x/=loop', which under Python 3 turns x into
    a float (true division) and loses exactness for large integers; integer
    division 'x//=loop' keeps the arithmetic exact.
    """
    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            x//=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist
90 102 equemene
91 102 equemene
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
    """Pick a thread count dividing 'jobs' for the hybrid (blocks x threads)
    launch: multiply prime factors of 'jobs' (alternating smallest/largest)
    until the product exceeds sqrt(jobs) or the 512-thread cap.

    BUG FIX: the original returned 'long(threads)'; the 'long' builtin was
    removed in Python 3 (int is unbounded), so use int() instead.
    """
    factors=PrimeFactors(jobs)
    # Interleave factors with their reverse so small and large factors
    # are consumed alternately.
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs or threads>512:
            break
    return(int(threads))
102 102 equemene
103 102 equemene
# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
    """Reduced Amdahl model: runtime on N units given single-unit time T1
    and parallel fraction p; the serial share is implicitly 1-p."""
    serial = 1 - p
    return T1 * (serial + p / N)
106 102 equemene
107 102 equemene
# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
    """Amdahl model with explicit serial fraction s and parallel fraction p:
    T(N) = T1 * (s + p/N)."""
    scaled_parallel = p / N
    return T1 * (s + scaled_parallel)
110 102 equemene
111 102 equemene
# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
    """Amdahl-like model with a first-order overhead term growing in N:
    T(N) = T1*(s + p/N) + c*N."""
    amdahl_part = T1 * (s + p / N)
    linear_overhead = c * N
    return amdahl_part + linear_overhead
114 102 equemene
115 102 equemene
# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
    """Amdahl-like model with first- and second-order overhead terms:
    T(N) = T1*(s + p/N) + c1*N + c2*N^2."""
    amdahl_part = T1 * (s + p / N)
    overhead = c1 * N + c2 * N * N
    return amdahl_part + overhead
118 102 equemene
119 103 equemene
def KernelCodeCuda():
    """Return the CUDA kernel source (as a string) for the Monte-Carlo Pi run.

    The source expects two macros to be supplied at nvcc compile time
    (see MetropolisCuda, which passes them via SourceModule options):
      -DTRNG=<0..3>  selects the Marsaglia RNG (CONG/SHR3/MWC/KISS),
      -DTYPE=<0..3>  selects the arithmetic (INT32/INT64/FP32/FP64).
    MainLoop draws (x,y) points and counts those inside the quarter circle;
    the three __global__ wrappers index work by block, by thread, or by both.

    NOTE(review): the wrappers call MainLoop(iterations,seed_z,seed_w,...)
    while MainLoop declares (iterations,seed_w,seed_z,...) — the two seeds
    are swapped; harmless for random seeding but confirm intent.
    NOTE(review): the TFP64 branch defines THEONE as 1.0f (a float literal)
    although x and y are double — confirm the intended precision.
    """
    KERNEL_CODE_CUDA="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}

"""
    return(KERNEL_CODE_CUDA)
260 102 equemene
261 103 equemene
def KernelCodeOpenCL():
    """Return the OpenCL kernel source (as a string) for the Monte-Carlo Pi run.

    Same structure as the CUDA variant: the build options must define
      -DTRNG=<0..3>  (Marsaglia RNG: CONG/SHR3/MWC/KISS) and
      -DTYPE=<0..3>  (INT32/INT64/FP32/FP64),
    as done in MetropolisOpenCL. MainLoop counts points falling inside the
    quarter circle; the three __kernel wrappers index work by global id,
    local id, or global id (hybrid launch geometry).

    NOTE(review): MainLoopHybrid calls barrier(CLK_GLOBAL_MEM_FENCE ||
    CLK_LOCAL_MEM_FENCE) — a logical OR of flags, which evaluates to 1;
    a bitwise '|' is the conventional form. Confirm intent before changing.
    NOTE(review): the TFP64 branch defines THEONE as 1.0f (float literal)
    while x,y are double; the cl_khr_fp64 pragma is enabled inside that
    branch only.
    """
    KERNEL_CODE_OPENCL="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

"""
    return(KERNEL_CODE_OPENCL)
401 103 equemene
402 122 equemene
def MetropolisCuda(InputCU):
    """Run the Monte-Carlo Pi estimation on a CUDA device via PyCUDA.

    InputCU is a dict with keys: 'Iterations', 'Steps', 'Blocks', 'Threads',
    'Device' (CUDA device index), 'RNG' (key of the Marsaglia dict) and
    'ValueType' (key of the Computing dict).
    Returns a dict with 'Inside' (total points inside the quarter circle),
    'NewIterations' (iterations actually performed, rounded up to a multiple
    of blocks*threads) and 'Duration' (per-step wall-clock timings).
    """

    print("Inside ",InputCU)

    # Unpack the experiment parameters.
    iterations=InputCU['Iterations']
    steps=InputCU['Steps']
    blocks=InputCU['Blocks']
    threads=InputCU['Threads']
    Device=InputCU['Device']
    RNG=InputCU['RNG']
    ValueType=InputCU['ValueType']

    Marsaglia,Computing=DictionariesAPI()

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        # Scan all CUDA devices and keep the one matching the requested index.
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        # NOTE(review): bare 'print' is a no-op expression in Python 3
        # (leftover Python 2 print statement).
        print

    except ImportError:
        # NOTE(review): execution continues after this message, so 'cuda'
        # and 'XPU' are unbound below and a NameError will follow — confirm
        # whether a sys.exit() was intended here.
        print("Platform does not seem to support CUDA")

    # One result slot per work item; InOut copies it to the device and back.
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCU = cuda.InOut(circle)
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
    #cuda.memcpy_htod(circleCU, circle)

    Context=XPU.make_context()

    try:
        #mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
        # RNG and value type are injected as -D macros at nvcc compile time.
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
    except:
        # NOTE(review): bare except hides the compiler error and 'mod' stays
        # unbound, so the get_function calls below will raise NameError.
        print("Compilation seems to broke")

    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads;

    # Split the iterations evenly over the work items, rounding up so the
    # requested total is never undershot.
    iterationsCU=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCU+=numpy.uint64(1)

    for i in range(steps):
        start_time=time.time()

        try:
            # Two fresh 32-bit seeds per step; hybrid (blocks x threads) launch.
            MetropolisHybridCU(circleCU,
                               numpy.uint64(iterationsCU),
                               numpy.uint32(nprnd(2**32)),
                               numpy.uint32(nprnd(2**32)),
                               grid=(blocks,1),block=(threads,1,1))
        except:
            print("Crash during CUDA call")

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        MyDuration[i]=elapsed

    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
    print(OutputCU)
    # Release the CUDA context created above.
    Context.pop()

    #Context.detach()
    return(OutputCU)
479 102 equemene
480 106 equemene
def MetropolisOpenCL(InputCL):
    """Run the Monte-Carlo Pi estimation on an OpenCL device via PyOpenCL.

    InputCL is a dict with keys: 'Iterations', 'Steps', 'Blocks', 'Threads',
    'Device' (flat index over all platforms/devices), 'RNG' and 'ValueType'.
    Returns a dict with 'Inside', 'NewIterations' (rounded up to a multiple
    of blocks*threads) and 'Duration' (per-step wall-clock timings).
    Exits the process if the requested device index is not found.
    """

    import pyopencl as cl

    print("Inside ",InputCL)

    # Unpack the experiment parameters.
    iterations=InputCL['Iterations']
    steps=InputCL['Steps']
    blocks=InputCL['Blocks']
    threads=InputCL['Threads']
    Device=InputCL['Device']
    RNG=InputCL['RNG']
    ValueType=InputCL['ValueType']

    Marsaglia,Computing=DictionariesAPI()

    # Initialise the variables with the proper casts.
    # Walk every device of every platform; the flat counter Id must match
    # the requested Device index.
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
        sys.exit()

    # Create the context and the command queue for execution.
    try:
        ctx = cl.Context([XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        # NOTE(review): bare except — on failure 'ctx'/'queue' stay unbound
        # and the buffer creation below raises NameError.
        print("Crash during context creation")

    # Fetch the available memory flags for the buffers.
    mf = cl.mem_flags

    # One result slot per work item, copied to the device at buffer creation.
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

    # RNG and value type are injected as -D macros at kernel build time.
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG],Computing[ValueType]))

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads;

    # Split the iterations evenly over the work items, rounding up.
    iterationsCL=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCL+=1

    for i in range(steps):
        start_time=time.time()
        # threads==1: pure global launch; otherwise hybrid launch with an
        # explicit local size. Two fresh 32-bit seeds per step.
        if threads == 1:
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(nprnd(2**32)),
                                                 numpy.uint32(nprnd(2**32)))
        else:
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(nprnd(2**32)),
                                                 numpy.uint32(nprnd(2**32)))

        CLLaunch.wait()
        # Pull the per-work-item counts back to the host.
        cl.enqueue_copy(queue, circle, circleCL).wait()

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
        MyDuration[i]=elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]

    # Free the device buffer before returning.
    circleCL.release()

    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
    print(OutputCL)
    return(OutputCL)
569 102 equemene
570 102 equemene
571 102 equemene
def FitAndPrint(N,D,Curves):
    """Fit measured durations against several scalability laws and report them.

    Parameters
    ----------
    N : array-like
        Numbers of threads/work items explored.
    D : array-like
        Total elapsed times, one per entry of N (D[0] is the reference
        single-job duration used to normalize the fitted coefficients).
    Curves : bool
        When True, display the experimental points and available fit curves.

    The model functions Amdahl, AmdahlR, Mylq and Mylq2 are expected to be
    defined at module level.  Each fit is attempted independently; a failure
    only prints a diagnostic.
    """
    from scipy.optimize import curve_fit
    import matplotlib.pyplot as plt

    # Fitted curves, kept for plotting; stay None when the fit fails so the
    # plotting section can degrade gracefully instead of hitting a NameError.
    D_Amdahl=None
    D_Mylq=None

    try:
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
        # Re-express serial/parallel fractions relatively to D[0]
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[0]=D[0]
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
    except Exception:
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))

    try:
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
        coeffs_AmdahlR[0]=D[0]
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
    except Exception:
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))

    try:
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[0]=D[0]
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                                coeffs_Mylq[1],
                                                                coeffs_Mylq[3],
                                                                coeffs_Mylq[2]))
        # Curve evaluated with the normalized coefficients, as in the reports
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                    coeffs_Mylq[3])
    except Exception:
        print("Impossible to fit for Mylq law : only %i elements" % len(D))

    try:
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[0]=D[0]
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
    except Exception:
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))

    # All plotting is gated on Curves: previously plt.legend()/plt.show()
    # ran unconditionally and opened an (empty) figure even without -c.
    if Curves:
        plt.xlabel("Number of Threads/work Items")
        plt.ylabel("Total Elapsed Time")

        Experience,=plt.plot(N,D,'ro')
        try:
            pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")
            pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
        except Exception:
            print("Fit curves seem not to be available")

        plt.legend()
        plt.show()
640 102 equemene
641 102 equemene
if __name__=='__main__':

    # Set default values

    # Id of Device : 1 is for first find !
    Device=1
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Iterations is integer
    Iterations=10000000
    # BlocksBegin is the first number of Blocks to explore
    BlocksBegin=1
    # BlocksEnd is the last number of Blocks to explore
    BlocksEnd=1
    # BlocksStep is the step of Blocks to explore
    BlocksStep=1
    # ThreadsBegin is the first number of Threads to explore
    ThreadsBegin=1
    # ThreadsEnd is the last number of Threads to explore
    ThreadsEnd=1
    # ThreadsStep is the step of Threads to explore
    ThreadsStep=1
    # Redo is the number of times to redo the test to improve metrology
    Redo=1
    # OutMetrology is the method for duration estimation : False is GPU inside
    OutMetrology=False
    Metrology='InMetro'
    # Curves is True to print the curves
    Curves=False
    # Fit is True to fit the scalability laws
    Fit=False
    # Marsaglia RNG
    RNG='MWC'
    # Value type : INT32, INT64, FP32, FP64
    ValueType='FP32'

    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hocg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                # Was a bare Python-2 "print" statement: a no-op in Python 3
                print()
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt == '-o':
            OutMetrology=True
            Metrology='OutMetro'
        elif opt == '-c':
            Curves=True
        elif opt in ("-d", "--device"):
            Devices.append(int(arg))
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-m", "--marsaglia"):
            RNG = arg
        elif opt in ("-v", "--valuetype"):
            ValueType = arg
        elif opt in ("-i", "--iterations"):
            Iterations = numpy.uint64(arg)
        elif opt in ("-b", "--blocksbegin"):
            BlocksBegin = int(arg)
        elif opt in ("-e", "--blocksend"):
            BlocksEnd = int(arg)
        elif opt in ("-s", "--blocksstep"):
            BlocksStep = int(arg)
        elif opt in ("-f", "--threadsfirst"):
            ThreadsBegin = int(arg)
        elif opt in ("-l", "--threadslast"):
            ThreadsEnd = int(arg)
        elif opt in ("-t", "--threadsstep"):
            ThreadsStep = int(arg)
        elif opt in ("-r", "--redo"):
            Redo = int(arg)

    print("Devices Identification : %s" % Devices)
    print("GpuStyle used : %s" % GpuStyle)
    print("Iterations : %s" % Iterations)
    print("Number of Blocks on begin : %s" % BlocksBegin)
    print("Number of Blocks on end : %s" % BlocksEnd)
    print("Step on Blocks : %s" % BlocksStep)
    print("Number of Threads on begin : %s" % ThreadsBegin)
    print("Number of Threads on end : %s" % ThreadsEnd)
    print("Step on Threads : %s" % ThreadsStep)
    print("Number of redo : %s" % Redo)
    print("Metrology done out of XPU : %r" % OutMetrology)
    print("Type of Marsaglia RNG used : %s" % RNG)
    print("Type of variable : %s" % ValueType)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                    # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    print(Devices,Alu)

    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

    ExploredJobs=numpy.array([]).astype(numpy.uint32)
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
    # Duration (D) and Rate (R) statistics accumulated per (Blocks,Threads)
    avgD=numpy.array([]).astype(numpy.float32)
    medD=numpy.array([]).astype(numpy.float32)
    stdD=numpy.array([]).astype(numpy.float32)
    minD=numpy.array([]).astype(numpy.float32)
    maxD=numpy.array([]).astype(numpy.float32)
    avgR=numpy.array([]).astype(numpy.float32)
    medR=numpy.array([]).astype(numpy.float32)
    stdR=numpy.array([]).astype(numpy.float32)
    minR=numpy.array([]).astype(numpy.float32)
    maxR=numpy.array([]).astype(numpy.float32)

    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

        # print Blocks,Threads
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
        ExploredThreads=numpy.append(ExploredThreads,Threads)

        if OutMetrology:
            # Each redo is timed from the host side, around the whole call
            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                start=time.time()
                if GpuStyle=='CUDA':
                    try:
                        InputCU={}
                        InputCU['Iterations']=Iterations
                        InputCU['Steps']=1
                        InputCU['Blocks']=Blocks
                        InputCU['Threads']=Threads
                        InputCU['Device']=Devices[0]
                        InputCU['RNG']=RNG
                        InputCU['ValueType']=ValueType
                        OutputCU=MetropolisCuda(InputCU)
                        Inside=OutputCU['Circle']
                        NewIterations=OutputCU['NewIterations']
                        Duration=OutputCU['Duration']
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        InputCL={}
                        InputCL['Iterations']=Iterations
                        InputCL['Steps']=1
                        InputCL['Blocks']=Blocks
                        InputCL['Threads']=Threads
                        InputCL['Device']=Devices[0]
                        InputCL['RNG']=RNG
                        InputCL['ValueType']=ValueType
                        OutputCL=MetropolisOpenCL(InputCL)
                        Inside=OutputCL['Circle']
                        NewIterations=OutputCL['NewIterations']
                        Duration=OutputCL['Duration']
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                Duration=numpy.append(Duration,time.time()-start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
        else:
            # Timing is performed inside the Metropolis* implementation
            if GpuStyle=='CUDA':
                try:
                    InputCU={}
                    InputCU['Iterations']=Iterations
                    InputCU['Steps']=Redo
                    InputCU['Blocks']=Blocks
                    InputCU['Threads']=Threads
                    InputCU['Device']=Devices[0]
                    InputCU['RNG']=RNG
                    InputCU['ValueType']=ValueType
                    OutputCU=MetropolisCuda(InputCU)
                    Inside=OutputCU['Inside']
                    NewIterations=OutputCU['NewIterations']
                    Duration=OutputCU['Duration']
                except Exception:
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                try:
                    # Release the CUDA context if one is still pushed
                    # (was pycuda.context.pop(), a nonexistent attribute)
                    cuda.Context.pop()
                except Exception:
                    pass
            elif GpuStyle=='OpenCL':
                try:
                    InputCL={}
                    InputCL['Iterations']=Iterations
                    InputCL['Steps']=Redo
                    InputCL['Blocks']=Blocks
                    InputCL['Threads']=Threads
                    InputCL['Device']=Devices[0]
                    InputCL['RNG']=RNG
                    InputCL['ValueType']=ValueType
                    OutputCL=MetropolisOpenCL(InputCL)
                    Inside=OutputCL['Inside']
                    NewIterations=OutputCL['NewIterations']
                    Duration=OutputCL['Duration']
                except Exception:
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
            Rate=NewIterations/Duration
            print("Pi estimation %.8f" % (4./NewIterations*Inside))

        avgD=numpy.append(avgD,numpy.average(Duration))
        medD=numpy.append(medD,numpy.median(Duration))
        stdD=numpy.append(stdD,numpy.std(Duration))
        minD=numpy.append(minD,numpy.min(Duration))
        maxD=numpy.append(maxD,numpy.max(Duration))
        avgR=numpy.append(avgR,numpy.average(Rate))
        medR=numpy.append(medR,numpy.median(Rate))
        stdR=numpy.append(stdR,numpy.std(Rate))
        minR=numpy.append(minR,numpy.min(Rate))
        maxR=numpy.append(maxR,numpy.max(Rate))

        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

        # Results are saved after every (Blocks,Threads) point so that an
        # interrupted exploration still leaves usable data on disk
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

    if Fit:
        # Was FitAndPrint(ExploredJobs,median,Curves): "median" is undefined,
        # the accumulated median-duration array is medD
        FitAndPrint(ExploredJobs,medD,Curves)