#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import itertools
from socket import gethostname

class PenStacle:
    """Pentacle of Statistics from data"""
    Avg=0
    Med=0
    Std=0
    Min=0
    Max=0
    def __init__(self,Data):
        self.Avg=numpy.average(Data)
        self.Med=numpy.median(Data)
        self.Std=numpy.std(Data)
        self.Max=numpy.max(Data)
        self.Min=numpy.min(Data)
    def display(self):
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))

class Experience:
    """Metrology for experiments"""
    DeviceStyle=''
    DeviceId=0
    AvgD=0
    MedD=0
    StdD=0
    MinD=0
    MaxD=0
    AvgR=0
    MedR=0
    StdR=0
    MinR=0
    MaxR=0
    def __init__(self,DeviceStyle,DeviceId,Iterations):
        self.DeviceStyle=DeviceStyle
        self.DeviceId=DeviceId
        self.Iterations=Iterations

    def Metrology(self,Data):
        Duration=PenStacle(Data)
        Rate=PenStacle(self.Iterations/Data)
        print("Duration %s" % Duration)
        print("Rate %s" % Rate)
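
# DictionariesAPI maps the textual options (RNG name, value type, test mode)
# to the integer values handed to the kernels through the -DTRNG, -DTYPE and
# -DTEST preprocessor switches.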
def DictionariesAPI():
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
    Test={True:1,False:0}
    return(Marsaglia,Computing,Test)

# find prime factors of a number
# Got from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):

    factorlist=numpy.array([]).astype('uint32')
    loop=2
    while loop<=x:
        if x%loop==0:
            x//=loop
            factorlist=numpy.append(factorlist,[loop])
        else:
            loop+=1
    return factorlist
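
# For instance, PrimeFactors(60) returns the factors [2, 2, 3, 5].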

# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
    factors=PrimeFactors(jobs)
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
    threads=1
    for factor in matrix.transpose().ravel():
        threads=threads*factor
        if threads*threads>jobs or threads>512:
            break
    return(int(threads))
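
# The prime factors are consumed smallest/largest alternately until the
# product squared exceeds the job count (or 512): BestThreadsNumber(1024)
# returns 64, for instance.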

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
    return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
    return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
    return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
    return (T1*(s+p/N)+c1*N+c2*N*N)
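
# In these models T1 is the single-job duration, s the serial fraction,
# p the parallel fraction and the c terms an overhead growing with the
# number of jobs N.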

def KernelCodeCuda():
    KERNEL_CODE_CUDA="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
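
// znew/wnew form Marsaglia's multiply-with-carry pair, SHR3 is a 3-shift
// register generator, CONG a linear congruential generator and KISS combines
// the three. The *fp macros scale the 32-bit output to [0,1) with a constant
// close to 1/2^32.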

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <=THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

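// The three entry points below differ only in the index used to seed each
// worker and to store its result: the block index, the thread index, or the
// global blockDim*blockIdx+threadIdx index.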
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}

"""
    return(KERNEL_CODE_CUDA)

def KernelCodeOpenCL():
    KERNEL_CODE_OPENCL="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

#define IFTHEN 1

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

#if TEST == IFTHEN
      if ((x*x+y*y) <= THEONE) {
         total+=1;
      }
#else
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
#endif
   }

   return(total);
}

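// The OpenCL entry points mirror the CUDA ones: MainLoopGlobal and
// MainLoopHybrid index work by get_global_id(0), MainLoopLocal by
// get_local_id(0); MainLoopHybrid is launched with work-groups of 'Threads'
// work-items.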
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

"""
    return(KERNEL_CODE_OPENCL)

def MetropolisCuda(InputCU):

    print("Inside ",InputCU)

    iterations=InputCU['Iterations']
    steps=InputCU['Steps']
    blocks=InputCU['Blocks']
    threads=InputCU['Threads']
    Device=InputCU['Device']
    RNG=InputCU['RNG']
    ValueType=InputCU['ValueType']
    TestType=InputCU['IfThen']

    Marsaglia,Computing,Test=DictionariesAPI()

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        print()

    except ImportError:
        print("Platform does not seem to support CUDA")

    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCU = cuda.InOut(circle)
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
    #cuda.memcpy_htod(circleCU, circle)

    Context=XPU.make_context()

    try:
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s -DTEST=%s' % (Marsaglia[RNG],Computing[ValueType],Test[TestType])])
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
        # Needed to set the compiler via ccbin for CUDA9 implementation
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
    except:
        print("Compilation seems to break")

    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads;

    iterationsCU=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCU+=numpy.uint64(1)

    for i in range(steps):
        start_time=time.time()

        try:
            MetropolisHybridCU(circleCU,
                               numpy.uint64(iterationsCU),
                               numpy.uint32(110271),
                               numpy.uint32(101008),
                               # numpy.uint32(nprnd(2**32)),
                               # numpy.uint32(nprnd(2**32)),
                               grid=(blocks,1),block=(threads,1,1))
        except:
            print("Crash during CUDA call")

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        MyDuration[i]=elapsed

    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
    print(OutputCU)
    Context.pop()

    Context.detach()

    return(OutputCU)
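
# A minimal sketch of a direct call (assuming a CUDA device 0 is available):
#   InputCU={'Iterations':10000000,'Steps':1,'Blocks':16,'Threads':32,
#            'Device':0,'RNG':'MWC','ValueType':'FP32','IfThen':False}
#   OutputCU=MetropolisCuda(InputCU)
#   print(4.*OutputCU['Inside']/OutputCU['NewIterations'])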

def MetropolisOpenCL(InputCL):

    import pyopencl as cl

    print("Inside ",InputCL)

    iterations=InputCL['Iterations']
    steps=InputCL['Steps']
    blocks=InputCL['Blocks']
    threads=InputCL['Threads']
    Device=InputCL['Device']
    RNG=InputCL['RNG']
    ValueType=InputCL['ValueType']
    TestType=InputCL['IfThen']

    Marsaglia,Computing,Test=DictionariesAPI()

    # Initialise the variables, casting them to the proper types
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
        sys.exit()


    # Create the context and the command queue for execution
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    # Get the possible flags for the buffers
    mf = cl.mem_flags

    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))

    MyDuration=numpy.zeros(steps)

    jobs=blocks*threads;

    iterationsCL=numpy.uint64(iterations/jobs)
    if iterations%jobs!=0:
        iterationsCL+=1

    for i in range(steps):
        start_time=time.time()
        if threads == 1:
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(110271),
                                                 numpy.uint32(101008))
                                                 # numpy.uint32(nprnd(2**32)),
                                                 # numpy.uint32(nprnd(2**32)))
        else:
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                                 circleCL,
                                                 numpy.uint64(iterationsCL),
                                                 numpy.uint32(110271),
                                                 numpy.uint32(101008))
                                                 # numpy.uint32(nprnd(2**32)),
                                                 # numpy.uint32(nprnd(2**32)))

        CLLaunch.wait()
        cl.enqueue_copy(queue, circle, circleCL).wait()

        elapsed = time.time()-start_time
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))

        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
        MyDuration[i]=elapsed
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        # MyPi[i]=numpy.median(AllPi)
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]

    circleCL.release()

    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
    print(OutputCL)
    return(OutputCL)
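
# Note: for OpenCL, 'Blocks' and 'Threads' map to a global size of
# Blocks*Threads work-items split into work-groups of Threads items;
# with Threads==1 the MainLoopGlobal kernel is used instead.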


def FitAndPrint(N,D,Curves):

    from scipy.optimize import curve_fit
    import matplotlib.pyplot as plt

    try:
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
        coeffs_Amdahl[0]=D[0]
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
    except:
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))

    try:
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
        coeffs_AmdahlR[0]=D[0]
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))

    except:
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))

    try:
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
        coeffs_Mylq[0]=D[0]
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                                coeffs_Mylq[1],
                                                                coeffs_Mylq[3],
                                                                coeffs_Mylq[2]))
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                    coeffs_Mylq[3])
    except:
        print("Impossible to fit for Mylq law : only %i elements" % len(D))

    try:
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
        coeffs_Mylq2[0]=D[0]
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))

    except:
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))

    if Curves:
        plt.xlabel("Number of Threads/work Items")
        plt.ylabel("Total Elapsed Time")

        Experience,=plt.plot(N,D,'ro')
        try:
            pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
            pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
        except:
            print("Fit curves seem not to be available")

        plt.legend()
        plt.show()

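# FitAndPrint rescales each fitted law so that T1 matches the measured
# single-job duration D[0] before printing the normalized coefficients.
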
if __name__=='__main__':

    # Set default values

    # Id of Device : 1 is for first find !
    Device=1
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
    GpuStyle='OpenCL'
    # Iterations is integer
    Iterations=10000000
    # BlocksBegin is the first number of Blocks to explore
    BlocksBegin=1
    # BlocksEnd is the last number of Blocks to explore
    BlocksEnd=1
    # BlocksStep is the step of Blocks to explore
    BlocksStep=1
    # ThreadsBegin is the first number of Threads to explore
    ThreadsBegin=1
    # ThreadsEnd is the last number of Threads to explore
    ThreadsEnd=1
    # ThreadsStep is the step of Threads to explore
    ThreadsStep=1
    # Redo is the number of times to redo the test to improve metrology
    Redo=1
    # OutMetrology selects where duration is measured : False means inside the GPU routine
    OutMetrology=False
    Metrology='InMetro'
    # Curves is True to print the curves
    Curves=False
    # Fit is True to fit the results to the performance laws
    Fit=False
    # IfThen is True to count inside points with an if test rather than a ternary
    IfThen=False
    # Marsaglia RNG
    RNG='MWC'
    # Value type : INT32, INT64, FP32, FP64
    ValueType='FP32'

    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
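
    # Typical invocation (illustrative device id and sizes):
    #   python3 PiXPU.py -g OpenCL -d 0 -i 100000000 -b 1 -e 16 -s 1 -f 32 -l 32 -r 3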

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()


        elif opt == '-o':
            OutMetrology=True
            Metrology='OutMetro'
        elif opt == '-c':
            Curves=True
        elif opt == '-k':
            IfThen=True
        elif opt in ("-d", "--device"):
            Devices.append(int(arg))
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-m", "--marsaglia"):
            RNG = arg
        elif opt in ("-v", "--valuetype"):
            ValueType = arg
        elif opt in ("-i", "--iterations"):
            Iterations = numpy.uint64(arg)
        elif opt in ("-b", "--blocksbegin"):
            BlocksBegin = int(arg)
            BlocksEnd= BlocksBegin
        elif opt in ("-e", "--blocksend"):
            BlocksEnd = int(arg)
        elif opt in ("-s", "--blocksstep"):
            BlocksStep = int(arg)
        elif opt in ("-f", "--threadsfirst"):
            ThreadsBegin = int(arg)
            ThreadsEnd = ThreadsBegin
        elif opt in ("-l", "--threadslast"):
            ThreadsEnd = int(arg)
        elif opt in ("-t", "--threadsstep"):
            ThreadsStep = int(arg)
        elif opt in ("-r", "--redo"):
            Redo = int(arg)

    print("Devices Identification : %s" % Devices)
    print("GpuStyle used : %s" % GpuStyle)
    print("Iterations : %s" % Iterations)
    print("Number of Blocks on begin : %s" % BlocksBegin)
    print("Number of Blocks on end : %s" % BlocksEnd)
    print("Step on Blocks : %s" % BlocksStep)
    print("Number of Threads on begin : %s" % ThreadsBegin)
    print("Number of Threads on end : %s" % ThreadsEnd)
    print("Step on Threads : %s" % ThreadsStep)
    print("Number of redo : %s" % Redo)
    print("Metrology done out of XPU : %r" % OutMetrology)
    print("Type of Marsaglia RNG used : %s" % RNG)
    print("Type of variable : %s" % ValueType)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    print(Devices,Alu)

    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

    ExploredJobs=numpy.array([]).astype(numpy.uint32)
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
    avgD=numpy.array([]).astype(numpy.float32)
    medD=numpy.array([]).astype(numpy.float32)
    stdD=numpy.array([]).astype(numpy.float32)
    minD=numpy.array([]).astype(numpy.float32)
    maxD=numpy.array([]).astype(numpy.float32)
    avgR=numpy.array([]).astype(numpy.float32)
    medR=numpy.array([]).astype(numpy.float32)
    stdR=numpy.array([]).astype(numpy.float32)
    minR=numpy.array([]).astype(numpy.float32)
    maxR=numpy.array([]).astype(numpy.float32)

    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

        # print Blocks,Threads
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
        ExploredThreads=numpy.append(ExploredThreads,Threads)

        if OutMetrology:
            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                start=time.time()
                if GpuStyle=='CUDA':
                    try:
                        InputCU={}
                        InputCU['Iterations']=Iterations
                        InputCU['Steps']=1
                        InputCU['Blocks']=Blocks
                        InputCU['Threads']=Threads
                        InputCU['Device']=Devices[0]
                        InputCU['RNG']=RNG
                        InputCU['ValueType']=ValueType
                        InputCU['IfThen']=IfThen
                        OutputCU=MetropolisCuda(InputCU)
                        Inside=OutputCU['Inside']
                        NewIterations=OutputCU['NewIterations']
                        Duration=OutputCU['Duration']
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        InputCL={}
                        InputCL['Iterations']=Iterations
                        InputCL['Steps']=1
                        InputCL['Blocks']=Blocks
                        InputCL['Threads']=Threads
                        InputCL['Device']=Devices[0]
                        InputCL['RNG']=RNG
                        InputCL['ValueType']=ValueType
                        InputCL['IfThen']=IfThen
                        OutputCL=MetropolisOpenCL(InputCL)
                        Inside=OutputCL['Inside']
                        NewIterations=OutputCL['NewIterations']
                        Duration=OutputCL['Duration']
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                Duration=numpy.append(Duration,time.time()-start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
        else:
            if GpuStyle=='CUDA':
                try:
                    InputCU={}
                    InputCU['Iterations']=Iterations
                    InputCU['Steps']=Redo
                    InputCU['Blocks']=Blocks
                    InputCU['Threads']=Threads
                    InputCU['Device']=Devices[0]
                    InputCU['RNG']=RNG
                    InputCU['ValueType']=ValueType
                    InputCU['IfThen']=IfThen
                    OutputCU=MetropolisCuda(InputCU)
                    Inside=OutputCU['Inside']
                    NewIterations=OutputCU['NewIterations']
                    Duration=OutputCU['Duration']
                except:
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
            elif GpuStyle=='OpenCL':
                try:
                    InputCL={}
                    InputCL['Iterations']=Iterations
                    InputCL['Steps']=Redo
                    InputCL['Blocks']=Blocks
                    InputCL['Threads']=Threads
                    InputCL['Device']=Devices[0]
                    InputCL['RNG']=RNG
                    InputCL['ValueType']=ValueType
                    InputCL['IfThen']=IfThen
                    OutputCL=MetropolisOpenCL(InputCL)
                    Inside=OutputCL['Inside']
                    NewIterations=OutputCL['NewIterations']
                    Duration=OutputCL['Duration']
                except:
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
            Rate=NewIterations/Duration
            print("Pi estimation %.8f" % (4./NewIterations*Inside))


        avgD=numpy.append(avgD,numpy.average(Duration))
        medD=numpy.append(medD,numpy.median(Duration))
        stdD=numpy.append(stdD,numpy.std(Duration))
        minD=numpy.append(minD,numpy.min(Duration))
        maxD=numpy.append(maxD,numpy.max(Duration))
        avgR=numpy.append(avgR,numpy.average(Rate))
        medR=numpy.append(medR,numpy.median(Rate))
        stdR=numpy.append(stdR,numpy.std(Rate))
        minR=numpy.append(minR,numpy.min(Rate))
        maxR=numpy.append(maxR,numpy.max(Rate))

        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

    if Fit:
        FitAndPrint(ExploredJobs,medD,Curves)