#!/usr/bin/env python

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
from socket import gethostname

Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}

# Find the prime factors of a number
# Got from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      x/=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
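# Illustrative example (not part of the original code): PrimeFactors(360)
# returns the factors [2 2 2 3 3 5], since 360 = 2*2*2*3*3*5.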

# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs or threads>512:
      break
  return(long(threads))
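# Illustrative example: for jobs=96 the successive products are 2, 6, 12;
# since 12*12 > 96 the loop stops and 12 is returned, so the Hybrid launches
# use work groups of 12 threads (96/12 = 8 groups).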

# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+p/N)+c1*N+c2*N*N)
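# Quick summary of the models fitted in FitAndPrint below, in the notation of
# the functions above: Amdahl predicts T(N) = T1*(s + p/N) with s the serial
# and p the parallel fraction; the reduced form forces s = 1-p; Mylq adds a
# linear overhead c*N and Mylq2 a quadratic one c1*N + c2*N*N.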

KERNEL_CODE_CUDA="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f

__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}

__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
   s[blockIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
   s[threadIdx.x]=total;
   __syncthreads();

}

__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
   __syncthreads();
}

"""

KERNEL_CODE_OPENCL="""
#define TCONG 0
#define TSHR3 1
#define TMWC 2
#define TKISS 3

#define TINT32 0
#define TINT64 1
#define TFP32 2
#define TFP64 3

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f
#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f

ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
{

#if TRNG == TCONG
   uint jcong=seed_z+work;
#elif TRNG == TSHR3
   uint jsr=seed_w+work;
#elif TRNG == TMWC
   uint z=seed_z+work;
   uint w=seed_w+work;
#elif TRNG == TKISS
   uint jcong=seed_z+work;
   uint jsr=seed_w+work;
   uint z=seed_z-work;
   uint w=seed_w-work;
#endif

   ulong total=0;

   for (ulong i=0;i<iterations;i++) {

#if TYPE == TINT32
    #define THEONE 1073741824
    #if TRNG == TCONG
        uint x=CONG>>17 ;
        uint y=CONG>>17 ;
    #elif TRNG == TSHR3
        uint x=SHR3>>17 ;
        uint y=SHR3>>17 ;
    #elif TRNG == TMWC
        uint x=MWC>>17 ;
        uint y=MWC>>17 ;
    #elif TRNG == TKISS
        uint x=KISS>>17 ;
        uint y=KISS>>17 ;
    #endif
#elif TYPE == TINT64
    #define THEONE 4611686018427387904
    #if TRNG == TCONG
        ulong x=(ulong)(CONG>>1) ;
        ulong y=(ulong)(CONG>>1) ;
    #elif TRNG == TSHR3
        ulong x=(ulong)(SHR3>>1) ;
        ulong y=(ulong)(SHR3>>1) ;
    #elif TRNG == TMWC
        ulong x=(ulong)(MWC>>1) ;
        ulong y=(ulong)(MWC>>1) ;
    #elif TRNG == TKISS
        ulong x=(ulong)(KISS>>1) ;
        ulong y=(ulong)(KISS>>1) ;
    #endif
#elif TYPE == TFP32
    #define THEONE 1.0f
    #if TRNG == TCONG
        float x=CONGfp ;
        float y=CONGfp ;
    #elif TRNG == TSHR3
        float x=SHR3fp ;
        float y=SHR3fp ;
    #elif TRNG == TMWC
        float x=MWCfp ;
        float y=MWCfp ;
    #elif TRNG == TKISS
      float x=KISSfp ;
      float y=KISSfp ;
    #endif
#elif TYPE == TFP64
#pragma OPENCL EXTENSION cl_khr_fp64: enable
    #define THEONE 1.0f
    #if TRNG == TCONG
        double x=(double)CONGfp ;
        double y=(double)CONGfp ;
    #elif TRNG == TSHR3
        double x=(double)SHR3fp ;
        double y=(double)SHR3fp ;
    #elif TRNG == TMWC
        double x=(double)MWCfp ;
        double y=(double)MWCfp ;
    #elif TRNG == TKISS
        double x=(double)KISSfp ;
        double y=(double)KISSfp ;
    #endif
#endif

      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
      total+=inside;
   }

   return(total);
}

__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}


__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
   barrier(CLK_LOCAL_MEM_FENCE);
   s[get_local_id(0)]=total;
}

__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
{
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
   barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
   s[get_global_id(0)]=total;
}

"""

def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,RNG,ValueType):

  # With PyCUDA autoinit, nothing to do!

  circleCU = cuda.InOut(circle)

  try:
    mod = SourceModule(KERNEL_CODE_CUDA,options=['--compiler-options','-Wall -DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
  except:
    print "Compilation seems to break"

  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")

  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=iterations

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs/threads,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      MetropolisJobsCU(circleCU,
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()

    elapsed = start.time_till(stop)*1e-3

    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]


  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))
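# Note: MetropolisCuda and MetropolisOpenCL return the same 6-uple: mean,
# median and standard deviation of the elapsed times, then of the rates
# Iterations/duration, over the `steps` launches.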


def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
                     RNG,ValueType):

  # Initialise the variables by CASTing them correctly

  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected: ",device.name
          HasXPU=True
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:
          XPU=device
          print "CPU selected: ",device.name
          HasXPU=True
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected: ",device.name.lstrip()
          HasXPU=True
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0,0,0,0)

  # Create the context and the queue for its execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the possible flags for the buffers
  mf = cl.mem_flags

  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)


  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG],Computing[ValueType]))

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=numpy.uint64(iterationsCL*jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=numpy.uint64(iterations)

  for i in range(steps):

    if ParaStyle=='Blocks':
      # Call OpenCL kernel
      # (jobs,) is the global work size
      # None lets OpenCL choose the local work size
      # circleCL is lattice translated in CL format
      # SeedZCL is lattice translated in CL format
      # SeedWCL is lattice translated in CL format
      # step is number of iterations
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                           circleCL,
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL, the global_id must be set identical to the local_id
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))

      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, the global_id must be set identical to the local_id
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]

  circleCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))


def FitAndPrint(N,D,Curves):

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print "Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
  except:
    print "Impossible to fit for Amdahl law : only %i elements" % len(D)

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print "Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])

  except:
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D)

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print "Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2])
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print "Impossible to fit for Mylq law : only %i elements" % len(D)

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print "Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
        (coeffs_Mylq2[0],coeffs_Mylq2[1],
         coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3])

  except:
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D)

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print "Fit curves seem not to be available"

    plt.legend()
    plt.show()

if __name__=='__main__':

  # Set defaults values

  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Id of GPU : 1 is for the first found !
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads or Blocks
  ParaStyle='Blocks'
  # Iterations is integer
  Iterations=100000000
  # JobStart is the first number of Jobs to explore
  JobStart=1
  # JobEnd is the last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology is the method for duration estimation : False means timing inside the GPU
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to print the curves
  Curves=False
  # Fit is True to fit the measures to the Amdahl law
  Fit=False
  # Marsaglia RNG
  RNG='MWC'
  # Value type : INT32, INT64, FP32, FP64
  ValueType='FP32'

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:v:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","marsaglia=","valuetype="])
  except getopt.GetoptError:
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>' % sys.argv[0]
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>' % sys.argv[0]

      print "\nInformation about devices detected under OpenCL:"
      # For PyOpenCL import
      try:
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
          for device in platform.get_devices():
            deviceType=cl.device_type.to_string(device.type)
            print "Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())
            Id=Id+1

        print
        sys.exit()
      except ImportError:
        print "Your platform does not seem to support OpenCL"

    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-c':
      Curves=True
    elif opt == '-f':
      Fit=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-m", "--marsaglia"):
      RNG = arg
    elif opt in ("-v", "--valuetype"):
      ValueType = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)
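  # Example invocation (illustrative only): explore 1 to 64 jobs on the first
  # OpenCL GPU with the KISS RNG on FP32 values, redoing each measure 3 times:
  #   python Pi-GPU.py -g OpenCL -a GPU -p Hybrid -i 100000000 -s 1 -e 64 -r 3 -m KISS -v FP32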

  if Alu=='CPU' and GpuStyle=='CUDA':
    print "Alu can't be CPU for CUDA, set Alu to GPU"
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print "%s does not exist, ParaStyle set to Threads !" % ParaStyle
    ParaStyle='Threads'

  print "Compute unit : %s" % Alu
  print "Device Identification : %s" % Device
  print "GpuStyle used : %s" % GpuStyle
  print "Parallel Style used : %s" % ParaStyle
  print "Iterations : %s" % Iterations
  print "Number of threads on start : %s" % JobStart
  print "Number of threads on end : %s" % JobEnd
  print "Number of redo : %s" % Redo
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
  print "Type of Marsaglia RNG used : %s" % RNG
  print "Type of variable : %s" % ValueType

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print "Platform does not seem to support CUDA"

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          deviceType=cl.device_type.to_string(device.type)
          print "Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip())

          if Id == Device:
            # Set the Alu as detected Device Type
            Alu=deviceType
          Id=Id+1
    except ImportError:
      print "Platform does not seem to support OpenCL"

  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)
  averageRate=numpy.array([]).astype(numpy.float32)
  medianRate=numpy.array([]).astype(numpy.float32)
  stddevRate=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      rate=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s,aR,mR,sR=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,RNG,ValueType)
          except:
            print "Problem with %i // computations on Cuda" % Jobs
        elif GpuStyle=='OpenCL':
          try:
            a,m,s,aR,mR,sR=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,Alu,Device,RNG,ValueType)
          except:
            print "Problem with %i // computations on OpenCL" % Jobs
        duration=numpy.append(duration,time.time()-start)
        rate=numpy.append(rate,Iterations/(time.time()-start))
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
        avgR=numpy.mean(Iterations/duration)
        medR=numpy.median(Iterations/duration)
        stdR=numpy.std(Iterations/duration)
      else:
        print "Values seem to be wrong..."
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std,avgR,medR,stdR=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,RNG,ValueType)
        except:
          print "Problem with %i // computations on Cuda" % Jobs
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std,avgR,medR,stdR=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,RNG,ValueType)
        except:
          print "Problem with %i // computations on OpenCL" % Jobs

    if (avg,med,std) != (0,0,0):
      print "jobs,avg,med,std",Jobs,avg,med,std
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
      averageRate=numpy.append(averageRate,avgR)
      medianRate=numpy.append(medianRate,medR)
      stddevRate=numpy.append(stddevRate,stdR)
    else:
      print "Values seem to be wrong..."
    #THREADS*=2
    if len(average)!=0:
      averageRate=averageRate.astype(int)
      medianRate=medianRate.astype(int)
      stddevRate=stddevRate.astype(int)
      numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev,averageRate,medianRate,stddevRate))
      ToSave=[ ExploredJobs,average,median,stddev,averageRate,medianRate,stddevRate ]
      numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %e %e %e %i %i %i')
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)