root / Splutter / GPU / SplutterGPU.py @ 307
#!/usr/bin/env python

#
# Splutter-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2014 : <emmanuel.quemener@ens-lyon.fr>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Marsaglia elements about RNG

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
from socket import gethostname

# Find the prime factors of a number
# Taken from the WWW:
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
def PrimeFactors(x):
  factorlist=numpy.array([]).astype('uint32')
  loop=2
  while loop<=x:
    if x%loop==0:
      # Integer division keeps x an integer under Python 3
      x//=loop
      factorlist=numpy.append(factorlist,[loop])
    else:
      loop+=1
  return factorlist
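# Illustrative example (not part of the original script): the factors come
# back in non-decreasing order as a NumPy array, e.g.
#   PrimeFactors(360) -> [2 2 2 3 3 5]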
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
# output is thread number
def BestThreadsNumber(jobs):
  factors=PrimeFactors(jobs)
  matrix=numpy.append([factors],[factors[::-1]],axis=0)
  threads=1
  for factor in matrix.transpose().ravel():
    threads=threads*factor
    if threads*threads>jobs:
      break
  # int() replaces the Python 2 builtin long()
  return(int(threads))
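# Illustrative example (not part of the original script): the loop stops as
# soon as threads*threads exceeds jobs, so the result may overshoot the
# integer square root, e.g. BestThreadsNumber(256) -> 32 and
# BestThreadsNumber(1024) -> 64.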
# Predicted Amdahl Law (Reduced with s=1-p)
def AmdahlR(N, T1, p):
  return (T1*(1-p+p/N))

# Predicted Amdahl Law
def Amdahl(N, T1, s, p):
  return (T1*(s+p/N))

# Predicted Mylq Law with first order
def Mylq(N, T1,s,c,p):
  return (T1*(s+p/N)+c*N)

# Predicted Mylq Law with second order
def Mylq2(N, T1,s,c1,c2,p):
  return (T1*(s+p/N)+c1*N+c2*N*N)
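# Illustrative example (not part of the original script): with T1=10.0 s,
# a serial fraction s=0.1 and a parallel fraction p=0.9,
# Amdahl(4, 10.0, 0.1, 0.9) = 10.0*(0.1+0.9/4) = 3.25 s on N=4 workers.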
KERNEL_CODE_CUDA="""

// Marsaglia RNG very simple implementation

#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

#define MAX (ulong)4294967296
#define UMAX (uint)2147483648
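// Note (added for clarity, not in the original source): MWC, SHR3 and CONG
// are the multiply-with-carry, 3-shift-register and congruential generators
// of Marsaglia's KISS family; the *fp macros scale a 32-bit draw by ~1/2^32
// to a float in [0,1). MAX is 2^32, used below to map a draw onto the
// splutter vector, and UMAX (2^31) is the wrap limit passed to atomicInc.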

__global__ void SplutterGlobal(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(blockIdx.x);

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // All versions
      uint position=(uint)( ((ulong)MWC*(ulong)space)/MAX );

      // UMAX is set to avoid wrap-around overflow
      atomicInc(&s[position],UMAX);
   }

   __syncthreads();
}
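// Note (added for clarity, not in the original source): each block draws MWC
// in [0,2^32) and maps it onto [0,space); atomicInc makes the concurrent
// increments safe and, per CUDA semantics, wraps a counter back to 0 once it
// reaches UMAX.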

__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
    const ulong size=(ulong)(gridDim.x*blockDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Dense version
      uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
    const ulong size=(ulong)(gridDim.x*blockDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Sparse version
      uint position=(uint)( (ulong)MWC*block/MAX*size+id );

      s[position]++;
   }

   __syncthreads();
}
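// Note (added for clarity, not in the original source): in the Dense kernels
// each work item scatters inside its own contiguous chunk of
// block=space/size cells, i.e. positions in [id*block,(id+1)*block), while
// the Sparse kernels interleave the items with a stride of size cells
// (positions id, size+id, 2*size+id, ...). These variants use plain
// s[position]++, so they rely on each item touching disjoint cells rather
// than on atomics.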

__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(threadIdx.x);
    const ulong size=(ulong)(blockDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Dense version
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)threadIdx.x;
    const ulong size=(ulong)blockDim.x;
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Sparse version
      size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(blockIdx.x);
    const ulong size=(ulong)(gridDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Dense version
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );

      s[position]++;
   }

   __syncthreads();
}

__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
    const ulong id=(ulong)(blockIdx.x);
    const ulong size=(ulong)(gridDim.x);
    const ulong block=(ulong)space/(ulong)size;

    uint z=seed_z-(uint)id;
    uint w=seed_w+(uint)id;

    uint jsr=seed_z;
    uint jcong=seed_w;

   for ( ulong i=0;i<iterations;i++) {

      // Sparse version
      size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id );

      s[position]++;

   }

   //s[blockIdx.x]=blockIdx.x;
   __syncthreads();
}

"""

KERNEL_CODE_OPENCL="""
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable

// Marsaglia RNG very simple implementation
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)

#define MWC   (znew+wnew)
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
#define CONG  (jcong=69069*jcong+1234567)
#define KISS  ((MWC^CONG)+SHR3)

#define CONGfp CONG * 2.328306435454494e-10f
#define SHR3fp SHR3 * 2.328306435454494e-10f
#define MWCfp MWC * 2.328306435454494e-10f
#define KISSfp KISS * 2.328306435454494e-10f

#define MAX (ulong)4294967296

uint rotl(uint value, int shift) {
    return (value << shift) | (value >> (sizeof(value) * CHAR_BIT - shift));
}

uint rotr(uint value, int shift) {
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
}
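// Note (added for clarity, not in the original source): rotl/rotr are
// bit-rotation helpers that do not appear to be called by the kernels below.
// The pragma above requests 32-bit global atomics, used by atomic_inc.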

__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)get_global_id(0);

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Dense version
      __private size_t position=(size_t)( MWC%space );

      atomic_inc(&s[position]);
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}
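// Note (added for clarity, not in the original source): unlike the CUDA
// Dense/Sparse kernels, the three OpenCL kernels all use the same MWC%space
// mapping and differ only in how the work item is identified for seeding
// (global id, local id, or their sum); atomic_inc keeps the concurrent
// increments consistent.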

__kernel void SplutterLocal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)get_local_id(0);

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Dense version
      //__private size_t position=(size_t)( (MWC+id*block)%space );
      __private size_t position=(size_t)( MWC%space );

      atomic_inc(&s[position]);
   }

   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);

}

__kernel void SplutterHybrid(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
{
   __private const ulong id=(ulong)(get_global_id(0)+get_local_id(0));

   __private uint z=seed_z-(uint)id;
   __private uint w=seed_w+(uint)id;

   __private uint jsr=seed_z;
   __private uint jcong=seed_w;

   for (__private ulong i=0;i<iterations;i++) {

      // Dense version
      __private size_t position=(size_t)( MWC%space );

      atomic_inc(&s[position]);
   }

}

"""

def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density,Memory):

  # With PyCUDA autoinit, nothing to do here!

  circleCU = cuda.InOut(circle)

  mod = SourceModule(KERNEL_CODE_CUDA)

  if Density=='Dense':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
  elif Density=='Sparse':
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")
  else:
    MetropolisBlocksCU=mod.get_function("SplutterGlobal")
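  # Note (added for clarity, not in the original source): when Density is
  # neither 'Dense' nor 'Sparse', only the Blocks kernel is defined, so the
  # Threads and Hybrid branches below would fail. The callers in __main__
  # also pass one argument fewer than this signature expects (no Density),
  # which their try/except blocks report only as a generic problem.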
  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()

  MySplutter=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations//jobs)
  else:
    iterationsCL=numpy.uint64(iterations//jobs+1)

  iterationsNew=iterationsCL*jobs

  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
  for i in range(steps):

    start_time=time.time()
    Splutter[:]=0

    print(Splutter,len(Splutter))

    SplutterCU = cuda.InOut(Splutter)

    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))

      print("%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle))
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(SplutterCU,
                         numpy.uint32(len(Splutter)),
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print("%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs//threads,threads,ParaStyle))
    else:
      MetropolisThreadsCU(SplutterCU,
                       numpy.uint32(len(Splutter)),
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print("%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,1,jobs,ParaStyle))
    stop.record()
    stop.synchronize()

#    elapsed = start.time_till(stop)*1e-3
    elapsed = time.time()-start_time

    print(Splutter,sum(Splutter))
    MySplutter[i]=numpy.median(Splutter)
    print(numpy.mean(Splutter),MySplutter[i],numpy.std(Splutter))

    MyDuration[i]=elapsed

    #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    #MyPi[i]=numpy.median(AllPi)
    #print MyPi[i],numpy.std(AllPi),MyDuration[i]

  print(jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
def MetropolisOpenCL(circle,iterations,steps,jobs,
                     ParaStyle,Alu,Device,Memory):

  # Initialise the variables with the proper casts

  MaxMemoryXPU=0
  MinMemoryXPU=0

  if Device==0:
    print("Enter XPU selector based on ALU type: first selected")
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        #deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if not HasXPU:
          XPU=device
          print("XPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name))
          HasXPU=True
          MemoryXPU=deviceMemory

  else:
    print("Enter XPU selector based on device number & ALU type")
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        #deviceType=cl.device_type.to_string(device.type)
        deviceMemory=device.max_mem_alloc_size
        if deviceMemory>MaxMemoryXPU:
          MaxMemoryXPU=deviceMemory
        if deviceMemory<MinMemoryXPU or MinMemoryXPU==0:
          MinMemoryXPU=deviceMemory
        if Id==Device and HasXPU==False:
          XPU=device
          print("CPU/GPU selected with Allocable Memory %i: %s" % (deviceMemory,device.name))
          HasXPU=True
          MemoryXPU=deviceMemory
        Id=Id+1
    if HasXPU==False:
      print("No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1))
      return(0,0,0)
  print("Allocable Memory is %i, between %i and %i " % (MemoryXPU,MinMemoryXPU,MaxMemoryXPU))

  # Create the context and the queue for its execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the possible flags for the buffers
  mf = cl.mem_flags

  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")

  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations//jobs)
  else:
    iterationsCL=numpy.uint64(iterations//jobs+1)

  iterationsNew=numpy.uint64(iterationsCL*jobs)

  MySplutter=numpy.zeros(steps)

  MaxWorks=2**(int)(numpy.log2(MinMemoryXPU/4))
  print(MaxWorks,2**(int)(numpy.log2(MemoryXPU)))
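  # Note (added for clarity, not in the original source): MaxWorks above is
  # the largest power of two of uint32 counters (4 bytes each) that fits in
  # the smallest allocatable buffer reported by the devices; the splutter
  # vector actually allocated below uses the Memory parameter instead.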
  #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
  #Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
  Splutter=numpy.zeros(Memory).astype(numpy.uint32)

  for i in range(steps):

    #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
    #Splutter=numpy.zeros(1024).astype(numpy.uint32)

    #Splutter=numpy.zeros(jobs).astype(numpy.uint32)

    Splutter[:]=0

    print(Splutter,len(Splutter))

    h2d_time=time.time()
    # The kernels update the counters with atomic_inc (read-modify-write),
    # so the buffer is created READ_WRITE
    SplutterCL = cl.Buffer(ctx, mf.READ_WRITE|mf.COPY_HOST_PTR,hostbuf=Splutter)
    print('From Host to Device time %f' % (time.time()-h2d_time))

    start_time=time.time()
    if ParaStyle=='Blocks':
      # Call the OpenCL kernel:
      # (jobs,) is the global work size, None lets the local size be chosen
      # SplutterCL is the splutter vector in CL format
      # len(Splutter) is the size of that vector
      # iterationsCL is the number of iterations per work item
      # the last two arguments are the RNG seeds
      # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
      #                                      SplutterCL,
      #                                      numpy.uint32(len(Splutter)),
      #                                      numpy.uint64(iterationsCL),
      #                                      numpy.uint32(nprnd(2**30/jobs)),
      #                                      numpy.uint32(nprnd(2**30/jobs)))
      CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))

      print("%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle))
    elif ParaStyle=='Hybrid':
      #threads=BestThreadsNumber(jobs)
      threads=BestThreadsNumber(256)
      print("Threads:",threads)
      # In OpenCL, the Global_id has to be set identical to the local_id
      CLLaunch=MetropolisCL.SplutterHybrid(queue,(jobs,),(threads,),
                                           SplutterCL,
                                           numpy.uint32(len(Splutter)),
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))

      print("%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs//threads,threads,ParaStyle))
    else:
      # In OpenCL, the global_id has to be set identical to the local_id
      CLLaunch=MetropolisCL.SplutterLocal(queue,(jobs,),(jobs,),
                                          SplutterCL,
                                          numpy.uint32(len(Splutter)),
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))

      print("%s with %i %s done" % (Alu,jobs,ParaStyle))

    CLLaunch.wait()
    d2h_time=time.time()
    cl.enqueue_copy(queue, Splutter, SplutterCL).wait()
    print('From Device to Host %f' % (time.time()-d2h_time))

#    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
    elapsed = time.time()-start_time
    print('Elapsed compute time %f' % elapsed)

    MyDuration[i]=elapsed
    print(Splutter,sum(Splutter))
    #MySplutter[i]=numpy.median(Splutter)
    #print(numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter))

  SplutterCL.release()

  print(jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
def FitAndPrint(N,D,Curves):

  from scipy.optimize import curve_fit
  import matplotlib.pyplot as plt

  try:
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)

    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
    coeffs_Amdahl[0]=D[0]
    print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
  except:
    print("Impossible to fit the Amdahl law : only %i elements" % len(D))

  try:
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)

    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
    coeffs_AmdahlR[0]=D[0]
    print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % \
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))

  except:
    print("Impossible to fit the Reduced Amdahl law : only %i elements" % len(D))

  try:
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)

    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
    # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
    coeffs_Mylq[0]=D[0]
    print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
                                                            coeffs_Mylq[1],
                                                            coeffs_Mylq[3],
                                                            coeffs_Mylq[2]))
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
                coeffs_Mylq[3])
  except:
    print("Impossible to fit the Mylq law : only %i elements" % len(D))

  try:
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)

    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
    # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
    coeffs_Mylq2[0]=D[0]
    print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % \
          (coeffs_Mylq2[0],coeffs_Mylq2[1],
           coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))

  except:
    print("Impossible to fit the 2nd order Mylq law : only %i elements" % len(D))

  if Curves:
    plt.xlabel("Number of Threads/work Items")
    plt.ylabel("Total Elapsed Time")

    Experience,=plt.plot(N,D,'ro')
    try:
      pAmdahl,=plt.plot(N,D_Amdahl,label="Amdahl law")
      pMylq,=plt.plot(N,D_Mylq,label="Mylq law")
    except:
      print("Fit curves do not seem to be available")

    plt.legend()
    plt.show()
if __name__=='__main__':

  # Set default values

  # Alu can be CPU, GPU or ACCELERATOR
  Alu='CPU'
  # Device Id: 0 selects the first device found, 1,2,... select by rank
  Device=0
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
  GpuStyle='OpenCL'
  # Parallel distribution can be on Threads, Blocks or Hybrid
  ParaStyle='Blocks'
  # Iterations is integer
  Iterations=10000000
  # JobStart is the first number of Jobs to explore
  JobStart=1
  # JobEnd is the last number of Jobs to explore
  JobEnd=16
  # JobStep is the step of Jobs to explore
  JobStep=1
  # Redo is the number of times to redo the test to improve metrology
  Redo=1
  # OutMetrology selects how duration is estimated: False measures inside the GPU routine
  OutMetrology=False
  Metrology='InMetro'
  # Curves is True to print the curves
  Curves=False
  # Fit is True to fit the results against the scaling laws
  Fit=False
  # Memory is the size (in elements) of the splutter vector explored
  Memory=1024

  try:
    opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","memory="])
  except getopt.GetoptError:
    print('%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0])
    sys.exit(2)

  for opt, arg in opts:
    if opt == '-h':
      print('%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0])

      print("\nInformation about devices detected under OpenCL:")
      # For PyOpenCL import
      try:
        import pyopencl as cl
        Id=1
        for platform in cl.get_platforms():
          for device in platform.get_devices():
            #deviceType=cl.device_type.to_string(device.type)
            deviceMemory=device.max_mem_alloc_size
            print("Device #%i from %s with memory %i : %s" % (Id,platform.vendor,deviceMemory,device.name.lstrip()))
            Id=Id+1

        print()
        sys.exit()
      except ImportError:
        print("Your platform does not seem to support OpenCL")

    elif opt == '-o':
      OutMetrology=True
      Metrology='OutMetro'
    elif opt == '-c':
      Curves=True
    elif opt == '-f':
      Fit=True
    elif opt in ("-a", "--alu"):
      Alu = arg
    elif opt in ("-d", "--device"):
      Device = int(arg)
    elif opt in ("-g", "--gpustyle"):
      GpuStyle = arg
    elif opt in ("-p", "--parastyle"):
      ParaStyle = arg
    elif opt in ("-i", "--iterations"):
      Iterations = numpy.uint64(arg)
    elif opt in ("-s", "--jobstart"):
      JobStart = int(arg)
    elif opt in ("-e", "--jobend"):
      JobEnd = int(arg)
    elif opt in ("-t", "--jobstep"):
      JobStep = int(arg)
    elif opt in ("-r", "--redo"):
      Redo = int(arg)
    elif opt in ("-m", "--memory"):
      Memory = int(arg)
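  # Illustrative invocation (not part of the original script), exploring 1 to
  # 16 work items on the default OpenCL device with a 1024-element vector:
  #   python SplutterGPU.py -g OpenCL -p Blocks -i 10000000 -s 1 -e 16 -r 3 -m 1024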
  if Alu=='CPU' and GpuStyle=='CUDA':
    print("Alu can't be CPU for CUDA, set Alu to GPU")
    Alu='GPU'

  if ParaStyle not in ('Blocks','Threads','Hybrid'):
    print("%s does not exist, ParaStyle set to Blocks !" % ParaStyle)
    ParaStyle='Blocks'

  print("Compute unit : %s" % Alu)
  print("Device Identification : %s" % Device)
  print("GpuStyle used : %s" % GpuStyle)
  print("Parallel Style used : %s" % ParaStyle)
  print("Iterations : %s" % Iterations)
  print("Number of threads on start : %s" % JobStart)
  print("Number of threads on end : %s" % JobEnd)
  print("Number of redo : %s" % Redo)
  print("Memory : %s" % Memory)
  print("Metrology done out of CPU/GPU : %r" % OutMetrology)

  if GpuStyle=='CUDA':
    try:
      # For PyCUDA import
      import pycuda.driver as cuda
      import pycuda.gpuarray as gpuarray
      import pycuda.autoinit
      from pycuda.compiler import SourceModule
    except ImportError:
      print("Platform does not seem to support CUDA")

  if GpuStyle=='OpenCL':
    try:
      # For PyOpenCL import
      import pyopencl as cl
      Id=1
      for platform in cl.get_platforms():
        for device in platform.get_devices():
          #deviceType=cl.device_type.to_string(device.type)
          print("Device #%i : %s" % (Id,device.name))
          if Id == Device:
            # Set the Alu as detected Device Type
            Alu='xPU'
          Id=Id+1
    except ImportError:
      print("Platform does not seem to support OpenCL")
  average=numpy.array([]).astype(numpy.float32)
  median=numpy.array([]).astype(numpy.float32)
  stddev=numpy.array([]).astype(numpy.float32)

  ExploredJobs=numpy.array([]).astype(numpy.uint32)

  Jobs=JobStart

  while Jobs <= JobEnd:
    avg,med,std=0,0,0
    ExploredJobs=numpy.append(ExploredJobs,Jobs)
    circle=numpy.zeros(Jobs).astype(numpy.uint64)

    if OutMetrology:
      duration=numpy.array([]).astype(numpy.float32)
      for i in range(Redo):
        start=time.time()
        if GpuStyle=='CUDA':
          try:
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
                                 Memory)
          except:
            print("Problem with %i // computations on Cuda" % Jobs)
        elif GpuStyle=='OpenCL':
          try:
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
                                   Alu,Device,Memory)
          except:
            print("Problem with %i // computations on OpenCL" % Jobs)
        duration=numpy.append(duration,time.time()-start)
      if (a,m,s) != (0,0,0):
        avg=numpy.mean(duration)
        med=numpy.median(duration)
        std=numpy.std(duration)
      else:
        print("Values seem to be wrong...")
    else:
      if GpuStyle=='CUDA':
        try:
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,
                                     Jobs,ParaStyle,Memory)
        except:
          print("Problem with %i // computations on Cuda" % Jobs)
      elif GpuStyle=='OpenCL':
        try:
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,
                                       ParaStyle,Alu,Device,Memory)
        except:
          print("Problem with %i // computations on OpenCL" % Jobs)

    if (avg,med,std) != (0,0,0):
      print("jobs,avg,med,std",Jobs,avg,med,std)
      average=numpy.append(average,avg)
      median=numpy.append(median,med)
      stddev=numpy.append(stddev,std)
    else:
      print("Values seem to be wrong...")
    #THREADS*=2
    if len(average)!=0:
      numpy.savez("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),(ExploredJobs,average,median,stddev))
      ToSave=[ ExploredJobs,average,median,stddev ]
      numpy.savetxt("Splutter_%s_%s_%s_%i_%i_%.8i_Device%i_%s_%s" % (Alu,GpuStyle,ParaStyle,JobStart,JobEnd,Iterations,Device,Metrology,gethostname()),numpy.transpose(ToSave))
    Jobs+=JobStep

  if Fit:
    FitAndPrint(ExploredJobs,median,Curves)
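  # Note (added for clarity, not in the original script): results are saved
  # under a name of the form
  # Splutter_<Alu>_<GpuStyle>_<ParaStyle>_<JobStart>_<JobEnd>_<Iterations>_Device<Device>_<Metrology>_<hostname>,
  # both by numpy.savez (which appends .npz) and as a plain-text table by
  # numpy.savetxt.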