Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 158

Historique | Voir | Annoter | Télécharger (30,15 ko)

1
#!/usr/bin/env python3
2

    
3
#
4
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> 
7
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8
#
9
# Thanks to Andreas Klockner for PyCUDA:
10
# http://mathema.tician.de/software/pycuda
11
# Thanks to Andreas Klockner for PyOpenCL:
12
# http://mathema.tician.de/software/pyopencl
13
# 
14

    
15
# 2013-01-01 : problems with launch timeout
16
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17
# Option "Interactive" "0" in /etc/X11/xorg.conf
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import itertools
26
from socket import gethostname
27

    
28
class PenStacle:
29
    """Pentacle of Statistics from data"""
30
    Avg=0
31
    Med=0
32
    Std=0
33
    Min=0
34
    Max=0
35
    def __init__(self,Data):
36
        self.Avg=numpy.average(Data)
37
        self.Med=numpy.median(Data)
38
        self.Std=numpy.std(Data)
39
        self.Max=numpy.max(Data)
40
        self.Min=numpy.min(Data)
41
    def display(self):
42
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
43

    
44
class Experience:
45
    """Metrology for experiences"""
46
    DeviceStyle=''
47
    DeviceId=0
48
    AvgD=0
49
    MedD=0
50
    StdD=0
51
    MinD=0
52
    MaxD=0
53
    AvgR=0
54
    MedR=0
55
    StdR=0
56
    MinR=0
57
    MaxR=0
58
    def __init__(self,DeviceStyle,DeviceId,Iterations):
59
        self.DeviceStyle=DeviceStyle
60
        self.DeviceId=DeviceId
61
        self.Iterations
62
        
63
    def Metrology(self,Data):
64
        Duration=PenStacle(Data)
65
        Rate=PenStacle(Iterations/Data)
66
        print("Duration %s" % Duration)
67
        print("Rate %s" % Rate)
68

    
69

    
70
        
71
def DictionariesAPI():
72
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
73
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
74
    return(Marsaglia,Computing)
75
    
76
# find prime factors of a number
77
# Get for WWW :
78
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
79
def PrimeFactors(x):
80

    
81
    factorlist=numpy.array([]).astype('uint32')
82
    loop=2
83
    while loop<=x:
84
        if x%loop==0:
85
            x/=loop
86
            factorlist=numpy.append(factorlist,[loop])
87
        else:
88
            loop+=1
89
    return factorlist
90

    
91
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
92
# output is thread number
93
def BestThreadsNumber(jobs):
94
    factors=PrimeFactors(jobs)
95
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
96
    threads=1
97
    for factor in matrix.transpose().ravel():
98
        threads=threads*factor
99
        if threads*threads>jobs or threads>512:
100
            break
101
    return(long(threads))
102

    
103
# Predicted Amdahl Law (Reduced with s=1-p)  
104
def AmdahlR(N, T1, p):
105
    return (T1*(1-p+p/N))
106

    
107
# Predicted Amdahl Law
108
def Amdahl(N, T1, s, p):
109
    return (T1*(s+p/N))
110

    
111
# Predicted Mylq Law with first order
112
def Mylq(N, T1,s,c,p):
113
    return (T1*(s+p/N)+c*N)
114

    
115
# Predicted Mylq Law with second order
116
def Mylq2(N, T1,s,c1,c2,p):
117
    return (T1*(s+p/N)+c1*N+c2*N*N)
118

    
119
def KernelCodeCuda():
120
    KERNEL_CODE_CUDA="""
121
#define TCONG 0
122
#define TSHR3 1
123
#define TMWC 2
124
#define TKISS 3
125

126
#define TINT32 0
127
#define TINT64 1
128
#define TFP32 2
129
#define TFP64 3
130

131
// Marsaglia RNG very simple implementation
132

133
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
134
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
135
#define MWC   (znew+wnew)
136
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
137
#define CONG  (jcong=69069*jcong+1234567)
138
#define KISS  ((MWC^CONG)+SHR3)
139

140
#define MWCfp MWC * 2.328306435454494e-10f
141
#define KISSfp KISS * 2.328306435454494e-10f
142
#define SHR3fp SHR3 * 2.328306435454494e-10f
143
#define CONGfp CONG * 2.328306435454494e-10f
144

145
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
146
{
147

148
#if TRNG == TCONG
149
   uint jcong=seed_z+work;
150
#elif TRNG == TSHR3
151
   uint jsr=seed_w+work;
152
#elif TRNG == TMWC
153
   uint z=seed_z+work;
154
   uint w=seed_w+work;
155
#elif TRNG == TKISS
156
   uint jcong=seed_z+work;
157
   uint jsr=seed_w+work;
158
   uint z=seed_z-work;
159
   uint w=seed_w-work;
160
#endif
161

162
   ulong total=0;
163

164
   for (ulong i=0;i<iterations;i++) {
165

166
#if TYPE == TINT32
167
    #define THEONE 1073741824
168
    #if TRNG == TCONG
169
        uint x=CONG>>17 ;
170
        uint y=CONG>>17 ;
171
    #elif TRNG == TSHR3
172
        uint x=SHR3>>17 ;
173
        uint y=SHR3>>17 ;
174
    #elif TRNG == TMWC
175
        uint x=MWC>>17 ;
176
        uint y=MWC>>17 ;
177
    #elif TRNG == TKISS
178
        uint x=KISS>>17 ;
179
        uint y=KISS>>17 ;
180
    #endif
181
#elif TYPE == TINT64
182
    #define THEONE 4611686018427387904
183
    #if TRNG == TCONG
184
        ulong x=(ulong)(CONG>>1) ;
185
        ulong y=(ulong)(CONG>>1) ;
186
    #elif TRNG == TSHR3
187
        ulong x=(ulong)(SHR3>>1) ;
188
        ulong y=(ulong)(SHR3>>1) ;
189
    #elif TRNG == TMWC
190
        ulong x=(ulong)(MWC>>1) ;
191
        ulong y=(ulong)(MWC>>1) ;
192
    #elif TRNG == TKISS
193
        ulong x=(ulong)(KISS>>1) ;
194
        ulong y=(ulong)(KISS>>1) ;
195
    #endif
196
#elif TYPE == TFP32
197
    #define THEONE 1.0f
198
    #if TRNG == TCONG
199
        float x=CONGfp ;
200
        float y=CONGfp ;
201
    #elif TRNG == TSHR3
202
        float x=SHR3fp ;
203
        float y=SHR3fp ;
204
    #elif TRNG == TMWC
205
        float x=MWCfp ;
206
        float y=MWCfp ;
207
    #elif TRNG == TKISS
208
      float x=KISSfp ;
209
      float y=KISSfp ;
210
    #endif
211
#elif TYPE == TFP64
212
    #define THEONE 1.0f
213
    #if TRNG == TCONG
214
        double x=(double)CONGfp ;
215
        double y=(double)CONGfp ;
216
    #elif TRNG == TSHR3
217
        double x=(double)SHR3fp ;
218
        double y=(double)SHR3fp ;
219
    #elif TRNG == TMWC
220
        double x=(double)MWCfp ;
221
        double y=(double)MWCfp ;
222
    #elif TRNG == TKISS
223
        double x=(double)KISSfp ;
224
        double y=(double)KISSfp ;
225
    #endif
226
#endif
227

228
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
229
      total+=inside;
230
   }
231

232
   return(total);
233
}
234

235
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
236
{
237
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
238
   s[blockIdx.x]=total;
239
   __syncthreads();
240

241
}
242

243
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
244
{
245
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
246
   s[threadIdx.x]=total;
247
   __syncthreads();
248

249
}
250

251
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
252
{
253
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
254
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
255
   __syncthreads();
256
}
257

258
"""
259
    return(KERNEL_CODE_CUDA)
260

    
261
def KernelCodeOpenCL():
262
    KERNEL_CODE_OPENCL="""
263
#define TCONG 0
264
#define TSHR3 1
265
#define TMWC 2
266
#define TKISS 3
267

268
#define TINT32 0
269
#define TINT64 1
270
#define TFP32 2
271
#define TFP64 3
272

273
// Marsaglia RNG very simple implementation
274
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
275
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
276

277
#define MWC   (znew+wnew)
278
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
279
#define CONG  (jcong=69069*jcong+1234567)
280
#define KISS  ((MWC^CONG)+SHR3)
281

282
#define MWCfp MWC * 2.328306435454494e-10f
283
#define KISSfp KISS * 2.328306435454494e-10f
284
#define CONGfp CONG * 2.328306435454494e-10f
285
#define SHR3fp SHR3 * 2.328306435454494e-10f
286

287
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
288
{
289

290
#if TRNG == TCONG
291
   uint jcong=seed_z+work;
292
#elif TRNG == TSHR3
293
   uint jsr=seed_w+work;
294
#elif TRNG == TMWC
295
   uint z=seed_z+work;
296
   uint w=seed_w+work;
297
#elif TRNG == TKISS
298
   uint jcong=seed_z+work;
299
   uint jsr=seed_w+work;
300
   uint z=seed_z-work;
301
   uint w=seed_w-work;
302
#endif
303

304
   ulong total=0;
305

306
   for (ulong i=0;i<iterations;i++) {
307

308
#if TYPE == TINT32
309
    #define THEONE 1073741824
310
    #if TRNG == TCONG
311
        uint x=CONG>>17 ;
312
        uint y=CONG>>17 ;
313
    #elif TRNG == TSHR3
314
        uint x=SHR3>>17 ;
315
        uint y=SHR3>>17 ;
316
    #elif TRNG == TMWC
317
        uint x=MWC>>17 ;
318
        uint y=MWC>>17 ;
319
    #elif TRNG == TKISS
320
        uint x=KISS>>17 ;
321
        uint y=KISS>>17 ;
322
    #endif
323
#elif TYPE == TINT64
324
    #define THEONE 4611686018427387904
325
    #if TRNG == TCONG
326
        ulong x=(ulong)(CONG>>1) ;
327
        ulong y=(ulong)(CONG>>1) ;
328
    #elif TRNG == TSHR3
329
        ulong x=(ulong)(SHR3>>1) ;
330
        ulong y=(ulong)(SHR3>>1) ;
331
    #elif TRNG == TMWC
332
        ulong x=(ulong)(MWC>>1) ;
333
        ulong y=(ulong)(MWC>>1) ;
334
    #elif TRNG == TKISS
335
        ulong x=(ulong)(KISS>>1) ;
336
        ulong y=(ulong)(KISS>>1) ;
337
    #endif
338
#elif TYPE == TFP32
339
    #define THEONE 1.0f
340
    #if TRNG == TCONG
341
        float x=CONGfp ;
342
        float y=CONGfp ;
343
    #elif TRNG == TSHR3
344
        float x=SHR3fp ;
345
        float y=SHR3fp ;
346
    #elif TRNG == TMWC
347
        float x=MWCfp ;
348
        float y=MWCfp ;
349
    #elif TRNG == TKISS
350
      float x=KISSfp ;
351
      float y=KISSfp ;
352
    #endif
353
#elif TYPE == TFP64
354
#pragma OPENCL EXTENSION cl_khr_fp64: enable
355
    #define THEONE 1.0f
356
    #if TRNG == TCONG
357
        double x=(double)CONGfp ;
358
        double y=(double)CONGfp ;
359
    #elif TRNG == TSHR3
360
        double x=(double)SHR3fp ;
361
        double y=(double)SHR3fp ;
362
    #elif TRNG == TMWC
363
        double x=(double)MWCfp ;
364
        double y=(double)MWCfp ;
365
    #elif TRNG == TKISS
366
        double x=(double)KISSfp ;
367
        double y=(double)KISSfp ;
368
    #endif
369
#endif
370

371
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
372
      total+=inside;
373
   }
374
   
375
   return(total);
376
}
377

378
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
379
{
380
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
381
   barrier(CLK_GLOBAL_MEM_FENCE);
382
   s[get_global_id(0)]=total;     
383
}
384

385
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
386
{
387
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
388
   barrier(CLK_LOCAL_MEM_FENCE);
389
   s[get_local_id(0)]=total;
390
}
391

392
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
393
{
394
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
395
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
396
   s[get_global_id(0)]=total;
397
}
398

399
"""
400
    return(KERNEL_CODE_OPENCL)
401
    
402
def MetropolisCuda(InputCU):
403

    
404
    print("Inside ",InputCU)
405
    
406
    iterations=InputCU['Iterations']
407
    steps=InputCU['Steps']
408
    blocks=InputCU['Blocks']
409
    threads=InputCU['Threads']
410
    Device=InputCU['Device']
411
    RNG=InputCU['RNG']
412
    ValueType=InputCU['ValueType']
413
    
414
    Marsaglia,Computing=DictionariesAPI()
415
    
416
    try:
417
        # For PyCUDA import
418
        import pycuda.driver as cuda
419
        from pycuda.compiler import SourceModule
420
        
421
        cuda.init()
422
        for Id in range(cuda.Device.count()):
423
            if Id==Device:
424
                XPU=cuda.Device(Id)
425
                print("GPU selected %s" % XPU.name())
426
        print
427

    
428
    except ImportError:
429
        print("Platform does not seem to support CUDA")
430

    
431
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)  
432
    circleCU = cuda.InOut(circle)
433
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
434
    #cuda.memcpy_htod(circleCU, circle)
435

    
436
    Context=XPU.make_context()
437

    
438
    try:
439
        #mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
440
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
441
    except:
442
        print("Compilation seems to broke")
443
 
444
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
445
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
446
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")
447

    
448
    MyDuration=numpy.zeros(steps)
449

    
450
    jobs=blocks*threads;
451

    
452
    iterationsCU=numpy.uint64(iterations/jobs)
453
    if iterations%jobs!=0:
454
        iterationsCU+=numpy.uint64(1)
455

    
456
    for i in range(steps):
457
        start_time=time.time()
458

    
459
        try:
460
            MetropolisHybridCU(circleCU,
461
                               numpy.uint64(iterationsCU),
462
                               numpy.uint32(nprnd(2**32)),
463
                               numpy.uint32(nprnd(2**32)),
464
                               grid=(blocks,1),block=(threads,1,1))
465
        except:
466
            print("Crash during CUDA call")
467

    
468
        elapsed = time.time()-start_time
469
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
470

    
471
        MyDuration[i]=elapsed
472

    
473
    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
474
    print(OutputCU)
475
    Context.pop()
476
    
477
    #Context.detach()
478
    return(OutputCU)
479

    
480
def MetropolisOpenCL(InputCL):
481

    
482
    import pyopencl as cl
483

    
484
    print("Inside ",InputCL)
485
    
486
    iterations=InputCL['Iterations']
487
    steps=InputCL['Steps']
488
    blocks=InputCL['Blocks']
489
    threads=InputCL['Threads']
490
    Device=InputCL['Device']
491
    RNG=InputCL['RNG']
492
    ValueType=InputCL['ValueType']
493
    
494
    Marsaglia,Computing=DictionariesAPI()
495
    
496
    # Initialisation des variables en les CASTant correctement
497
    Id=0
498
    HasXPU=False
499
    for platform in cl.get_platforms():
500
        for device in platform.get_devices():
501
            if Id==Device:
502
                XPU=device
503
                print("CPU/GPU selected: ",device.name.lstrip())
504
                HasXPU=True
505
            Id+=1
506

    
507
    if HasXPU==False:
508
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
509
        sys.exit()              
510

    
511
    # Je cree le contexte et la queue pour son execution
512
    try:
513
        ctx = cl.Context([XPU])
514
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
515
    except:
516
        print("Crash during context creation")
517

    
518
    # Je recupere les flag possibles pour les buffers
519
    mf = cl.mem_flags
520
        
521
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
522
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
523
    
524
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG],Computing[ValueType]))
525

    
526
    MyDuration=numpy.zeros(steps)
527
  
528
    jobs=blocks*threads;
529

    
530
    iterationsCL=numpy.uint64(iterations/jobs)
531
    if iterations%jobs!=0:
532
        iterationsCL+=1
533

    
534
    for i in range(steps):
535
        start_time=time.time()
536
        if threads == 1:
537
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
538
                                                 circleCL,
539
                                                 numpy.uint64(iterationsCL),
540
                                                 numpy.uint32(nprnd(2**32)),
541
                                                 numpy.uint32(nprnd(2**32)))
542
        else:
543
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
544
                                                 circleCL,
545
                                                 numpy.uint64(iterationsCL),
546
                                                 numpy.uint32(nprnd(2**32)),
547
                                                 numpy.uint32(nprnd(2**32)))
548

    
549
        CLLaunch.wait()
550
        cl.enqueue_copy(queue, circle, circleCL).wait()
551

    
552
        elapsed = time.time()-start_time
553
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
554
            
555
        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
556
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
557

    
558
        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
559
        MyDuration[i]=elapsed
560
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
561
        # MyPi[i]=numpy.median(AllPi)
562
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
563

    
564
    circleCL.release()
565

    
566
    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
567
    print(OutputCL)
568
    return(OutputCL)
569

    
570

    
571
def FitAndPrint(N,D,Curves):
572

    
573
    from scipy.optimize import curve_fit
574
    import matplotlib.pyplot as plt
575

    
576
    try:
577
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
578

    
579
        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
580
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
581
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
582
        coeffs_Amdahl[0]=D[0]
583
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
584
    except:
585
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))
586

    
587
    try:
588
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
589

    
590
        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
591
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
592
        coeffs_AmdahlR[0]=D[0]
593
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
594

    
595
    except:
596
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
597

    
598
    try:
599
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
600

    
601
        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
602
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
603
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
604
        coeffs_Mylq[0]=D[0]
605
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
606
                                                                coeffs_Mylq[1],
607
                                                                coeffs_Mylq[3],
608
                                                                coeffs_Mylq[2]))
609
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
610
                    coeffs_Mylq[3])
611
    except:
612
        print("Impossible to fit for Mylq law : only %i elements" % len(D))
613

    
614
    try:
615
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
616

    
617
        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
618
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
619
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
620
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
621
        coeffs_Mylq2[0]=D[0]
622
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
623

    
624
    except:
625
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
626

    
627
    if Curves:
628
        plt.xlabel("Number of Threads/work Items")
629
        plt.ylabel("Total Elapsed Time")
630

    
631
        Experience,=plt.plot(N,D,'ro') 
632
    try:
633
        pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
634
        pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
635
    except:
636
        print("Fit curves seem not to be available")
637

    
638
    plt.legend()
639
    plt.show()
640

    
641
if __name__=='__main__':
642
    
643
    # Set defaults values
644
  
645
    # Id of Device : 1 is for first find !
646
    Device=1
647
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
648
    GpuStyle='OpenCL'
649
    # Iterations is integer
650
    Iterations=10000000
651
    # BlocksBlocks in first number of Blocks to explore
652
    BlocksBegin=1
653
    # BlocksEnd is last number of Blocks to explore
654
    BlocksEnd=1
655
    # BlocksStep is the step of Blocks to explore
656
    BlocksStep=1
657
    # ThreadsBlocks in first number of Blocks to explore
658
    ThreadsBegin=1
659
    # ThreadsEnd is last number of Blocks to explore
660
    ThreadsEnd=1
661
    # ThreadsStep is the step of Blocks to explore
662
    ThreadsStep=1
663
    # Redo is the times to redo the test to improve metrology
664
    Redo=1
665
    # OutMetrology is method for duration estimation : False is GPU inside
666
    OutMetrology=False
667
    Metrology='InMetro'
668
    # Curves is True to print the curves
669
    Curves=False
670
    # Fit is True to print the curves
671
    Fit=False
672
    # Marsaglia RNG
673
    RNG='MWC'
674
    # Value type : INT32, INT64, FP32, FP64
675
    ValueType='FP32'
676

    
677
    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
678
    
679
    try:
680
        opts, args = getopt.getopt(sys.argv[1:],"hocg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
681
    except getopt.GetoptError:
682
        print(HowToUse % sys.argv[0])
683
        sys.exit(2)
684

    
685
    # List of Devices
686
    Devices=[]
687
    Alu={}
688
        
689
    for opt, arg in opts:
690
        if opt == '-h':
691
            print(HowToUse % sys.argv[0])
692

    
693
            print("\nInformations about devices detected under OpenCL API:")
694
            # For PyOpenCL import
695
            try:
696
                import pyopencl as cl
697
                Id=0
698
                for platform in cl.get_platforms():
699
                    for device in platform.get_devices():
700
                        #deviceType=cl.device_type.to_string(device.type)
701
                        deviceType="xPU"
702
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
703
                        Id=Id+1
704

    
705
            except:
706
                print("Your platform does not seem to support OpenCL")
707

    
708
            print("\nInformations about devices detected under CUDA API:")
709
            # For PyCUDA import
710
            try:
711
                import pycuda.driver as cuda
712
                cuda.init()
713
                for Id in range(cuda.Device.count()):
714
                    device=cuda.Device(Id)
715
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
716
                print
717
            except:
718
                print("Your platform does not seem to support CUDA")
719
        
720
            sys.exit()
721
        
722
        
723
        elif opt == '-o':
724
            OutMetrology=True
725
            Metrology='OutMetro'
726
        elif opt == '-c':
727
            Curves=True
728
        elif opt in ("-d", "--device"):
729
            Devices.append(int(arg))
730
        elif opt in ("-g", "--gpustyle"):
731
            GpuStyle = arg
732
        elif opt in ("-m", "--marsaglia"):
733
            RNG = arg
734
        elif opt in ("-v", "--valuetype"):
735
            ValueType = arg
736
        elif opt in ("-i", "--iterations"):
737
            Iterations = numpy.uint64(arg)
738
        elif opt in ("-b", "--blocksbegin"):
739
            BlocksBegin = int(arg)
740
        elif opt in ("-e", "--blocksend"):
741
            BlocksEnd = int(arg)
742
        elif opt in ("-s", "--blocksstep"):
743
            BlocksStep = int(arg)
744
        elif opt in ("-f", "--threadsfirst"):
745
            ThreadsBegin = int(arg)
746
        elif opt in ("-l", "--threadslast"):
747
            ThreadsEnd = int(arg)
748
        elif opt in ("-t", "--threadsstep"):
749
            ThreadsStep = int(arg)
750
        elif opt in ("-r", "--redo"):
751
            Redo = int(arg)
752

    
753
    print("Devices Identification : %s" % Devices)
754
    print("GpuStyle used : %s" % GpuStyle)
755
    print("Iterations : %s" % Iterations)
756
    print("Number of Blocks on begin : %s" % BlocksBegin)
757
    print("Number of Blocks on end : %s" % BlocksEnd)
758
    print("Step on Blocks : %s" % BlocksStep)
759
    print("Number of Threads on begin : %s" % ThreadsBegin)
760
    print("Number of Threads on end : %s" % ThreadsEnd)
761
    print("Step on Threads : %s" % ThreadsStep)
762
    print("Number of redo : %s" % Redo)
763
    print("Metrology done out of XPU : %r" % OutMetrology)
764
    print("Type of Marsaglia RNG used : %s" % RNG)
765
    print("Type of variable : %s" % ValueType)
766

    
767
    if GpuStyle=='CUDA':
768
        try:
769
            # For PyCUDA import
770
            import pycuda.driver as cuda
771
            
772
            cuda.init()
773
            for Id in range(cuda.Device.count()):
774
                device=cuda.Device(Id)
775
                print("Device #%i of type GPU : %s" % (Id,device.name()))
776
                if Id in Devices:
777
                    Alu[Id]='GPU'
778
            
779
        except ImportError:
780
            print("Platform does not seem to support CUDA")
781

    
782
    if GpuStyle=='OpenCL':
783
        try:
784
            # For PyOpenCL import
785
            import pyopencl as cl
786
            Id=0
787
            for platform in cl.get_platforms():
788
                for device in platform.get_devices():
789
                    #deviceType=cl.device_type.to_string(device.type)
790
                    deviceType="xPU"
791
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
792

    
793
                    if Id in Devices:
794
                    # Set the Alu as detected Device Type
795
                        Alu[Id]=deviceType
796
                    Id=Id+1
797
        except ImportError:
798
            print("Platform does not seem to support OpenCL")
799

    
800
    print(Devices,Alu)
801
            
802
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
803
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
804
    
805
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
806
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
807
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
808
    avgD=numpy.array([]).astype(numpy.float32)
809
    medD=numpy.array([]).astype(numpy.float32)
810
    stdD=numpy.array([]).astype(numpy.float32)
811
    minD=numpy.array([]).astype(numpy.float32)
812
    maxD=numpy.array([]).astype(numpy.float32)
813
    avgR=numpy.array([]).astype(numpy.float32)
814
    medR=numpy.array([]).astype(numpy.float32)
815
    stdR=numpy.array([]).astype(numpy.float32)
816
    minR=numpy.array([]).astype(numpy.float32)
817
    maxR=numpy.array([]).astype(numpy.float32)
818

    
819
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
820
        
821
        # print Blocks,Threads
822
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
823
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
824
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
825
        ExploredThreads=numpy.append(ExploredThreads,Threads)
826
        
827
        if OutMetrology: 
828
            DurationItem=numpy.array([]).astype(numpy.float32)
829
            Duration=numpy.array([]).astype(numpy.float32)
830
            Rate=numpy.array([]).astype(numpy.float32)
831
            for i in range(Redo):
832
                start=time.time()
833
                if GpuStyle=='CUDA':
834
                    try:
835
                        InputCU={}
836
                        InputCU['Iterations']=Iterations
837
                        InputCU['Steps']=1
838
                        InputCU['Blocks']=Blocks
839
                        InputCU['Threads']=Threads
840
                        InputCU['Device']=Devices[0]
841
                        InputCU['RNG']=RNG
842
                        InputCU['ValueType']=ValueType                    
843
                        OutputCU=MetropolisCuda(InputCU)
844
                        Inside=OutputCU['Circle']
845
                        NewIterations=OutputCU['NewIterations']
846
                        Duration=OutputCU['Duration']
847
                    except:
848
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
849
                elif GpuStyle=='OpenCL':
850
                    try:
851
                        InputCL={}
852
                        InputCL['Iterations']=Iterations
853
                        InputCL['Steps']=1
854
                        InputCL['Blocks']=Blocks
855
                        InputCL['Threads']=Threads
856
                        InputCL['Device']=Devices[0]
857
                        InputCL['RNG']=RNG
858
                        InputCL['ValueType']=ValueType                    
859
                        OutputCL=MetropolisOpenCL(InputCL)
860
                        Inside=OutputCL['Circle']
861
                        NewIterations=OutputCL['NewIterations']
862
                        Duration=OutputCL['Duration']
863
                    except:
864
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
865
                Duration=numpy.append(Duration,time.time()-start)
866
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
867
        else:
868
            if GpuStyle=='CUDA':
869
                try:
870
                    InputCU={}
871
                    InputCU['Iterations']=Iterations
872
                    InputCU['Steps']=Redo
873
                    InputCU['Blocks']=Blocks
874
                    InputCU['Threads']=Threads
875
                    InputCU['Device']=Devices[0]
876
                    InputCU['RNG']=RNG
877
                    InputCU['ValueType']=ValueType
878
                    OutputCU=MetropolisCuda(InputCU)
879
                    Inside=OutputCU['Inside']
880
                    NewIterations=OutputCU['NewIterations']
881
                    Duration=OutputCU['Duration']
882
                except:
883
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
884
                try:
885
                    pycuda.context.pop()
886
                except:
887
                    pass
888
            elif GpuStyle=='OpenCL':
889
                try:
890
                    InputCL={}
891
                    InputCL['Iterations']=Iterations
892
                    InputCL['Steps']=Redo
893
                    InputCL['Blocks']=Blocks
894
                    InputCL['Threads']=Threads
895
                    InputCL['Device']=Devices[0]
896
                    InputCL['RNG']=RNG
897
                    InputCL['ValueType']=ValueType
898
                    OutputCL=MetropolisOpenCL(InputCL)
899
                    Inside=OutputCL['Inside']
900
                    NewIterations=OutputCL['NewIterations']
901
                    Duration=OutputCL['Duration']
902
                except:
903
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
904
            Rate=NewIterations/Duration
905
            print("Pi estimation %.8f" % (4./NewIterations*Inside))
906

    
907
            
908
        avgD=numpy.append(avgD,numpy.average(Duration))
909
        medD=numpy.append(medD,numpy.median(Duration))
910
        stdD=numpy.append(stdD,numpy.std(Duration))
911
        minD=numpy.append(minD,numpy.min(Duration))
912
        maxD=numpy.append(maxD,numpy.max(Duration))
913
        avgR=numpy.append(avgR,numpy.average(Rate))
914
        medR=numpy.append(medR,numpy.median(Rate))
915
        stdR=numpy.append(stdR,numpy.std(Rate))
916
        minR=numpy.append(minR,numpy.min(Rate))
917
        maxR=numpy.append(maxR,numpy.max(Rate))
918

    
919
        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
920
        
921
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
922
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
923
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
924

    
925
    if Fit:
926
        FitAndPrint(ExploredJobs,median,Curves)