Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 195

Historique | Voir | Annoter | Télécharger (31,43 ko)

1
#!/usr/bin/env python3
2

    
3
#
4
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> 
7
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8
#
9
# Thanks to Andreas Klockner for PyCUDA:
10
# http://mathema.tician.de/software/pycuda
11
# Thanks to Andreas Klockner for PyOpenCL:
12
# http://mathema.tician.de/software/pyopencl
13
# 
14

    
15
# 2013-01-01 : problems with launch timeout
16
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17
# Option "Interactive" "0" in /etc/X11/xorg.conf
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import itertools
26
from socket import gethostname
27

    
28
class PenStacle:
29
    """Pentacle of Statistics from data"""
30
    Avg=0
31
    Med=0
32
    Std=0
33
    Min=0
34
    Max=0
35
    def __init__(self,Data):
36
        self.Avg=numpy.average(Data)
37
        self.Med=numpy.median(Data)
38
        self.Std=numpy.std(Data)
39
        self.Max=numpy.max(Data)
40
        self.Min=numpy.min(Data)
41
    def display(self):
42
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
43

    
44
class Experience:
45
    """Metrology for experiences"""
46
    DeviceStyle=''
47
    DeviceId=0
48
    AvgD=0
49
    MedD=0
50
    StdD=0
51
    MinD=0
52
    MaxD=0
53
    AvgR=0
54
    MedR=0
55
    StdR=0
56
    MinR=0
57
    MaxR=0
58
    def __init__(self,DeviceStyle,DeviceId,Iterations):
59
        self.DeviceStyle=DeviceStyle
60
        self.DeviceId=DeviceId
61
        self.Iterations
62
        
63
    def Metrology(self,Data):
64
        Duration=PenStacle(Data)
65
        Rate=PenStacle(Iterations/Data)
66
        print("Duration %s" % Duration)
67
        print("Rate %s" % Rate)
68

    
69

    
70
        
71
def DictionariesAPI():
72
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
73
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
74
    Test={True:1,False:0}
75
    return(Marsaglia,Computing,Test)
76
    
77
# find prime factors of a number
78
# Get for WWW :
79
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
80
def PrimeFactors(x):
81

    
82
    factorlist=numpy.array([]).astype('uint32')
83
    loop=2
84
    while loop<=x:
85
        if x%loop==0:
86
            x/=loop
87
            factorlist=numpy.append(factorlist,[loop])
88
        else:
89
            loop+=1
90
    return factorlist
91

    
92
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
93
# output is thread number
94
def BestThreadsNumber(jobs):
95
    factors=PrimeFactors(jobs)
96
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
97
    threads=1
98
    for factor in matrix.transpose().ravel():
99
        threads=threads*factor
100
        if threads*threads>jobs or threads>512:
101
            break
102
    return(long(threads))
103

    
104
# Predicted Amdahl Law (Reduced with s=1-p)  
105
def AmdahlR(N, T1, p):
106
    return (T1*(1-p+p/N))
107

    
108
# Predicted Amdahl Law
109
def Amdahl(N, T1, s, p):
110
    return (T1*(s+p/N))
111

    
112
# Predicted Mylq Law with first order
113
def Mylq(N, T1,s,c,p):
114
    return (T1*(s+p/N)+c*N)
115

    
116
# Predicted Mylq Law with second order
117
def Mylq2(N, T1,s,c1,c2,p):
118
    return (T1*(s+p/N)+c1*N+c2*N*N)
119

    
120
def KernelCodeCuda():
121
    KERNEL_CODE_CUDA="""
122
#define TCONG 0
123
#define TSHR3 1
124
#define TMWC 2
125
#define TKISS 3
126

127
#define TINT32 0
128
#define TINT64 1
129
#define TFP32 2
130
#define TFP64 3
131

132
#define IFTHEN 1
133

134
// Marsaglia RNG very simple implementation
135

136
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
137
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
138
#define MWC   (znew+wnew)
139
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
140
#define CONG  (jcong=69069*jcong+1234567)
141
#define KISS  ((MWC^CONG)+SHR3)
142

143
#define MWCfp MWC * 2.328306435454494e-10f
144
#define KISSfp KISS * 2.328306435454494e-10f
145
#define SHR3fp SHR3 * 2.328306435454494e-10f
146
#define CONGfp CONG * 2.328306435454494e-10f
147

148
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
149
{
150

151
#if TRNG == TCONG
152
   uint jcong=seed_z+work;
153
#elif TRNG == TSHR3
154
   uint jsr=seed_w+work;
155
#elif TRNG == TMWC
156
   uint z=seed_z+work;
157
   uint w=seed_w+work;
158
#elif TRNG == TKISS
159
   uint jcong=seed_z+work;
160
   uint jsr=seed_w+work;
161
   uint z=seed_z-work;
162
   uint w=seed_w-work;
163
#endif
164

165
   ulong total=0;
166

167
   for (ulong i=0;i<iterations;i++) {
168

169
#if TYPE == TINT32
170
    #define THEONE 1073741824
171
    #if TRNG == TCONG
172
        uint x=CONG>>17 ;
173
        uint y=CONG>>17 ;
174
    #elif TRNG == TSHR3
175
        uint x=SHR3>>17 ;
176
        uint y=SHR3>>17 ;
177
    #elif TRNG == TMWC
178
        uint x=MWC>>17 ;
179
        uint y=MWC>>17 ;
180
    #elif TRNG == TKISS
181
        uint x=KISS>>17 ;
182
        uint y=KISS>>17 ;
183
    #endif
184
#elif TYPE == TINT64
185
    #define THEONE 4611686018427387904
186
    #if TRNG == TCONG
187
        ulong x=(ulong)(CONG>>1) ;
188
        ulong y=(ulong)(CONG>>1) ;
189
    #elif TRNG == TSHR3
190
        ulong x=(ulong)(SHR3>>1) ;
191
        ulong y=(ulong)(SHR3>>1) ;
192
    #elif TRNG == TMWC
193
        ulong x=(ulong)(MWC>>1) ;
194
        ulong y=(ulong)(MWC>>1) ;
195
    #elif TRNG == TKISS
196
        ulong x=(ulong)(KISS>>1) ;
197
        ulong y=(ulong)(KISS>>1) ;
198
    #endif
199
#elif TYPE == TFP32
200
    #define THEONE 1.0f
201
    #if TRNG == TCONG
202
        float x=CONGfp ;
203
        float y=CONGfp ;
204
    #elif TRNG == TSHR3
205
        float x=SHR3fp ;
206
        float y=SHR3fp ;
207
    #elif TRNG == TMWC
208
        float x=MWCfp ;
209
        float y=MWCfp ;
210
    #elif TRNG == TKISS
211
      float x=KISSfp ;
212
      float y=KISSfp ;
213
    #endif
214
#elif TYPE == TFP64
215
    #define THEONE 1.0f
216
    #if TRNG == TCONG
217
        double x=(double)CONGfp ;
218
        double y=(double)CONGfp ;
219
    #elif TRNG == TSHR3
220
        double x=(double)SHR3fp ;
221
        double y=(double)SHR3fp ;
222
    #elif TRNG == TMWC
223
        double x=(double)MWCfp ;
224
        double y=(double)MWCfp ;
225
    #elif TRNG == TKISS
226
        double x=(double)KISSfp ;
227
        double y=(double)KISSfp ;
228
    #endif
229
#endif
230

231
#if TEST == IFTHEN
232
      if ((x*x+y*y) <=THEONE) {
233
         total+=1;
234
      }
235
#else
236
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
237
      total+=inside;
238
#endif
239
   }
240

241
   return(total);
242
}
243

244
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
245
{
246
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
247
   s[blockIdx.x]=total;
248
   __syncthreads();
249

250
}
251

252
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
253
{
254
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
255
   s[threadIdx.x]=total;
256
   __syncthreads();
257

258
}
259

260
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
261
{
262
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
263
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
264
   __syncthreads();
265
}
266

267
"""
268
    return(KERNEL_CODE_CUDA)
269

    
270
def KernelCodeOpenCL():
271
    KERNEL_CODE_OPENCL="""
272
#define TCONG 0
273
#define TSHR3 1
274
#define TMWC 2
275
#define TKISS 3
276

277
#define TINT32 0
278
#define TINT64 1
279
#define TFP32 2
280
#define TFP64 3
281

282
#define IFTHEN 1
283

284
// Marsaglia RNG very simple implementation
285
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
286
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
287

288
#define MWC   (znew+wnew)
289
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
290
#define CONG  (jcong=69069*jcong+1234567)
291
#define KISS  ((MWC^CONG)+SHR3)
292

293
#define MWCfp MWC * 2.328306435454494e-10f
294
#define KISSfp KISS * 2.328306435454494e-10f
295
#define CONGfp CONG * 2.328306435454494e-10f
296
#define SHR3fp SHR3 * 2.328306435454494e-10f
297

298
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
299
{
300

301
#if TRNG == TCONG
302
   uint jcong=seed_z+work;
303
#elif TRNG == TSHR3
304
   uint jsr=seed_w+work;
305
#elif TRNG == TMWC
306
   uint z=seed_z+work;
307
   uint w=seed_w+work;
308
#elif TRNG == TKISS
309
   uint jcong=seed_z+work;
310
   uint jsr=seed_w+work;
311
   uint z=seed_z-work;
312
   uint w=seed_w-work;
313
#endif
314

315
   ulong total=0;
316

317
   for (ulong i=0;i<iterations;i++) {
318

319
#if TYPE == TINT32
320
    #define THEONE 1073741824
321
    #if TRNG == TCONG
322
        uint x=CONG>>17 ;
323
        uint y=CONG>>17 ;
324
    #elif TRNG == TSHR3
325
        uint x=SHR3>>17 ;
326
        uint y=SHR3>>17 ;
327
    #elif TRNG == TMWC
328
        uint x=MWC>>17 ;
329
        uint y=MWC>>17 ;
330
    #elif TRNG == TKISS
331
        uint x=KISS>>17 ;
332
        uint y=KISS>>17 ;
333
    #endif
334
#elif TYPE == TINT64
335
    #define THEONE 4611686018427387904
336
    #if TRNG == TCONG
337
        ulong x=(ulong)(CONG>>1) ;
338
        ulong y=(ulong)(CONG>>1) ;
339
    #elif TRNG == TSHR3
340
        ulong x=(ulong)(SHR3>>1) ;
341
        ulong y=(ulong)(SHR3>>1) ;
342
    #elif TRNG == TMWC
343
        ulong x=(ulong)(MWC>>1) ;
344
        ulong y=(ulong)(MWC>>1) ;
345
    #elif TRNG == TKISS
346
        ulong x=(ulong)(KISS>>1) ;
347
        ulong y=(ulong)(KISS>>1) ;
348
    #endif
349
#elif TYPE == TFP32
350
    #define THEONE 1.0f
351
    #if TRNG == TCONG
352
        float x=CONGfp ;
353
        float y=CONGfp ;
354
    #elif TRNG == TSHR3
355
        float x=SHR3fp ;
356
        float y=SHR3fp ;
357
    #elif TRNG == TMWC
358
        float x=MWCfp ;
359
        float y=MWCfp ;
360
    #elif TRNG == TKISS
361
      float x=KISSfp ;
362
      float y=KISSfp ;
363
    #endif
364
#elif TYPE == TFP64
365
#pragma OPENCL EXTENSION cl_khr_fp64: enable
366
    #define THEONE 1.0f
367
    #if TRNG == TCONG
368
        double x=(double)CONGfp ;
369
        double y=(double)CONGfp ;
370
    #elif TRNG == TSHR3
371
        double x=(double)SHR3fp ;
372
        double y=(double)SHR3fp ;
373
    #elif TRNG == TMWC
374
        double x=(double)MWCfp ;
375
        double y=(double)MWCfp ;
376
    #elif TRNG == TKISS
377
        double x=(double)KISSfp ;
378
        double y=(double)KISSfp ;
379
    #endif
380
#endif
381

382
#if TEST == IFTHEN
383
      if ((x*x+y*y) <= THEONE) {
384
         total+=1;
385
      }
386
#else
387
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
388
      total+=inside;
389
#endif
390
   }
391
   
392
   return(total);
393
}
394

395
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
396
{
397
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
398
   barrier(CLK_GLOBAL_MEM_FENCE);
399
   s[get_global_id(0)]=total;     
400
}
401

402
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
403
{
404
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
405
   barrier(CLK_LOCAL_MEM_FENCE);
406
   s[get_local_id(0)]=total;
407
}
408

409
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
410
{
411
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
412
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
413
   s[get_global_id(0)]=total;
414
}
415

416
"""
417
    return(KERNEL_CODE_OPENCL)
418
    
419
def MetropolisCuda(InputCU):
420

    
421
    print("Inside ",InputCU)
422
    
423
    iterations=InputCU['Iterations']
424
    steps=InputCU['Steps']
425
    blocks=InputCU['Blocks']
426
    threads=InputCU['Threads']
427
    Device=InputCU['Device']
428
    RNG=InputCU['RNG']
429
    ValueType=InputCU['ValueType']
430
    TestType=InputCU['IfThen']
431
    
432
    Marsaglia,Computing,Test=DictionariesAPI()
433
    
434
    try:
435
        # For PyCUDA import
436
        import pycuda.driver as cuda
437
        from pycuda.compiler import SourceModule
438
        
439
        cuda.init()
440
        for Id in range(cuda.Device.count()):
441
            if Id==Device:
442
                XPU=cuda.Device(Id)
443
                print("GPU selected %s" % XPU.name())
444
        print
445

    
446
    except ImportError:
447
        print("Platform does not seem to support CUDA")
448

    
449
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)  
450
    circleCU = cuda.InOut(circle)
451
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
452
    #cuda.memcpy_htod(circleCU, circle)
453

    
454
    Context=XPU.make_context()
455

    
456
    try:
457
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
458
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
459
        # Needed to set the compiler via ccbin for CUDA9 implementation
460
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
461
    except:
462
        print("Compilation seems to break")
463
 
464
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
465
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
466
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")
467

    
468
    MyDuration=numpy.zeros(steps)
469

    
470
    jobs=blocks*threads;
471

    
472
    iterationsCU=numpy.uint64(iterations/jobs)
473
    if iterations%jobs!=0:
474
        iterationsCU+=numpy.uint64(1)
475

    
476
    for i in range(steps):
477
        start_time=time.time()
478

    
479
        try:
480
            MetropolisHybridCU(circleCU,
481
                               numpy.uint64(iterationsCU),
482
                               numpy.uint32(110271),
483
                               numpy.uint32(101008),
484
                               # numpy.uint32(nprnd(2**32)),
485
                               # numpy.uint32(nprnd(2**32)),
486
                               grid=(blocks,1),block=(threads,1,1))
487
        except:
488
            print("Crash during CUDA call")
489

    
490
        elapsed = time.time()-start_time
491
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
492

    
493
        MyDuration[i]=elapsed
494

    
495
    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
496
    print(OutputCU)
497
    Context.pop()
498
    
499
    Context.detach()
500

    
501
    return(OutputCU)
502

    
503
def MetropolisOpenCL(InputCL):
504

    
505
    import pyopencl as cl
506

    
507
    print("Inside ",InputCL)
508
    
509
    iterations=InputCL['Iterations']
510
    steps=InputCL['Steps']
511
    blocks=InputCL['Blocks']
512
    threads=InputCL['Threads']
513
    Device=InputCL['Device']
514
    RNG=InputCL['RNG']  
515
    ValueType=InputCL['ValueType']
516
    TestType=InputCL['IfThen']
517

    
518
    Marsaglia,Computing,Test=DictionariesAPI()
519
    
520
    # Initialisation des variables en les CASTant correctement
521
    Id=0
522
    HasXPU=False
523
    for platform in cl.get_platforms():
524
        for device in platform.get_devices():
525
            if Id==Device:
526
                XPU=device
527
                print("CPU/GPU selected: ",device.name.lstrip())
528
                HasXPU=True
529
            Id+=1
530

    
531
    if HasXPU==False:
532
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
533
        sys.exit()              
534

    
535
        
536
    # Je cree le contexte et la queue pour son execution
537
    try:
538
        ctx = cl.Context(devices=[XPU])
539
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
540
    except:
541
        print("Crash during context creation")
542

    
543
    # Je recupere les flag possibles pour les buffers
544
    mf = cl.mem_flags
545
        
546
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
547
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
548
    
549
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))
550

    
551
    MyDuration=numpy.zeros(steps)
552
  
553
    jobs=blocks*threads;
554

    
555
    iterationsCL=numpy.uint64(iterations/jobs)
556
    if iterations%jobs!=0:
557
        iterationsCL+=1
558

    
559
    for i in range(steps):
560
        start_time=time.time()
561
        if threads == 1:
562
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
563
                                                 circleCL,
564
                                                 numpy.uint64(iterationsCL),
565
                                                 numpy.uint32(110271),
566
                                                 numpy.uint32(101008))
567
                                                 # numpy.uint32(nprnd(2**32)),
568
                                                 # numpy.uint32(nprnd(2**32)))
569
        else:
570
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
571
                                                 circleCL,
572
                                                 numpy.uint64(iterationsCL),
573
                                                 numpy.uint32(110271),
574
                                                 numpy.uint32(101008))
575
                                                 # numpy.uint32(nprnd(2**32)),
576
                                                 # numpy.uint32(nprnd(2**32)))
577

    
578
        CLLaunch.wait()
579
        cl.enqueue_copy(queue, circle, circleCL).wait()
580

    
581
        elapsed = time.time()-start_time
582
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
583
            
584
        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
585
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
586

    
587
        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
588
        MyDuration[i]=elapsed
589
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
590
        # MyPi[i]=numpy.median(AllPi)
591
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
592

    
593
    circleCL.release()
594

    
595
    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
596
    print(OutputCL)
597
    return(OutputCL)
598

    
599

    
600
def FitAndPrint(N,D,Curves):
601

    
602
    from scipy.optimize import curve_fit
603
    import matplotlib.pyplot as plt
604

    
605
    try:
606
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
607

    
608
        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
609
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
610
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
611
        coeffs_Amdahl[0]=D[0]
612
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
613
    except:
614
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))
615

    
616
    try:
617
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
618

    
619
        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
620
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
621
        coeffs_AmdahlR[0]=D[0]
622
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
623

    
624
    except:
625
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
626

    
627
    try:
628
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
629

    
630
        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
631
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
632
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
633
        coeffs_Mylq[0]=D[0]
634
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
635
                                                                coeffs_Mylq[1],
636
                                                                coeffs_Mylq[3],
637
                                                                coeffs_Mylq[2]))
638
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
639
                    coeffs_Mylq[3])
640
    except:
641
        print("Impossible to fit for Mylq law : only %i elements" % len(D))
642

    
643
    try:
644
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
645

    
646
        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
647
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
648
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
649
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
650
        coeffs_Mylq2[0]=D[0]
651
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
652

    
653
    except:
654
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
655

    
656
    if Curves:
657
        plt.xlabel("Number of Threads/work Items")
658
        plt.ylabel("Total Elapsed Time")
659

    
660
        Experience,=plt.plot(N,D,'ro') 
661
    try:
662
        pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
663
        pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
664
    except:
665
        print("Fit curves seem not to be available")
666

    
667
    plt.legend()
668
    plt.show()
669

    
670
if __name__=='__main__':
671
    
672
    # Set defaults values
673
  
674
    # Id of Device : 1 is for first find !
675
    Device=1
676
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
677
    GpuStyle='OpenCL'
678
    # Iterations is integer
679
    Iterations=10000000
680
    # BlocksBlocks in first number of Blocks to explore
681
    BlocksBegin=1
682
    # BlocksEnd is last number of Blocks to explore
683
    BlocksEnd=1
684
    # BlocksStep is the step of Blocks to explore
685
    BlocksStep=1
686
    # ThreadsBlocks in first number of Blocks to explore
687
    ThreadsBegin=1
688
    # ThreadsEnd is last number of Blocks to explore
689
    ThreadsEnd=1
690
    # ThreadsStep is the step of Blocks to explore
691
    ThreadsStep=1
692
    # Redo is the times to redo the test to improve metrology
693
    Redo=1
694
    # OutMetrology is method for duration estimation : False is GPU inside
695
    OutMetrology=False
696
    Metrology='InMetro'
697
    # Curves is True to print the curves
698
    Curves=False
699
    # Fit is True to print the curves
700
    Fit=False
701
    # Inside based on If
702
    IfThen=False
703
    # Marsaglia RNG
704
    RNG='MWC'
705
    # Value type : INT32, INT64, FP32, FP64
706
    ValueType='FP32'
707

    
708
    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
709
    
710
    try:
711
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
712
    except getopt.GetoptError:
713
        print(HowToUse % sys.argv[0])
714
        sys.exit(2)
715

    
716
    # List of Devices
717
    Devices=[]
718
    Alu={}
719
        
720
    for opt, arg in opts:
721
        if opt == '-h':
722
            print(HowToUse % sys.argv[0])
723

    
724
            print("\nInformations about devices detected under OpenCL API:")
725
            # For PyOpenCL import
726
            try:
727
                import pyopencl as cl
728
                Id=0
729
                for platform in cl.get_platforms():
730
                    for device in platform.get_devices():
731
                        #deviceType=cl.device_type.to_string(device.type)
732
                        deviceType="xPU"
733
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
734
                        Id=Id+1
735

    
736
            except:
737
                print("Your platform does not seem to support OpenCL")
738

    
739
            print("\nInformations about devices detected under CUDA API:")
740
            # For PyCUDA import
741
            try:
742
                import pycuda.driver as cuda
743
                cuda.init()
744
                for Id in range(cuda.Device.count()):
745
                    device=cuda.Device(Id)
746
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
747
                print
748
            except:
749
                print("Your platform does not seem to support CUDA")
750
        
751
            sys.exit()
752
        
753
        
754
        elif opt == '-o':
755
            OutMetrology=True
756
            Metrology='OutMetro'
757
        elif opt == '-c':
758
            Curves=True
759
        elif opt == '-k':
760
            IfThen=True
761
        elif opt in ("-d", "--device"):
762
            Devices.append(int(arg))
763
        elif opt in ("-g", "--gpustyle"):
764
            GpuStyle = arg
765
        elif opt in ("-m", "--marsaglia"):
766
            RNG = arg
767
        elif opt in ("-v", "--valuetype"):
768
            ValueType = arg
769
        elif opt in ("-i", "--iterations"):
770
            Iterations = numpy.uint64(arg)
771
        elif opt in ("-b", "--blocksbegin"):
772
            BlocksBegin = int(arg)
773
            BlocksEnd= BlocksBegin
774
        elif opt in ("-e", "--blocksend"):
775
            BlocksEnd = int(arg)
776
        elif opt in ("-s", "--blocksstep"):
777
            BlocksStep = int(arg)
778
        elif opt in ("-f", "--threadsfirst"):
779
            ThreadsBegin = int(arg)
780
            ThreadsEnd = ThreadsBegin
781
        elif opt in ("-l", "--threadslast"):
782
            ThreadsEnd = int(arg)
783
        elif opt in ("-t", "--threadsstep"):
784
            ThreadsStep = int(arg)
785
        elif opt in ("-r", "--redo"):
786
            Redo = int(arg)
787

    
788
    print("Devices Identification : %s" % Devices)
789
    print("GpuStyle used : %s" % GpuStyle)
790
    print("Iterations : %s" % Iterations)
791
    print("Number of Blocks on begin : %s" % BlocksBegin)
792
    print("Number of Blocks on end : %s" % BlocksEnd)
793
    print("Step on Blocks : %s" % BlocksStep)
794
    print("Number of Threads on begin : %s" % ThreadsBegin)
795
    print("Number of Threads on end : %s" % ThreadsEnd)
796
    print("Step on Threads : %s" % ThreadsStep)
797
    print("Number of redo : %s" % Redo)
798
    print("Metrology done out of XPU : %r" % OutMetrology)
799
    print("Type of Marsaglia RNG used : %s" % RNG)
800
    print("Type of variable : %s" % ValueType)
801

    
802
    if GpuStyle=='CUDA':
803
        try:
804
            # For PyCUDA import
805
            import pycuda.driver as cuda
806
            
807
            cuda.init()
808
            for Id in range(cuda.Device.count()):
809
                device=cuda.Device(Id)
810
                print("Device #%i of type GPU : %s" % (Id,device.name()))
811
                if Id in Devices:
812
                    Alu[Id]='GPU'
813
            
814
        except ImportError:
815
            print("Platform does not seem to support CUDA")
816

    
817
    if GpuStyle=='OpenCL':
818
        try:
819
            # For PyOpenCL import
820
            import pyopencl as cl
821
            Id=0
822
            for platform in cl.get_platforms():
823
                for device in platform.get_devices():
824
                    #deviceType=cl.device_type.to_string(device.type)
825
                    deviceType="xPU"
826
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
827

    
828
                    if Id in Devices:
829
                    # Set the Alu as detected Device Type
830
                        Alu[Id]=deviceType
831
                    Id=Id+1
832
        except ImportError:
833
            print("Platform does not seem to support OpenCL")
834

    
835
    print(Devices,Alu)
836
            
837
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
838
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
839
    
840
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
841
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
842
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
843
    avgD=numpy.array([]).astype(numpy.float32)
844
    medD=numpy.array([]).astype(numpy.float32)
845
    stdD=numpy.array([]).astype(numpy.float32)
846
    minD=numpy.array([]).astype(numpy.float32)
847
    maxD=numpy.array([]).astype(numpy.float32)
848
    avgR=numpy.array([]).astype(numpy.float32)
849
    medR=numpy.array([]).astype(numpy.float32)
850
    stdR=numpy.array([]).astype(numpy.float32)
851
    minR=numpy.array([]).astype(numpy.float32)
852
    maxR=numpy.array([]).astype(numpy.float32)
853

    
854
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
855
        
856
        # print Blocks,Threads
857
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
858
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
859
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
860
        ExploredThreads=numpy.append(ExploredThreads,Threads)
861
        
862
        if OutMetrology: 
863
            DurationItem=numpy.array([]).astype(numpy.float32)
864
            Duration=numpy.array([]).astype(numpy.float32)
865
            Rate=numpy.array([]).astype(numpy.float32)
866
            for i in range(Redo):
867
                start=time.time()
868
                if GpuStyle=='CUDA':
869
                    try:
870
                        InputCU={}
871
                        InputCU['Iterations']=Iterations
872
                        InputCU['Steps']=1
873
                        InputCU['Blocks']=Blocks
874
                        InputCU['Threads']=Threads
875
                        InputCU['Device']=Devices[0]
876
                        InputCU['RNG']=RNG
877
                        InputCU['ValueType']=ValueType                    
878
                        InputCU['IfThen']=IfThen                    
879
                        OutputCU=MetropolisCuda(InputCU)
880
                        Inside=OutputCU['Circle']
881
                        NewIterations=OutputCU['NewIterations']
882
                        Duration=OutputCU['Duration']
883
                    except:
884
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
885
                elif GpuStyle=='OpenCL':
886
                    try:
887
                        InputCL={}
888
                        InputCL['Iterations']=Iterations
889
                        InputCL['Steps']=1
890
                        InputCL['Blocks']=Blocks
891
                        InputCL['Threads']=Threads
892
                        InputCL['Device']=Devices[0]
893
                        InputCL['RNG']=RNG
894
                        InputCL['ValueType']=ValueType
895
                        InputCL['IfThen']=IfThen                    
896
                        OutputCL=MetropolisOpenCL(InputCL)
897
                        Inside=OutputCL['Circle']
898
                        NewIterations=OutputCL['NewIterations']
899
                        Duration=OutputCL['Duration']
900
                    except:
901
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
902
                Duration=numpy.append(Duration,time.time()-start)
903
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
904
        else:
905
            if GpuStyle=='CUDA':
906
                try:
907
                    InputCU={}
908
                    InputCU['Iterations']=Iterations
909
                    InputCU['Steps']=Redo
910
                    InputCU['Blocks']=Blocks
911
                    InputCU['Threads']=Threads
912
                    InputCU['Device']=Devices[0]
913
                    InputCU['RNG']=RNG
914
                    InputCU['ValueType']=ValueType
915
                    InputCU['IfThen']=IfThen                    
916
                    OutputCU=MetropolisCuda(InputCU)
917
                    Inside=OutputCU['Inside']
918
                    NewIterations=OutputCU['NewIterations']
919
                    Duration=OutputCU['Duration']
920
                    pycuda.context.pop()
921
                except:
922
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
923
            elif GpuStyle=='OpenCL':
924
                try:
925
                    InputCL={}
926
                    InputCL['Iterations']=Iterations
927
                    InputCL['Steps']=Redo
928
                    InputCL['Blocks']=Blocks
929
                    InputCL['Threads']=Threads
930
                    InputCL['Device']=Devices[0]
931
                    InputCL['RNG']=RNG
932
                    InputCL['ValueType']=ValueType
933
                    InputCL['IfThen']=IfThen
934
                    OutputCL=MetropolisOpenCL(InputCL)
935
                    Inside=OutputCL['Inside']
936
                    NewIterations=OutputCL['NewIterations']
937
                    Duration=OutputCL['Duration']
938
                except:
939
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
940
            Rate=NewIterations/Duration
941
            print("Pi estimation %.8f" % (4./NewIterations*Inside))
942

    
943
            
944
        avgD=numpy.append(avgD,numpy.average(Duration))
945
        medD=numpy.append(medD,numpy.median(Duration))
946
        stdD=numpy.append(stdD,numpy.std(Duration))
947
        minD=numpy.append(minD,numpy.min(Duration))
948
        maxD=numpy.append(maxD,numpy.max(Duration))
949
        avgR=numpy.append(avgR,numpy.average(Rate))
950
        medR=numpy.append(medR,numpy.median(Rate))
951
        stdR=numpy.append(stdR,numpy.std(Rate))
952
        minR=numpy.append(minR,numpy.min(Rate))
953
        maxR=numpy.append(maxR,numpy.max(Rate))
954

    
955
        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
956
        
957
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
958
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
959
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
960

    
961
    if Fit:
962
        FitAndPrint(ExploredJobs,median,Curves)