Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 246

Historique | Voir | Annoter | Télécharger (31,3 ko)

1
#!/usr/bin/env python3
2

    
3
#
4
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> 
7
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8
#
9
# Thanks to Andreas Klockner for PyCUDA:
10
# http://mathema.tician.de/software/pycuda
11
# Thanks to Andreas Klockner for PyOpenCL:
12
# http://mathema.tician.de/software/pyopencl
13
# 
14

    
15
# 2013-01-01 : problems with launch timeout
16
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17
# Option "Interactive" "0" in /etc/X11/xorg.conf
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import itertools
26
from socket import gethostname
27

    
28
class PenStacle:
29
    """Pentacle of Statistics from data"""
30
    Avg=0
31
    Med=0
32
    Std=0
33
    Min=0
34
    Max=0
35
    def __init__(self,Data):
36
        self.Avg=numpy.average(Data)
37
        self.Med=numpy.median(Data)
38
        self.Std=numpy.std(Data)
39
        self.Max=numpy.max(Data)
40
        self.Min=numpy.min(Data)
41
    def display(self):
42
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
43

    
44
class Experience:
45
    """Metrology for experiences"""
46
    DeviceStyle=''
47
    DeviceId=0
48
    AvgD=0
49
    MedD=0
50
    StdD=0
51
    MinD=0
52
    MaxD=0
53
    AvgR=0
54
    MedR=0
55
    StdR=0
56
    MinR=0
57
    MaxR=0
58
    def __init__(self,DeviceStyle,DeviceId,Iterations):
59
        self.DeviceStyle=DeviceStyle
60
        self.DeviceId=DeviceId
61
        self.Iterations
62
        
63
    def Metrology(self,Data):
64
        Duration=PenStacle(Data)
65
        Rate=PenStacle(Iterations/Data)
66
        print("Duration %s" % Duration)
67
        print("Rate %s" % Rate)
68

    
69

    
70
        
71
def DictionariesAPI():
72
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
73
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
74
    Test={True:1,False:0}
75
    return(Marsaglia,Computing,Test)
76
    
77
# find prime factors of a number
78
# Get for WWW :
79
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
80
def PrimeFactors(x):
81

    
82
    factorlist=numpy.array([]).astype('uint32')
83
    loop=2
84
    while loop<=x:
85
        if x%loop==0:
86
            x/=loop
87
            factorlist=numpy.append(factorlist,[loop])
88
        else:
89
            loop+=1
90
    return factorlist
91

    
92
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
93
# output is thread number
94
def BestThreadsNumber(jobs):
95
    factors=PrimeFactors(jobs)
96
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
97
    threads=1
98
    for factor in matrix.transpose().ravel():
99
        threads=threads*factor
100
        if threads*threads>jobs or threads>512:
101
            break
102
    return(long(threads))
103

    
104
# Predicted Amdahl Law (Reduced with s=1-p)  
105
def AmdahlR(N, T1, p):
106
    return (T1*(1-p+p/N))
107

    
108
# Predicted Amdahl Law
109
def Amdahl(N, T1, s, p):
110
    return (T1*(s+p/N))
111

    
112
# Predicted Mylq Law with first order
113
def Mylq(N, T1,s,c,p):
114
    return (T1*(s+p/N)+c*N)
115

    
116
# Predicted Mylq Law with second order
117
def Mylq2(N, T1,s,c1,c2,p):
118
    return (T1*(s+p/N)+c1*N+c2*N*N)
119

    
120
def KernelCodeCuda():
121
    KERNEL_CODE_CUDA="""
122
#define TCONG 0
123
#define TSHR3 1
124
#define TMWC 2
125
#define TKISS 3
126

127
#define TINT32 0
128
#define TINT64 1
129
#define TFP32 2
130
#define TFP64 3
131

132
#define IFTHEN 1
133

134
// Marsaglia RNG very simple implementation
135

136
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
137
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
138
#define MWC   (znew+wnew)
139
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
140
#define CONG  (jcong=69069*jcong+1234567)
141
#define KISS  ((MWC^CONG)+SHR3)
142

143
#define MWCfp MWC * 2.328306435454494e-10f
144
#define KISSfp KISS * 2.328306435454494e-10f
145
#define SHR3fp SHR3 * 2.328306435454494e-10f
146
#define CONGfp CONG * 2.328306435454494e-10f
147

148
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
149
{
150

151
#if TRNG == TCONG
152
   uint jcong=seed_z+work;
153
#elif TRNG == TSHR3
154
   uint jsr=seed_w+work;
155
#elif TRNG == TMWC
156
   uint z=seed_z+work;
157
   uint w=seed_w+work;
158
#elif TRNG == TKISS
159
   uint jcong=seed_z+work;
160
   uint jsr=seed_w+work;
161
   uint z=seed_z-work;
162
   uint w=seed_w-work;
163
#endif
164

165
   ulong total=0;
166

167
   for (ulong i=0;i<iterations;i++) {
168

169
#if TYPE == TINT32
170
    #define THEONE 1073741824
171
    #if TRNG == TCONG
172
        uint x=CONG>>17 ;
173
        uint y=CONG>>17 ;
174
    #elif TRNG == TSHR3
175
        uint x=SHR3>>17 ;
176
        uint y=SHR3>>17 ;
177
    #elif TRNG == TMWC
178
        uint x=MWC>>17 ;
179
        uint y=MWC>>17 ;
180
    #elif TRNG == TKISS
181
        uint x=KISS>>17 ;
182
        uint y=KISS>>17 ;
183
    #endif
184
#elif TYPE == TINT64
185
    #define THEONE 4611686018427387904
186
    #if TRNG == TCONG
187
        ulong x=(ulong)(CONG>>1) ;
188
        ulong y=(ulong)(CONG>>1) ;
189
    #elif TRNG == TSHR3
190
        ulong x=(ulong)(SHR3>>1) ;
191
        ulong y=(ulong)(SHR3>>1) ;
192
    #elif TRNG == TMWC
193
        ulong x=(ulong)(MWC>>1) ;
194
        ulong y=(ulong)(MWC>>1) ;
195
    #elif TRNG == TKISS
196
        ulong x=(ulong)(KISS>>1) ;
197
        ulong y=(ulong)(KISS>>1) ;
198
    #endif
199
#elif TYPE == TFP32
200
    #define THEONE 1.0f
201
    #if TRNG == TCONG
202
        float x=CONGfp ;
203
        float y=CONGfp ;
204
    #elif TRNG == TSHR3
205
        float x=SHR3fp ;
206
        float y=SHR3fp ;
207
    #elif TRNG == TMWC
208
        float x=MWCfp ;
209
        float y=MWCfp ;
210
    #elif TRNG == TKISS
211
      float x=KISSfp ;
212
      float y=KISSfp ;
213
    #endif
214
#elif TYPE == TFP64
215
    #define THEONE 1.0f
216
    #if TRNG == TCONG
217
        double x=(double)CONGfp ;
218
        double y=(double)CONGfp ;
219
    #elif TRNG == TSHR3
220
        double x=(double)SHR3fp ;
221
        double y=(double)SHR3fp ;
222
    #elif TRNG == TMWC
223
        double x=(double)MWCfp ;
224
        double y=(double)MWCfp ;
225
    #elif TRNG == TKISS
226
        double x=(double)KISSfp ;
227
        double y=(double)KISSfp ;
228
    #endif
229
#endif
230

231
#if TEST == IFTHEN
232
      if ((x*x+y*y) <=THEONE) {
233
         total+=1;
234
      }
235
#else
236
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
237
      total+=inside;
238
#endif
239
   }
240

241
   return(total);
242
}
243

244
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
245
{
246
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
247
   s[blockIdx.x]=total;
248
   __syncthreads();
249

250
}
251

252
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
253
{
254
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
255
   s[threadIdx.x]=total;
256
   __syncthreads();
257

258
}
259

260
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
261
{
262
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
263
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
264
   __syncthreads();
265
}
266

267
"""
268
    return(KERNEL_CODE_CUDA)
269

    
270
def KernelCodeOpenCL():
271
    KERNEL_CODE_OPENCL="""
272
#define TCONG 0
273
#define TSHR3 1
274
#define TMWC 2
275
#define TKISS 3
276

277
#define TINT32 0
278
#define TINT64 1
279
#define TFP32 2
280
#define TFP64 3
281

282
#define IFTHEN 1
283

284
// Marsaglia RNG very simple implementation
285
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
286
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
287

288
#define MWC   (znew+wnew)
289
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
290
#define CONG  (jcong=69069*jcong+1234567)
291
#define KISS  ((MWC^CONG)+SHR3)
292

293
#define MWCfp MWC * 2.328306435454494e-10f
294
#define KISSfp KISS * 2.328306435454494e-10f
295
#define CONGfp CONG * 2.328306435454494e-10f
296
#define SHR3fp SHR3 * 2.328306435454494e-10f
297

298
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
299
{
300

301
#if TRNG == TCONG
302
   uint jcong=seed_z+work;
303
#elif TRNG == TSHR3
304
   uint jsr=seed_w+work;
305
#elif TRNG == TMWC
306
   uint z=seed_z+work;
307
   uint w=seed_w+work;
308
#elif TRNG == TKISS
309
   uint jcong=seed_z+work;
310
   uint jsr=seed_w+work;
311
   uint z=seed_z-work;
312
   uint w=seed_w-work;
313
#endif
314

315
   ulong total=0;
316

317
   for (ulong i=0;i<iterations;i++) {
318

319
#if TYPE == TINT32
320
    #define THEONE 1073741824
321
    #if TRNG == TCONG
322
        uint x=CONG>>17 ;
323
        uint y=CONG>>17 ;
324
    #elif TRNG == TSHR3
325
        uint x=SHR3>>17 ;
326
        uint y=SHR3>>17 ;
327
    #elif TRNG == TMWC
328
        uint x=MWC>>17 ;
329
        uint y=MWC>>17 ;
330
    #elif TRNG == TKISS
331
        uint x=KISS>>17 ;
332
        uint y=KISS>>17 ;
333
    #endif
334
#elif TYPE == TINT64
335
    #define THEONE 4611686018427387904
336
    #if TRNG == TCONG
337
        ulong x=(ulong)(CONG>>1) ;
338
        ulong y=(ulong)(CONG>>1) ;
339
    #elif TRNG == TSHR3
340
        ulong x=(ulong)(SHR3>>1) ;
341
        ulong y=(ulong)(SHR3>>1) ;
342
    #elif TRNG == TMWC
343
        ulong x=(ulong)(MWC>>1) ;
344
        ulong y=(ulong)(MWC>>1) ;
345
    #elif TRNG == TKISS
346
        ulong x=(ulong)(KISS>>1) ;
347
        ulong y=(ulong)(KISS>>1) ;
348
    #endif
349
#elif TYPE == TFP32
350
    #define THEONE 1.0f
351
    #if TRNG == TCONG
352
        float x=CONGfp ;
353
        float y=CONGfp ;
354
    #elif TRNG == TSHR3
355
        float x=SHR3fp ;
356
        float y=SHR3fp ;
357
    #elif TRNG == TMWC
358
        float x=MWCfp ;
359
        float y=MWCfp ;
360
    #elif TRNG == TKISS
361
      float x=KISSfp ;
362
      float y=KISSfp ;
363
    #endif
364
#elif TYPE == TFP64
365
#pragma OPENCL EXTENSION cl_khr_fp64: enable
366
    #define THEONE 1.0f
367
    #if TRNG == TCONG
368
        double x=(double)CONGfp ;
369
        double y=(double)CONGfp ;
370
    #elif TRNG == TSHR3
371
        double x=(double)SHR3fp ;
372
        double y=(double)SHR3fp ;
373
    #elif TRNG == TMWC
374
        double x=(double)MWCfp ;
375
        double y=(double)MWCfp ;
376
    #elif TRNG == TKISS
377
        double x=(double)KISSfp ;
378
        double y=(double)KISSfp ;
379
    #endif
380
#endif
381

382
#if TEST == IFTHEN
383
      if ((x*x+y*y) <= THEONE) {
384
         total+=1;
385
      }
386
#else
387
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
388
      total+=inside;
389
#endif
390
   }
391
   
392
   return(total);
393
}
394

395
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
396
{
397
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
398
   barrier(CLK_GLOBAL_MEM_FENCE);
399
   s[get_global_id(0)]=total;     
400
}
401

402
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
403
{
404
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
405
   barrier(CLK_LOCAL_MEM_FENCE);
406
   s[get_local_id(0)]=total;
407
}
408

409
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
410
{
411
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
412
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
413
   s[get_global_id(0)]=total;
414
}
415

416
"""
417
    return(KERNEL_CODE_OPENCL)
418
    
419
def MetropolisCuda(InputCU):
420

    
421
    print("Inside ",InputCU)
422
    
423
    iterations=InputCU['Iterations']
424
    steps=InputCU['Steps']
425
    blocks=InputCU['Blocks']
426
    threads=InputCU['Threads']
427
    Device=InputCU['Device']
428
    RNG=InputCU['RNG']
429
    ValueType=InputCU['ValueType']
430
    TestType=InputCU['IfThen']
431
    Seeds=InputCU['Seeds']
432
    
433
    Marsaglia,Computing,Test=DictionariesAPI()
434
    
435
    try:
436
        # For PyCUDA import
437
        import pycuda.driver as cuda
438
        from pycuda.compiler import SourceModule
439
        
440
        cuda.init()
441
        for Id in range(cuda.Device.count()):
442
            if Id==Device:
443
                XPU=cuda.Device(Id)
444
                print("GPU selected %s" % XPU.name())
445
        print
446

    
447
    except ImportError:
448
        print("Platform does not seem to support CUDA")
449

    
450
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)  
451
    circleCU = cuda.InOut(circle)
452
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
453
    #cuda.memcpy_htod(circleCU, circle)
454

    
455
    Context=XPU.make_context()
456

    
457
    try:
458
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
459
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
460
        # Needed to set the compiler via ccbin for CUDA9 implementation
461
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
462
    except:
463
        print("Compilation seems to break")
464
 
465
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
466
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
467
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")
468

    
469
    MyDuration=numpy.zeros(steps)
470

    
471
    jobs=blocks*threads;
472

    
473
    iterationsCU=numpy.uint64(iterations/jobs)
474
    if iterations%jobs!=0:
475
        iterationsCU+=numpy.uint64(1)
476

    
477
    for i in range(steps):
478
        start_time=time.time()
479

    
480
        try:
481
            MetropolisHybridCU(circleCU,
482
                               numpy.uint64(iterationsCU),
483
                               numpy.uint32(Seeds[0]),
484
                               numpy.uint32(Seeds[1]),
485
                               grid=(blocks,1),block=(threads,1,1))
486
        except:
487
            print("Crash during CUDA call")
488

    
489
        elapsed = time.time()-start_time
490
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
491

    
492
        MyDuration[i]=elapsed
493

    
494
    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
495
    print(OutputCU)
496
    Context.pop()
497
    
498
    Context.detach()
499

    
500
    return(OutputCU)
501

    
502
def MetropolisOpenCL(InputCL):
503

    
504
    import pyopencl as cl
505

    
506
    print("Inside ",InputCL)
507
    
508
    iterations=InputCL['Iterations']
509
    steps=InputCL['Steps']
510
    blocks=InputCL['Blocks']
511
    threads=InputCL['Threads']
512
    Device=InputCL['Device']
513
    RNG=InputCL['RNG']  
514
    ValueType=InputCL['ValueType']
515
    TestType=InputCL['IfThen']
516
    Seeds=InputCL['Seeds']
517

    
518
    Marsaglia,Computing,Test=DictionariesAPI()
519

    
520
    # Initialisation des variables en les CASTant correctement
521
    Id=0
522
    HasXPU=False
523
    for platform in cl.get_platforms():
524
        for device in platform.get_devices():
525
            if Id==Device:
526
                XPU=device
527
                print("CPU/GPU selected: ",device.name.lstrip())
528
                HasXPU=True
529
            Id+=1
530
            print(Id)
531

    
532
    if HasXPU==False:
533
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
534
        sys.exit()           
535

    
536
    print(XPU)
537
    # Je cree le contexte et la queue pour son execution
538
    try:
539
        ctx = cl.Context(devices=[XPU])
540
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
541
    except:
542
        print("Crash during context creation")
543

    
544
    # Je recupere les flag possibles pour les buffers
545
    mf = cl.mem_flags
546
        
547
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
548
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
549
    
550
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))
551

    
552
    MyDuration=numpy.zeros(steps)
553
  
554
    jobs=blocks*threads;
555

    
556
    iterationsCL=numpy.uint64(iterations/jobs)
557
    if iterations%jobs!=0:
558
        iterationsCL+=1
559

    
560
    for i in range(steps):
561
        start_time=time.time()
562
        if threads == 1:
563
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
564
                                                 circleCL,
565
                                                 numpy.uint64(iterationsCL),
566
                                                 numpy.uint32(Seeds[0]),
567
                                                 numpy.uint32(Seeds[1]))
568
        else:
569
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
570
                                                 circleCL,
571
                                                 numpy.uint64(iterationsCL),
572
                                                 numpy.uint32(Seeds[0]),
573
                                                 numpy.uint32(Seeds[1]))
574

    
575
        CLLaunch.wait()
576
        cl.enqueue_copy(queue, circle, circleCL).wait()
577

    
578
        elapsed = time.time()-start_time
579
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
580
            
581
        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
582
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
583

    
584
        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
585
        MyDuration[i]=elapsed
586
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
587
        # MyPi[i]=numpy.median(AllPi)
588
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
589

    
590
    circleCL.release()
591

    
592
    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
593
    print(OutputCL)
594
    return(OutputCL)
595

    
596

    
597
def FitAndPrint(N,D,Curves):
598

    
599
    from scipy.optimize import curve_fit
600
    import matplotlib.pyplot as plt
601

    
602
    try:
603
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
604

    
605
        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
606
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
607
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
608
        coeffs_Amdahl[0]=D[0]
609
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
610
    except:
611
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))
612

    
613
    try:
614
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
615

    
616
        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
617
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
618
        coeffs_AmdahlR[0]=D[0]
619
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
620

    
621
    except:
622
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
623

    
624
    try:
625
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
626

    
627
        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
628
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
629
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
630
        coeffs_Mylq[0]=D[0]
631
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
632
                                                                coeffs_Mylq[1],
633
                                                                coeffs_Mylq[3],
634
                                                                coeffs_Mylq[2]))
635
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
636
                    coeffs_Mylq[3])
637
    except:
638
        print("Impossible to fit for Mylq law : only %i elements" % len(D))
639

    
640
    try:
641
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
642

    
643
        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
644
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
645
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
646
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
647
        coeffs_Mylq2[0]=D[0]
648
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
649

    
650
    except:
651
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
652

    
653
    if Curves:
654
        plt.xlabel("Number of Threads/work Items")
655
        plt.ylabel("Total Elapsed Time")
656

    
657
        Experience,=plt.plot(N,D,'ro') 
658
    try:
659
        pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
660
        pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
661
    except:
662
        print("Fit curves seem not to be available")
663

    
664
    plt.legend()
665
    plt.show()
666

    
667
if __name__=='__main__':
668
    
669
    # Set defaults values
670
  
671
    # Id of Device : 1 is for first find !
672
    Device=1
673
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
674
    GpuStyle='OpenCL'
675
    # Iterations is integer
676
    Iterations=10000000
677
    # BlocksBlocks in first number of Blocks to explore
678
    BlocksBegin=1
679
    # BlocksEnd is last number of Blocks to explore
680
    BlocksEnd=1
681
    # BlocksStep is the step of Blocks to explore
682
    BlocksStep=1
683
    # ThreadsBlocks in first number of Blocks to explore
684
    ThreadsBegin=1
685
    # ThreadsEnd is last number of Blocks to explore
686
    ThreadsEnd=1
687
    # ThreadsStep is the step of Blocks to explore
688
    ThreadsStep=1
689
    # Redo is the times to redo the test to improve metrology
690
    Redo=1
691
    # OutMetrology is method for duration estimation : False is GPU inside
692
    OutMetrology=False
693
    Metrology='InMetro'
694
    # Curves is True to print the curves
695
    Curves=False
696
    # Fit is True to print the curves
697
    Fit=False
698
    # Inside based on If
699
    IfThen=False
700
    # Marsaglia RNG
701
    RNG='MWC'
702
    # Value type : INT32, INT64, FP32, FP64
703
    ValueType='FP32'
704
    # Seeds for RNG
705
    Seeds=110271,101008
706

    
707
    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
708
    
709
    try:
710
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
711
    except getopt.GetoptError:
712
        print(HowToUse % sys.argv[0])
713
        sys.exit(2)
714

    
715
    # List of Devices
716
    Devices=[]
717
    Alu={}
718
        
719
    for opt, arg in opts:
720
        if opt == '-h':
721
            print(HowToUse % sys.argv[0])
722

    
723
            print("\nInformations about devices detected under OpenCL API:")
724
            # For PyOpenCL import
725
            try:
726
                import pyopencl as cl
727
                Id=0
728
                for platform in cl.get_platforms():
729
                    for device in platform.get_devices():
730
                        #deviceType=cl.device_type.to_string(device.type)
731
                        deviceType="xPU"
732
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
733
                        Id=Id+1
734

    
735
            except:
736
                print("Your platform does not seem to support OpenCL")
737

    
738
            print("\nInformations about devices detected under CUDA API:")
739
            # For PyCUDA import
740
            try:
741
                import pycuda.driver as cuda
742
                cuda.init()
743
                for Id in range(cuda.Device.count()):
744
                    device=cuda.Device(Id)
745
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
746
                print
747
            except:
748
                print("Your platform does not seem to support CUDA")
749
        
750
            sys.exit()
751
        
752
        
753
        elif opt == '-o':
754
            OutMetrology=True
755
            Metrology='OutMetro'
756
        elif opt == '-c':
757
            Curves=True
758
        elif opt == '-k':
759
            IfThen=True
760
        elif opt in ("-d", "--device"):
761
            Devices.append(int(arg))
762
        elif opt in ("-g", "--gpustyle"):
763
            GpuStyle = arg
764
        elif opt in ("-m", "--marsaglia"):
765
            RNG = arg
766
        elif opt in ("-v", "--valuetype"):
767
            ValueType = arg
768
        elif opt in ("-i", "--iterations"):
769
            Iterations = numpy.uint64(arg)
770
        elif opt in ("-b", "--blocksbegin"):
771
            BlocksBegin = int(arg)
772
            BlocksEnd= BlocksBegin
773
        elif opt in ("-e", "--blocksend"):
774
            BlocksEnd = int(arg)
775
        elif opt in ("-s", "--blocksstep"):
776
            BlocksStep = int(arg)
777
        elif opt in ("-f", "--threadsfirst"):
778
            ThreadsBegin = int(arg)
779
            ThreadsEnd = ThreadsBegin
780
        elif opt in ("-l", "--threadslast"):
781
            ThreadsEnd = int(arg)
782
        elif opt in ("-t", "--threadsstep"):
783
            ThreadsStep = int(arg)
784
        elif opt in ("-r", "--redo"):
785
            Redo = int(arg)
786

    
787
    print("Devices Identification : %s" % Devices)
788
    print("GpuStyle used : %s" % GpuStyle)
789
    print("Iterations : %s" % Iterations)
790
    print("Number of Blocks on begin : %s" % BlocksBegin)
791
    print("Number of Blocks on end : %s" % BlocksEnd)
792
    print("Step on Blocks : %s" % BlocksStep)
793
    print("Number of Threads on begin : %s" % ThreadsBegin)
794
    print("Number of Threads on end : %s" % ThreadsEnd)
795
    print("Step on Threads : %s" % ThreadsStep)
796
    print("Number of redo : %s" % Redo)
797
    print("Metrology done out of XPU : %r" % OutMetrology)
798
    print("Type of Marsaglia RNG used : %s" % RNG)
799
    print("Type of variable : %s" % ValueType)
800

    
801
    if GpuStyle=='CUDA':
802
        try:
803
            # For PyCUDA import
804
            import pycuda.driver as cuda
805
            
806
            cuda.init()
807
            for Id in range(cuda.Device.count()):
808
                device=cuda.Device(Id)
809
                print("Device #%i of type GPU : %s" % (Id,device.name()))
810
                if Id in Devices:
811
                    Alu[Id]='GPU'
812
            
813
        except ImportError:
814
            print("Platform does not seem to support CUDA")
815

    
816
    if GpuStyle=='OpenCL':
817
        try:
818
            # For PyOpenCL import
819
            import pyopencl as cl
820
            Id=0
821
            for platform in cl.get_platforms():
822
                for device in platform.get_devices():
823
                    #deviceType=cl.device_type.to_string(device.type)
824
                    deviceType="xPU"
825
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
826

    
827
                    if Id in Devices:
828
                    # Set the Alu as detected Device Type
829
                        Alu[Id]=deviceType
830
                    Id=Id+1
831
        except ImportError:
832
            print("Platform does not seem to support OpenCL")
833

    
834
    print(Devices,Alu)
835
            
836
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
837
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
838
    
839
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
840
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
841
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
842
    avgD=numpy.array([]).astype(numpy.float32)
843
    medD=numpy.array([]).astype(numpy.float32)
844
    stdD=numpy.array([]).astype(numpy.float32)
845
    minD=numpy.array([]).astype(numpy.float32)
846
    maxD=numpy.array([]).astype(numpy.float32)
847
    avgR=numpy.array([]).astype(numpy.float32)
848
    medR=numpy.array([]).astype(numpy.float32)
849
    stdR=numpy.array([]).astype(numpy.float32)
850
    minR=numpy.array([]).astype(numpy.float32)
851
    maxR=numpy.array([]).astype(numpy.float32)
852

    
853
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
854
        
855
        # print Blocks,Threads
856
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
857
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
858
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
859
        ExploredThreads=numpy.append(ExploredThreads,Threads)
860
        
861
        if OutMetrology: 
862
            DurationItem=numpy.array([]).astype(numpy.float32)
863
            Duration=numpy.array([]).astype(numpy.float32)
864
            Rate=numpy.array([]).astype(numpy.float32)
865
            for i in range(Redo):
866
                start=time.time()
867
                if GpuStyle=='CUDA':
868
                    try:
869
                        InputCU={}
870
                        InputCU['Iterations']=Iterations
871
                        InputCU['Steps']=1
872
                        InputCU['Blocks']=Blocks
873
                        InputCU['Threads']=Threads
874
                        InputCU['Device']=Devices[0]
875
                        InputCU['RNG']=RNG
876
                        InputCU['Seeds']=Seeds
877
                        InputCU['ValueType']=ValueType                    
878
                        InputCU['IfThen']=IfThen                    
879
                        OutputCU=MetropolisCuda(InputCU)
880
                        Inside=OutputCU['Circle']
881
                        NewIterations=OutputCU['NewIterations']
882
                        Duration=OutputCU['Duration']
883
                    except:
884
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
885
                elif GpuStyle=='OpenCL':
886
                    try:
887
                        InputCL={}
888
                        InputCL['Iterations']=Iterations
889
                        InputCL['Steps']=1
890
                        InputCL['Blocks']=Blocks
891
                        InputCL['Threads']=Threads
892
                        InputCL['Device']=Devices[0]
893
                        InputCL['RNG']=RNG
894
                        InputCL['Seeds']=Seeds
895
                        InputCL['ValueType']=ValueType
896
                        InputCL['IfThen']=IfThen                    
897
                        OutputCL=MetropolisOpenCL(InputCL)
898
                        Inside=OutputCL['Circle']
899
                        NewIterations=OutputCL['NewIterations']
900
                        Duration=OutputCL['Duration']
901
                    except:
902
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
903
                Duration=numpy.append(Duration,time.time()-start)
904
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
905
        else:
906
            if GpuStyle=='CUDA':
907
                try:
908
                    InputCU={}
909
                    InputCU['Iterations']=Iterations
910
                    InputCU['Steps']=Redo
911
                    InputCU['Blocks']=Blocks
912
                    InputCU['Threads']=Threads
913
                    InputCU['Device']=Devices[0]
914
                    InputCU['RNG']=RNG
915
                    InputCU['Seeds']=Seeds
916
                    InputCU['ValueType']=ValueType
917
                    InputCU['IfThen']=IfThen                    
918
                    OutputCU=MetropolisCuda(InputCU)
919
                    Inside=OutputCU['Inside']
920
                    NewIterations=OutputCU['NewIterations']
921
                    Duration=OutputCU['Duration']
922
                    pycuda.context.pop()
923
                except:
924
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
925
            elif GpuStyle=='OpenCL':
926
                try:
927
                    InputCL={}
928
                    InputCL['Iterations']=Iterations
929
                    InputCL['Steps']=Redo
930
                    InputCL['Blocks']=Blocks
931
                    InputCL['Threads']=Threads
932
                    InputCL['Device']=Devices[0]
933
                    InputCL['RNG']=RNG
934
                    InputCL['Seeds']=Seeds
935
                    InputCL['ValueType']=ValueType
936
                    InputCL['IfThen']=IfThen
937
                    OutputCL=MetropolisOpenCL(InputCL)
938
                    Inside=OutputCL['Inside']
939
                    NewIterations=OutputCL['NewIterations']
940
                    Duration=OutputCL['Duration']
941
                except:
942
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
943
            Rate=NewIterations/Duration
944
            print("Pi estimation %.8f" % (4./NewIterations*Inside))
945

    
946
        avgD=numpy.append(avgD,numpy.average(Duration))
947
        medD=numpy.append(medD,numpy.median(Duration))
948
        stdD=numpy.append(stdD,numpy.std(Duration))
949
        minD=numpy.append(minD,numpy.min(Duration))
950
        maxD=numpy.append(maxD,numpy.max(Duration))
951
        avgR=numpy.append(avgR,numpy.average(Rate))
952
        medR=numpy.append(medR,numpy.median(Rate))
953
        stdR=numpy.append(stdR,numpy.std(Rate))
954
        minR=numpy.append(minR,numpy.min(Rate))
955
        maxR=numpy.append(maxR,numpy.max(Rate))
956

    
957
        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
958
        
959
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
960
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
961
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
962

    
963
    if Fit:
964
        FitAndPrint(ExploredJobs,median,Curves)