Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 194

Historique | Voir | Annoter | Télécharger (31,04 ko)

1
#!/usr/bin/env python3
2

    
3
#
4
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> 
7
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8
#
9
# Thanks to Andreas Klockner for PyCUDA:
10
# http://mathema.tician.de/software/pycuda
11
# Thanks to Andreas Klockner for PyOpenCL:
12
# http://mathema.tician.de/software/pyopencl
13
# 
14

    
15
# 2013-01-01 : problems with launch timeout
16
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17
# Option "Interactive" "0" in /etc/X11/xorg.conf
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import itertools
26
from socket import gethostname
27

    
28
class PenStacle:
29
    """Pentacle of Statistics from data"""
30
    Avg=0
31
    Med=0
32
    Std=0
33
    Min=0
34
    Max=0
35
    def __init__(self,Data):
36
        self.Avg=numpy.average(Data)
37
        self.Med=numpy.median(Data)
38
        self.Std=numpy.std(Data)
39
        self.Max=numpy.max(Data)
40
        self.Min=numpy.min(Data)
41
    def display(self):
42
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
43

    
44
class Experience:
45
    """Metrology for experiences"""
46
    DeviceStyle=''
47
    DeviceId=0
48
    AvgD=0
49
    MedD=0
50
    StdD=0
51
    MinD=0
52
    MaxD=0
53
    AvgR=0
54
    MedR=0
55
    StdR=0
56
    MinR=0
57
    MaxR=0
58
    def __init__(self,DeviceStyle,DeviceId,Iterations):
59
        self.DeviceStyle=DeviceStyle
60
        self.DeviceId=DeviceId
61
        self.Iterations
62
        
63
    def Metrology(self,Data):
64
        Duration=PenStacle(Data)
65
        Rate=PenStacle(Iterations/Data)
66
        print("Duration %s" % Duration)
67
        print("Rate %s" % Rate)
68

    
69

    
70
        
71
def DictionariesAPI():
72
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
73
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
74
    Test={True:1,False:0}
75
    return(Marsaglia,Computing,Test)
76
    
77
# find prime factors of a number
78
# Get for WWW :
79
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
80
def PrimeFactors(x):
81

    
82
    factorlist=numpy.array([]).astype('uint32')
83
    loop=2
84
    while loop<=x:
85
        if x%loop==0:
86
            x/=loop
87
            factorlist=numpy.append(factorlist,[loop])
88
        else:
89
            loop+=1
90
    return factorlist
91

    
92
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
93
# output is thread number
94
def BestThreadsNumber(jobs):
95
    factors=PrimeFactors(jobs)
96
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
97
    threads=1
98
    for factor in matrix.transpose().ravel():
99
        threads=threads*factor
100
        if threads*threads>jobs or threads>512:
101
            break
102
    return(long(threads))
103

    
104
# Predicted Amdahl Law (Reduced with s=1-p)  
105
def AmdahlR(N, T1, p):
106
    return (T1*(1-p+p/N))
107

    
108
# Predicted Amdahl Law
109
def Amdahl(N, T1, s, p):
110
    return (T1*(s+p/N))
111

    
112
# Predicted Mylq Law with first order
113
def Mylq(N, T1,s,c,p):
114
    return (T1*(s+p/N)+c*N)
115

    
116
# Predicted Mylq Law with second order
117
def Mylq2(N, T1,s,c1,c2,p):
118
    return (T1*(s+p/N)+c1*N+c2*N*N)
119

    
120
def KernelCodeCuda():
121
    KERNEL_CODE_CUDA="""
122
#define TCONG 0
123
#define TSHR3 1
124
#define TMWC 2
125
#define TKISS 3
126

127
#define TINT32 0
128
#define TINT64 1
129
#define TFP32 2
130
#define TFP64 3
131

132
#define IFTHEN 1
133

134
// Marsaglia RNG very simple implementation
135

136
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
137
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
138
#define MWC   (znew+wnew)
139
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
140
#define CONG  (jcong=69069*jcong+1234567)
141
#define KISS  ((MWC^CONG)+SHR3)
142

143
#define MWCfp MWC * 2.328306435454494e-10f
144
#define KISSfp KISS * 2.328306435454494e-10f
145
#define SHR3fp SHR3 * 2.328306435454494e-10f
146
#define CONGfp CONG * 2.328306435454494e-10f
147

148
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
149
{
150

151
#if TRNG == TCONG
152
   uint jcong=seed_z+work;
153
#elif TRNG == TSHR3
154
   uint jsr=seed_w+work;
155
#elif TRNG == TMWC
156
   uint z=seed_z+work;
157
   uint w=seed_w+work;
158
#elif TRNG == TKISS
159
   uint jcong=seed_z+work;
160
   uint jsr=seed_w+work;
161
   uint z=seed_z-work;
162
   uint w=seed_w-work;
163
#endif
164

165
   ulong total=0;
166

167
   for (ulong i=0;i<iterations;i++) {
168

169
#if TYPE == TINT32
170
    #define THEONE 1073741824
171
    #if TRNG == TCONG
172
        uint x=CONG>>17 ;
173
        uint y=CONG>>17 ;
174
    #elif TRNG == TSHR3
175
        uint x=SHR3>>17 ;
176
        uint y=SHR3>>17 ;
177
    #elif TRNG == TMWC
178
        uint x=MWC>>17 ;
179
        uint y=MWC>>17 ;
180
    #elif TRNG == TKISS
181
        uint x=KISS>>17 ;
182
        uint y=KISS>>17 ;
183
    #endif
184
#elif TYPE == TINT64
185
    #define THEONE 4611686018427387904
186
    #if TRNG == TCONG
187
        ulong x=(ulong)(CONG>>1) ;
188
        ulong y=(ulong)(CONG>>1) ;
189
    #elif TRNG == TSHR3
190
        ulong x=(ulong)(SHR3>>1) ;
191
        ulong y=(ulong)(SHR3>>1) ;
192
    #elif TRNG == TMWC
193
        ulong x=(ulong)(MWC>>1) ;
194
        ulong y=(ulong)(MWC>>1) ;
195
    #elif TRNG == TKISS
196
        ulong x=(ulong)(KISS>>1) ;
197
        ulong y=(ulong)(KISS>>1) ;
198
    #endif
199
#elif TYPE == TFP32
200
    #define THEONE 1.0f
201
    #if TRNG == TCONG
202
        float x=CONGfp ;
203
        float y=CONGfp ;
204
    #elif TRNG == TSHR3
205
        float x=SHR3fp ;
206
        float y=SHR3fp ;
207
    #elif TRNG == TMWC
208
        float x=MWCfp ;
209
        float y=MWCfp ;
210
    #elif TRNG == TKISS
211
      float x=KISSfp ;
212
      float y=KISSfp ;
213
    #endif
214
#elif TYPE == TFP64
215
    #define THEONE 1.0f
216
    #if TRNG == TCONG
217
        double x=(double)CONGfp ;
218
        double y=(double)CONGfp ;
219
    #elif TRNG == TSHR3
220
        double x=(double)SHR3fp ;
221
        double y=(double)SHR3fp ;
222
    #elif TRNG == TMWC
223
        double x=(double)MWCfp ;
224
        double y=(double)MWCfp ;
225
    #elif TRNG == TKISS
226
        double x=(double)KISSfp ;
227
        double y=(double)KISSfp ;
228
    #endif
229
#endif
230

231
#if TEST == IFTHEN
232
      if ((x*x+y*y) <=THEONE) {
233
         total+=1;
234
      }
235
#else
236
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
237
      total+=inside;
238
#endif
239
   }
240

241
   return(total);
242
}
243

244
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
245
{
246
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
247
   s[blockIdx.x]=total;
248
   __syncthreads();
249

250
}
251

252
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
253
{
254
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
255
   s[threadIdx.x]=total;
256
   __syncthreads();
257

258
}
259

260
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
261
{
262
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
263
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
264
   __syncthreads();
265
}
266

267
"""
268
    return(KERNEL_CODE_CUDA)
269

    
270
def KernelCodeOpenCL():
271
    KERNEL_CODE_OPENCL="""
272
#define TCONG 0
273
#define TSHR3 1
274
#define TMWC 2
275
#define TKISS 3
276

277
#define TINT32 0
278
#define TINT64 1
279
#define TFP32 2
280
#define TFP64 3
281

282
#define IFTHEN 1
283

284
// Marsaglia RNG very simple implementation
285
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
286
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
287

288
#define MWC   (znew+wnew)
289
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
290
#define CONG  (jcong=69069*jcong+1234567)
291
#define KISS  ((MWC^CONG)+SHR3)
292

293
#define MWCfp MWC * 2.328306435454494e-10f
294
#define KISSfp KISS * 2.328306435454494e-10f
295
#define CONGfp CONG * 2.328306435454494e-10f
296
#define SHR3fp SHR3 * 2.328306435454494e-10f
297

298
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
299
{
300

301
#if TRNG == TCONG
302
   uint jcong=seed_z+work;
303
#elif TRNG == TSHR3
304
   uint jsr=seed_w+work;
305
#elif TRNG == TMWC
306
   uint z=seed_z+work;
307
   uint w=seed_w+work;
308
#elif TRNG == TKISS
309
   uint jcong=seed_z+work;
310
   uint jsr=seed_w+work;
311
   uint z=seed_z-work;
312
   uint w=seed_w-work;
313
#endif
314

315
   ulong total=0;
316

317
   for (ulong i=0;i<iterations;i++) {
318

319
#if TYPE == TINT32
320
    #define THEONE 1073741824
321
    #if TRNG == TCONG
322
        uint x=CONG>>17 ;
323
        uint y=CONG>>17 ;
324
    #elif TRNG == TSHR3
325
        uint x=SHR3>>17 ;
326
        uint y=SHR3>>17 ;
327
    #elif TRNG == TMWC
328
        uint x=MWC>>17 ;
329
        uint y=MWC>>17 ;
330
    #elif TRNG == TKISS
331
        uint x=KISS>>17 ;
332
        uint y=KISS>>17 ;
333
    #endif
334
#elif TYPE == TINT64
335
    #define THEONE 4611686018427387904
336
    #if TRNG == TCONG
337
        ulong x=(ulong)(CONG>>1) ;
338
        ulong y=(ulong)(CONG>>1) ;
339
    #elif TRNG == TSHR3
340
        ulong x=(ulong)(SHR3>>1) ;
341
        ulong y=(ulong)(SHR3>>1) ;
342
    #elif TRNG == TMWC
343
        ulong x=(ulong)(MWC>>1) ;
344
        ulong y=(ulong)(MWC>>1) ;
345
    #elif TRNG == TKISS
346
        ulong x=(ulong)(KISS>>1) ;
347
        ulong y=(ulong)(KISS>>1) ;
348
    #endif
349
#elif TYPE == TFP32
350
    #define THEONE 1.0f
351
    #if TRNG == TCONG
352
        float x=CONGfp ;
353
        float y=CONGfp ;
354
    #elif TRNG == TSHR3
355
        float x=SHR3fp ;
356
        float y=SHR3fp ;
357
    #elif TRNG == TMWC
358
        float x=MWCfp ;
359
        float y=MWCfp ;
360
    #elif TRNG == TKISS
361
      float x=KISSfp ;
362
      float y=KISSfp ;
363
    #endif
364
#elif TYPE == TFP64
365
#pragma OPENCL EXTENSION cl_khr_fp64: enable
366
    #define THEONE 1.0f
367
    #if TRNG == TCONG
368
        double x=(double)CONGfp ;
369
        double y=(double)CONGfp ;
370
    #elif TRNG == TSHR3
371
        double x=(double)SHR3fp ;
372
        double y=(double)SHR3fp ;
373
    #elif TRNG == TMWC
374
        double x=(double)MWCfp ;
375
        double y=(double)MWCfp ;
376
    #elif TRNG == TKISS
377
        double x=(double)KISSfp ;
378
        double y=(double)KISSfp ;
379
    #endif
380
#endif
381

382
#if TEST == IFTHEN
383
      if ((x*x+y*y) <= THEONE) {
384
         total+=1;
385
      }
386
#else
387
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
388
      total+=inside;
389
#endif
390
   }
391
   
392
   return(total);
393
}
394

395
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
396
{
397
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
398
   barrier(CLK_GLOBAL_MEM_FENCE);
399
   s[get_global_id(0)]=total;     
400
}
401

402
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
403
{
404
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
405
   barrier(CLK_LOCAL_MEM_FENCE);
406
   s[get_local_id(0)]=total;
407
}
408

409
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
410
{
411
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
412
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
413
   s[get_global_id(0)]=total;
414
}
415

416
"""
417
    return(KERNEL_CODE_OPENCL)
418
    
419
def MetropolisCuda(InputCU):
420

    
421
    print("Inside ",InputCU)
422
    
423
    iterations=InputCU['Iterations']
424
    steps=InputCU['Steps']
425
    blocks=InputCU['Blocks']
426
    threads=InputCU['Threads']
427
    Device=InputCU['Device']
428
    RNG=InputCU['RNG']
429
    ValueType=InputCU['ValueType']
430
    TestType=InputCU['IfThen']
431
    
432
    Marsaglia,Computing,Test=DictionariesAPI()
433
    
434
    try:
435
        # For PyCUDA import
436
        import pycuda.driver as cuda
437
        from pycuda.compiler import SourceModule
438
        
439
        cuda.init()
440
        for Id in range(cuda.Device.count()):
441
            if Id==Device:
442
                XPU=cuda.Device(Id)
443
                print("GPU selected %s" % XPU.name())
444
        print
445

    
446
    except ImportError:
447
        print("Platform does not seem to support CUDA")
448

    
449
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)  
450
    circleCU = cuda.InOut(circle)
451
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
452
    #cuda.memcpy_htod(circleCU, circle)
453

    
454
    Context=XPU.make_context()
455

    
456
    try:
457
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
458
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
459
        # Needed to set the compiler via ccbin for CUDA9 implementation
460
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
461
    except:
462
        print("Compilation seems to break")
463
 
464
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
465
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
466
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")
467

    
468
    MyDuration=numpy.zeros(steps)
469

    
470
    jobs=blocks*threads;
471

    
472
    iterationsCU=numpy.uint64(iterations/jobs)
473
    if iterations%jobs!=0:
474
        iterationsCU+=numpy.uint64(1)
475

    
476
    for i in range(steps):
477
        start_time=time.time()
478

    
479
        try:
480
            MetropolisHybridCU(circleCU,
481
                               numpy.uint64(iterationsCU),
482
                               numpy.uint32(nprnd(2**32)),
483
                               numpy.uint32(nprnd(2**32)),
484
                               grid=(blocks,1),block=(threads,1,1))
485
        except:
486
            print("Crash during CUDA call")
487

    
488
        elapsed = time.time()-start_time
489
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
490

    
491
        MyDuration[i]=elapsed
492

    
493
    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
494
    print(OutputCU)
495
    Context.pop()
496
    
497
    Context.detach()
498

    
499
    return(OutputCU)
500

    
501
def MetropolisOpenCL(InputCL):
502

    
503
    import pyopencl as cl
504

    
505
    print("Inside ",InputCL)
506
    
507
    iterations=InputCL['Iterations']
508
    steps=InputCL['Steps']
509
    blocks=InputCL['Blocks']
510
    threads=InputCL['Threads']
511
    Device=InputCL['Device']
512
    RNG=InputCL['RNG']  
513
    ValueType=InputCL['ValueType']
514
    TestType=InputCL['IfThen']
515

    
516
    Marsaglia,Computing,Test=DictionariesAPI()
517
    
518
    # Initialisation des variables en les CASTant correctement
519
    Id=0
520
    HasXPU=False
521
    for platform in cl.get_platforms():
522
        for device in platform.get_devices():
523
            if Id==Device:
524
                XPU=device
525
                print("CPU/GPU selected: ",device.name.lstrip())
526
                HasXPU=True
527
            Id+=1
528

    
529
    if HasXPU==False:
530
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
531
        sys.exit()              
532

    
533
        
534
    # Je cree le contexte et la queue pour son execution
535
    try:
536
        ctx = cl.Context(devices=[XPU])
537
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
538
    except:
539
        print("Crash during context creation")
540

    
541
    # Je recupere les flag possibles pour les buffers
542
    mf = cl.mem_flags
543
        
544
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
545
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
546
    
547
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))
548

    
549
    MyDuration=numpy.zeros(steps)
550
  
551
    jobs=blocks*threads;
552

    
553
    iterationsCL=numpy.uint64(iterations/jobs)
554
    if iterations%jobs!=0:
555
        iterationsCL+=1
556

    
557
    for i in range(steps):
558
        start_time=time.time()
559
        if threads == 1:
560
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
561
                                                 circleCL,
562
                                                 numpy.uint64(iterationsCL),
563
                                                 numpy.uint32(nprnd(2**32)),
564
                                                 numpy.uint32(nprnd(2**32)))
565
        else:
566
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
567
                                                 circleCL,
568
                                                 numpy.uint64(iterationsCL),
569
                                                 numpy.uint32(nprnd(2**32)),
570
                                                 numpy.uint32(nprnd(2**32)))
571

    
572
        CLLaunch.wait()
573
        cl.enqueue_copy(queue, circle, circleCL).wait()
574

    
575
        elapsed = time.time()-start_time
576
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
577
            
578
        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
579
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
580

    
581
        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
582
        MyDuration[i]=elapsed
583
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
584
        # MyPi[i]=numpy.median(AllPi)
585
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
586

    
587
    circleCL.release()
588

    
589
    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
590
    print(OutputCL)
591
    return(OutputCL)
592

    
593

    
594
def FitAndPrint(N,D,Curves):
595

    
596
    from scipy.optimize import curve_fit
597
    import matplotlib.pyplot as plt
598

    
599
    try:
600
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
601

    
602
        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
603
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
604
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
605
        coeffs_Amdahl[0]=D[0]
606
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
607
    except:
608
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))
609

    
610
    try:
611
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
612

    
613
        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
614
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
615
        coeffs_AmdahlR[0]=D[0]
616
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
617

    
618
    except:
619
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
620

    
621
    try:
622
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
623

    
624
        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
625
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
626
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
627
        coeffs_Mylq[0]=D[0]
628
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
629
                                                                coeffs_Mylq[1],
630
                                                                coeffs_Mylq[3],
631
                                                                coeffs_Mylq[2]))
632
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
633
                    coeffs_Mylq[3])
634
    except:
635
        print("Impossible to fit for Mylq law : only %i elements" % len(D))
636

    
637
    try:
638
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
639

    
640
        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
641
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
642
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
643
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
644
        coeffs_Mylq2[0]=D[0]
645
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
646

    
647
    except:
648
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
649

    
650
    if Curves:
651
        plt.xlabel("Number of Threads/work Items")
652
        plt.ylabel("Total Elapsed Time")
653

    
654
        Experience,=plt.plot(N,D,'ro') 
655
    try:
656
        pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
657
        pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
658
    except:
659
        print("Fit curves seem not to be available")
660

    
661
    plt.legend()
662
    plt.show()
663

    
664
if __name__=='__main__':
665
    
666
    # Set defaults values
667
  
668
    # Id of Device : 1 is for first find !
669
    Device=1
670
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
671
    GpuStyle='OpenCL'
672
    # Iterations is integer
673
    Iterations=10000000
674
    # BlocksBlocks in first number of Blocks to explore
675
    BlocksBegin=1
676
    # BlocksEnd is last number of Blocks to explore
677
    BlocksEnd=1
678
    # BlocksStep is the step of Blocks to explore
679
    BlocksStep=1
680
    # ThreadsBlocks in first number of Blocks to explore
681
    ThreadsBegin=1
682
    # ThreadsEnd is last number of Blocks to explore
683
    ThreadsEnd=1
684
    # ThreadsStep is the step of Blocks to explore
685
    ThreadsStep=1
686
    # Redo is the times to redo the test to improve metrology
687
    Redo=1
688
    # OutMetrology is method for duration estimation : False is GPU inside
689
    OutMetrology=False
690
    Metrology='InMetro'
691
    # Curves is True to print the curves
692
    Curves=False
693
    # Fit is True to print the curves
694
    Fit=False
695
    # Inside based on If
696
    IfThen=False
697
    # Marsaglia RNG
698
    RNG='MWC'
699
    # Value type : INT32, INT64, FP32, FP64
700
    ValueType='FP32'
701

    
702
    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
703
    
704
    try:
705
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
706
    except getopt.GetoptError:
707
        print(HowToUse % sys.argv[0])
708
        sys.exit(2)
709

    
710
    # List of Devices
711
    Devices=[]
712
    Alu={}
713
        
714
    for opt, arg in opts:
715
        if opt == '-h':
716
            print(HowToUse % sys.argv[0])
717

    
718
            print("\nInformations about devices detected under OpenCL API:")
719
            # For PyOpenCL import
720
            try:
721
                import pyopencl as cl
722
                Id=0
723
                for platform in cl.get_platforms():
724
                    for device in platform.get_devices():
725
                        #deviceType=cl.device_type.to_string(device.type)
726
                        deviceType="xPU"
727
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
728
                        Id=Id+1
729

    
730
            except:
731
                print("Your platform does not seem to support OpenCL")
732

    
733
            print("\nInformations about devices detected under CUDA API:")
734
            # For PyCUDA import
735
            try:
736
                import pycuda.driver as cuda
737
                cuda.init()
738
                for Id in range(cuda.Device.count()):
739
                    device=cuda.Device(Id)
740
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
741
                print
742
            except:
743
                print("Your platform does not seem to support CUDA")
744
        
745
            sys.exit()
746
        
747
        
748
        elif opt == '-o':
749
            OutMetrology=True
750
            Metrology='OutMetro'
751
        elif opt == '-c':
752
            Curves=True
753
        elif opt == '-k':
754
            IfThen=True
755
        elif opt in ("-d", "--device"):
756
            Devices.append(int(arg))
757
        elif opt in ("-g", "--gpustyle"):
758
            GpuStyle = arg
759
        elif opt in ("-m", "--marsaglia"):
760
            RNG = arg
761
        elif opt in ("-v", "--valuetype"):
762
            ValueType = arg
763
        elif opt in ("-i", "--iterations"):
764
            Iterations = numpy.uint64(arg)
765
        elif opt in ("-b", "--blocksbegin"):
766
            BlocksBegin = int(arg)
767
            BlocksEnd= BlocksBegin
768
        elif opt in ("-e", "--blocksend"):
769
            BlocksEnd = int(arg)
770
        elif opt in ("-s", "--blocksstep"):
771
            BlocksStep = int(arg)
772
        elif opt in ("-f", "--threadsfirst"):
773
            ThreadsBegin = int(arg)
774
            ThreadsEnd = ThreadsBegin
775
        elif opt in ("-l", "--threadslast"):
776
            ThreadsEnd = int(arg)
777
        elif opt in ("-t", "--threadsstep"):
778
            ThreadsStep = int(arg)
779
        elif opt in ("-r", "--redo"):
780
            Redo = int(arg)
781

    
782
    print("Devices Identification : %s" % Devices)
783
    print("GpuStyle used : %s" % GpuStyle)
784
    print("Iterations : %s" % Iterations)
785
    print("Number of Blocks on begin : %s" % BlocksBegin)
786
    print("Number of Blocks on end : %s" % BlocksEnd)
787
    print("Step on Blocks : %s" % BlocksStep)
788
    print("Number of Threads on begin : %s" % ThreadsBegin)
789
    print("Number of Threads on end : %s" % ThreadsEnd)
790
    print("Step on Threads : %s" % ThreadsStep)
791
    print("Number of redo : %s" % Redo)
792
    print("Metrology done out of XPU : %r" % OutMetrology)
793
    print("Type of Marsaglia RNG used : %s" % RNG)
794
    print("Type of variable : %s" % ValueType)
795

    
796
    if GpuStyle=='CUDA':
797
        try:
798
            # For PyCUDA import
799
            import pycuda.driver as cuda
800
            
801
            cuda.init()
802
            for Id in range(cuda.Device.count()):
803
                device=cuda.Device(Id)
804
                print("Device #%i of type GPU : %s" % (Id,device.name()))
805
                if Id in Devices:
806
                    Alu[Id]='GPU'
807
            
808
        except ImportError:
809
            print("Platform does not seem to support CUDA")
810

    
811
    if GpuStyle=='OpenCL':
812
        try:
813
            # For PyOpenCL import
814
            import pyopencl as cl
815
            Id=0
816
            for platform in cl.get_platforms():
817
                for device in platform.get_devices():
818
                    #deviceType=cl.device_type.to_string(device.type)
819
                    deviceType="xPU"
820
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
821

    
822
                    if Id in Devices:
823
                    # Set the Alu as detected Device Type
824
                        Alu[Id]=deviceType
825
                    Id=Id+1
826
        except ImportError:
827
            print("Platform does not seem to support OpenCL")
828

    
829
    print(Devices,Alu)
830
            
831
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
832
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
833
    
834
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
835
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
836
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
837
    avgD=numpy.array([]).astype(numpy.float32)
838
    medD=numpy.array([]).astype(numpy.float32)
839
    stdD=numpy.array([]).astype(numpy.float32)
840
    minD=numpy.array([]).astype(numpy.float32)
841
    maxD=numpy.array([]).astype(numpy.float32)
842
    avgR=numpy.array([]).astype(numpy.float32)
843
    medR=numpy.array([]).astype(numpy.float32)
844
    stdR=numpy.array([]).astype(numpy.float32)
845
    minR=numpy.array([]).astype(numpy.float32)
846
    maxR=numpy.array([]).astype(numpy.float32)
847

    
848
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
849
        
850
        # print Blocks,Threads
851
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
852
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
853
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
854
        ExploredThreads=numpy.append(ExploredThreads,Threads)
855
        
856
        if OutMetrology: 
857
            DurationItem=numpy.array([]).astype(numpy.float32)
858
            Duration=numpy.array([]).astype(numpy.float32)
859
            Rate=numpy.array([]).astype(numpy.float32)
860
            for i in range(Redo):
861
                start=time.time()
862
                if GpuStyle=='CUDA':
863
                    try:
864
                        InputCU={}
865
                        InputCU['Iterations']=Iterations
866
                        InputCU['Steps']=1
867
                        InputCU['Blocks']=Blocks
868
                        InputCU['Threads']=Threads
869
                        InputCU['Device']=Devices[0]
870
                        InputCU['RNG']=RNG
871
                        InputCU['ValueType']=ValueType                    
872
                        InputCU['IfThen']=IfThen                    
873
                        OutputCU=MetropolisCuda(InputCU)
874
                        Inside=OutputCU['Circle']
875
                        NewIterations=OutputCU['NewIterations']
876
                        Duration=OutputCU['Duration']
877
                    except:
878
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
879
                elif GpuStyle=='OpenCL':
880
                    try:
881
                        InputCL={}
882
                        InputCL['Iterations']=Iterations
883
                        InputCL['Steps']=1
884
                        InputCL['Blocks']=Blocks
885
                        InputCL['Threads']=Threads
886
                        InputCL['Device']=Devices[0]
887
                        InputCL['RNG']=RNG
888
                        InputCL['ValueType']=ValueType
889
                        InputCL['IfThen']=IfThen                    
890
                        OutputCL=MetropolisOpenCL(InputCL)
891
                        Inside=OutputCL['Circle']
892
                        NewIterations=OutputCL['NewIterations']
893
                        Duration=OutputCL['Duration']
894
                    except:
895
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
896
                Duration=numpy.append(Duration,time.time()-start)
897
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
898
        else:
899
            if GpuStyle=='CUDA':
900
                try:
901
                    InputCU={}
902
                    InputCU['Iterations']=Iterations
903
                    InputCU['Steps']=Redo
904
                    InputCU['Blocks']=Blocks
905
                    InputCU['Threads']=Threads
906
                    InputCU['Device']=Devices[0]
907
                    InputCU['RNG']=RNG
908
                    InputCU['ValueType']=ValueType
909
                    InputCU['IfThen']=IfThen                    
910
                    OutputCU=MetropolisCuda(InputCU)
911
                    Inside=OutputCU['Inside']
912
                    NewIterations=OutputCU['NewIterations']
913
                    Duration=OutputCU['Duration']
914
                    pycuda.context.pop()
915
                except:
916
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
917
            elif GpuStyle=='OpenCL':
918
                try:
919
                    InputCL={}
920
                    InputCL['Iterations']=Iterations
921
                    InputCL['Steps']=Redo
922
                    InputCL['Blocks']=Blocks
923
                    InputCL['Threads']=Threads
924
                    InputCL['Device']=Devices[0]
925
                    InputCL['RNG']=RNG
926
                    InputCL['ValueType']=ValueType
927
                    InputCL['IfThen']=IfThen
928
                    OutputCL=MetropolisOpenCL(InputCL)
929
                    Inside=OutputCL['Inside']
930
                    NewIterations=OutputCL['NewIterations']
931
                    Duration=OutputCL['Duration']
932
                except:
933
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
934
            Rate=NewIterations/Duration
935
            print("Pi estimation %.8f" % (4./NewIterations*Inside))
936

    
937
            
938
        avgD=numpy.append(avgD,numpy.average(Duration))
939
        medD=numpy.append(medD,numpy.median(Duration))
940
        stdD=numpy.append(stdD,numpy.std(Duration))
941
        minD=numpy.append(minD,numpy.min(Duration))
942
        maxD=numpy.append(maxD,numpy.max(Duration))
943
        avgR=numpy.append(avgR,numpy.average(Rate))
944
        medR=numpy.append(medR,numpy.median(Rate))
945
        stdR=numpy.append(stdR,numpy.std(Rate))
946
        minR=numpy.append(minR,numpy.min(Rate))
947
        maxR=numpy.append(maxR,numpy.max(Rate))
948

    
949
        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
950
        
951
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
952
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
953
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
954

    
955
    if Fit:
956
        FitAndPrint(ExploredJobs,median,Curves)