Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 287

Historique | Voir | Annoter | Télécharger (31,71 ko)

1
#!/usr/bin/env python3
2
#
3
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
4
#
5
# performs an estimation of Pi using Monte Carlo method
6
# a large amount of iterations is divided and distributed to compute units
7
# a lot of options are provided to perform scalabilty tests
8
#
9
# use -h for complete set of options
10
#
11
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@ens-lyon.fr> 
12
# 
13
# Part of matrix programs from: https://forge.cbp.ens-lyon.fr/svn/bench4gpu/
14

    
15
# Thanks to Andreas Klockner for PyCUDA:
16
# http://mathema.tician.de/software/pycuda
17
# Thanks to Andreas Klockner for PyOpenCL:
18
# http://mathema.tician.de/software/pyopencl
19
# 
20

    
21
# 2013-01-01 : problems with launch timeout
22
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
23
# Option "Interactive" "0" in /etc/X11/xorg.conf
24

    
25
# Common tools
26
import numpy
27
from numpy.random import randint as nprnd
28
import sys
29
import getopt
30
import time
31
import itertools
32
from socket import gethostname
33

    
34
class PenStacle:
35
    """Pentacle of Statistics from data"""
36
    Avg=0
37
    Med=0
38
    Std=0
39
    Min=0
40
    Max=0
41
    def __init__(self,Data):
42
        self.Avg=numpy.average(Data)
43
        self.Med=numpy.median(Data)
44
        self.Std=numpy.std(Data)
45
        self.Max=numpy.max(Data)
46
        self.Min=numpy.min(Data)
47
    def display(self):
48
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
49

    
50
class Experience:
51
    """Metrology for experiences"""
52
    DeviceStyle=''
53
    DeviceId=0
54
    AvgD=0
55
    MedD=0
56
    StdD=0
57
    MinD=0
58
    MaxD=0
59
    AvgR=0
60
    MedR=0
61
    StdR=0
62
    MinR=0
63
    MaxR=0
64
    def __init__(self,DeviceStyle,DeviceId,Iterations):
65
        self.DeviceStyle=DeviceStyle
66
        self.DeviceId=DeviceId
67
        self.Iterations
68
        
69
    def Metrology(self,Data):
70
        Duration=PenStacle(Data)
71
        Rate=PenStacle(Iterations/Data)
72
        print("Duration %s" % Duration)
73
        print("Rate %s" % Rate)
74

    
75

    
76
        
77
def DictionariesAPI():
78
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
79
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
80
    Test={True:1,False:0}
81
    return(Marsaglia,Computing,Test)
82
    
83
# find prime factors of a number
84
# Get for WWW :
85
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
86
def PrimeFactors(x):
87

    
88
    factorlist=numpy.array([]).astype('uint32')
89
    loop=2
90
    while loop<=x:
91
        if x%loop==0:
92
            x/=loop
93
            factorlist=numpy.append(factorlist,[loop])
94
        else:
95
            loop+=1
96
    return factorlist
97

    
98
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
99
# output is thread number
100
def BestThreadsNumber(jobs):
101
    factors=PrimeFactors(jobs)
102
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
103
    threads=1
104
    for factor in matrix.transpose().ravel():
105
        threads=threads*factor
106
        if threads*threads>jobs or threads>512:
107
            break
108
    return(long(threads))
109

    
110
# Predicted Amdahl Law (Reduced with s=1-p)  
111
def AmdahlR(N, T1, p):
112
    return (T1*(1-p+p/N))
113

    
114
# Predicted Amdahl Law
115
def Amdahl(N, T1, s, p):
116
    return (T1*(s+p/N))
117

    
118
# Predicted Mylq Law with first order
119
def Mylq(N, T1,s,c,p):
120
    return (T1*(s+p/N)+c*N)
121

    
122
# Predicted Mylq Law with second order
123
def Mylq2(N, T1,s,c1,c2,p):
124
    return (T1*(s+p/N)+c1*N+c2*N*N)
125

    
126
def KernelCodeCuda():
127
    KERNEL_CODE_CUDA="""
128
#define TCONG 0
129
#define TSHR3 1
130
#define TMWC 2
131
#define TKISS 3
132

133
#define TINT32 0
134
#define TINT64 1
135
#define TFP32 2
136
#define TFP64 3
137

138
#define IFTHEN 1
139

140
// Marsaglia RNG very simple implementation
141

142
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
143
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
144
#define MWC   (znew+wnew)
145
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
146
#define CONG  (jcong=69069*jcong+1234567)
147
#define KISS  ((MWC^CONG)+SHR3)
148

149
#define MWCfp MWC * 2.328306435454494e-10f
150
#define KISSfp KISS * 2.328306435454494e-10f
151
#define SHR3fp SHR3 * 2.328306435454494e-10f
152
#define CONGfp CONG * 2.328306435454494e-10f
153

154
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
155
{
156

157
#if TRNG == TCONG
158
   uint jcong=seed_z+work;
159
#elif TRNG == TSHR3
160
   uint jsr=seed_w+work;
161
#elif TRNG == TMWC
162
   uint z=seed_z+work;
163
   uint w=seed_w+work;
164
#elif TRNG == TKISS
165
   uint jcong=seed_z+work;
166
   uint jsr=seed_w+work;
167
   uint z=seed_z-work;
168
   uint w=seed_w-work;
169
#endif
170

171
   ulong total=0;
172

173
   for (ulong i=0;i<iterations;i++) {
174

175
#if TYPE == TINT32
176
    #define THEONE 1073741824
177
    #if TRNG == TCONG
178
        uint x=CONG>>17 ;
179
        uint y=CONG>>17 ;
180
    #elif TRNG == TSHR3
181
        uint x=SHR3>>17 ;
182
        uint y=SHR3>>17 ;
183
    #elif TRNG == TMWC
184
        uint x=MWC>>17 ;
185
        uint y=MWC>>17 ;
186
    #elif TRNG == TKISS
187
        uint x=KISS>>17 ;
188
        uint y=KISS>>17 ;
189
    #endif
190
#elif TYPE == TINT64
191
    #define THEONE 4611686018427387904
192
    #if TRNG == TCONG
193
        ulong x=(ulong)(CONG>>1) ;
194
        ulong y=(ulong)(CONG>>1) ;
195
    #elif TRNG == TSHR3
196
        ulong x=(ulong)(SHR3>>1) ;
197
        ulong y=(ulong)(SHR3>>1) ;
198
    #elif TRNG == TMWC
199
        ulong x=(ulong)(MWC>>1) ;
200
        ulong y=(ulong)(MWC>>1) ;
201
    #elif TRNG == TKISS
202
        ulong x=(ulong)(KISS>>1) ;
203
        ulong y=(ulong)(KISS>>1) ;
204
    #endif
205
#elif TYPE == TFP32
206
    #define THEONE 1.0f
207
    #if TRNG == TCONG
208
        float x=CONGfp ;
209
        float y=CONGfp ;
210
    #elif TRNG == TSHR3
211
        float x=SHR3fp ;
212
        float y=SHR3fp ;
213
    #elif TRNG == TMWC
214
        float x=MWCfp ;
215
        float y=MWCfp ;
216
    #elif TRNG == TKISS
217
      float x=KISSfp ;
218
      float y=KISSfp ;
219
    #endif
220
#elif TYPE == TFP64
221
    #define THEONE 1.0f
222
    #if TRNG == TCONG
223
        double x=(double)CONGfp ;
224
        double y=(double)CONGfp ;
225
    #elif TRNG == TSHR3
226
        double x=(double)SHR3fp ;
227
        double y=(double)SHR3fp ;
228
    #elif TRNG == TMWC
229
        double x=(double)MWCfp ;
230
        double y=(double)MWCfp ;
231
    #elif TRNG == TKISS
232
        double x=(double)KISSfp ;
233
        double y=(double)KISSfp ;
234
    #endif
235
#endif
236

237
#if TEST == IFTHEN
238
      if ((x*x+y*y) <=THEONE) {
239
         total+=1;
240
      }
241
#else
242
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
243
      total+=inside;
244
#endif
245
   }
246

247
   return(total);
248
}
249

250
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
251
{
252
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
253
   s[blockIdx.x]=total;
254
   __syncthreads();
255

256
}
257

258
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
259
{
260
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
261
   s[threadIdx.x]=total;
262
   __syncthreads();
263

264
}
265

266
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
267
{
268
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
269
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
270
   __syncthreads();
271
}
272

273
"""
274
    return(KERNEL_CODE_CUDA)
275

    
276
def KernelCodeOpenCL():
277
    KERNEL_CODE_OPENCL="""
278
#define TCONG 0
279
#define TSHR3 1
280
#define TMWC 2
281
#define TKISS 3
282

283
#define TINT32 0
284
#define TINT64 1
285
#define TFP32 2
286
#define TFP64 3
287

288
#define IFTHEN 1
289

290
// Marsaglia RNG very simple implementation
291
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
292
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
293

294
#define MWC   (znew+wnew)
295
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
296
#define CONG  (jcong=69069*jcong+1234567)
297
#define KISS  ((MWC^CONG)+SHR3)
298

299
#define MWCfp MWC * 2.328306435454494e-10f
300
#define KISSfp KISS * 2.328306435454494e-10f
301
#define CONGfp CONG * 2.328306435454494e-10f
302
#define SHR3fp SHR3 * 2.328306435454494e-10f
303

304
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
305
{
306

307
#if TRNG == TCONG
308
   uint jcong=seed_z+work;
309
#elif TRNG == TSHR3
310
   uint jsr=seed_w+work;
311
#elif TRNG == TMWC
312
   uint z=seed_z+work;
313
   uint w=seed_w+work;
314
#elif TRNG == TKISS
315
   uint jcong=seed_z+work;
316
   uint jsr=seed_w+work;
317
   uint z=seed_z-work;
318
   uint w=seed_w-work;
319
#endif
320

321
   ulong total=0;
322

323
   for (ulong i=0;i<iterations;i++) {
324

325
#if TYPE == TINT32
326
    #define THEONE 1073741824
327
    #if TRNG == TCONG
328
        uint x=CONG>>17 ;
329
        uint y=CONG>>17 ;
330
    #elif TRNG == TSHR3
331
        uint x=SHR3>>17 ;
332
        uint y=SHR3>>17 ;
333
    #elif TRNG == TMWC
334
        uint x=MWC>>17 ;
335
        uint y=MWC>>17 ;
336
    #elif TRNG == TKISS
337
        uint x=KISS>>17 ;
338
        uint y=KISS>>17 ;
339
    #endif
340
#elif TYPE == TINT64
341
    #define THEONE 4611686018427387904
342
    #if TRNG == TCONG
343
        ulong x=(ulong)(CONG>>1) ;
344
        ulong y=(ulong)(CONG>>1) ;
345
    #elif TRNG == TSHR3
346
        ulong x=(ulong)(SHR3>>1) ;
347
        ulong y=(ulong)(SHR3>>1) ;
348
    #elif TRNG == TMWC
349
        ulong x=(ulong)(MWC>>1) ;
350
        ulong y=(ulong)(MWC>>1) ;
351
    #elif TRNG == TKISS
352
        ulong x=(ulong)(KISS>>1) ;
353
        ulong y=(ulong)(KISS>>1) ;
354
    #endif
355
#elif TYPE == TFP32
356
    #define THEONE 1.0f
357
    #if TRNG == TCONG
358
        float x=CONGfp ;
359
        float y=CONGfp ;
360
    #elif TRNG == TSHR3
361
        float x=SHR3fp ;
362
        float y=SHR3fp ;
363
    #elif TRNG == TMWC
364
        float x=MWCfp ;
365
        float y=MWCfp ;
366
    #elif TRNG == TKISS
367
      float x=KISSfp ;
368
      float y=KISSfp ;
369
    #endif
370
#elif TYPE == TFP64
371
#pragma OPENCL EXTENSION cl_khr_fp64: enable
372
    #define THEONE 1.0f
373
    #if TRNG == TCONG
374
        double x=(double)CONGfp ;
375
        double y=(double)CONGfp ;
376
    #elif TRNG == TSHR3
377
        double x=(double)SHR3fp ;
378
        double y=(double)SHR3fp ;
379
    #elif TRNG == TMWC
380
        double x=(double)MWCfp ;
381
        double y=(double)MWCfp ;
382
    #elif TRNG == TKISS
383
        double x=(double)KISSfp ;
384
        double y=(double)KISSfp ;
385
    #endif
386
#endif
387

388
#if TEST == IFTHEN
389
      if ((x*x+y*y) <= THEONE) {
390
         total+=1;
391
      }
392
#else
393
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
394
      total+=inside;
395
#endif
396
   }
397
   
398
   return(total);
399
}
400

401
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
402
{
403
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
404
   barrier(CLK_GLOBAL_MEM_FENCE);
405
   s[get_global_id(0)]=total;     
406
}
407

408
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
409
{
410
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
411
   barrier(CLK_LOCAL_MEM_FENCE);
412
   s[get_local_id(0)]=total;
413
}
414

415
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
416
{
417
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
418
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
419
   s[get_global_id(0)]=total;
420
}
421

422
"""
423
    return(KERNEL_CODE_OPENCL)
424
    
425
def MetropolisCuda(InputCU):
426

    
427
    print("Inside ",InputCU)
428
    
429
    iterations=InputCU['Iterations']
430
    steps=InputCU['Steps']
431
    blocks=InputCU['Blocks']
432
    threads=InputCU['Threads']
433
    Device=InputCU['Device']
434
    RNG=InputCU['RNG']
435
    ValueType=InputCU['ValueType']
436
    TestType=InputCU['IfThen']
437
    Seeds=InputCU['Seeds']
438
    
439
    Marsaglia,Computing,Test=DictionariesAPI()
440
    
441
    try:
442
        # For PyCUDA import
443
        import pycuda.driver as cuda
444
        from pycuda.compiler import SourceModule
445
        
446
        cuda.init()
447
        for Id in range(cuda.Device.count()):
448
            if Id==Device:
449
                XPU=cuda.Device(Id)
450
                print("GPU selected %s" % XPU.name())
451
        print
452

    
453
    except ImportError:
454
        print("Platform does not seem to support CUDA")
455

    
456
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)  
457
    circleCU = cuda.InOut(circle)
458
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
459
    #cuda.memcpy_htod(circleCU, circle)
460

    
461
    Context=XPU.make_context()
462

    
463
    try:
464
        mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
465
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
466
        # Needed to set the compiler via ccbin for CUDA9 implementation
467
        #mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.9','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
468
    except:
469
        print("Compilation seems to break")
470
 
471
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
472
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
473
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")
474

    
475
    MyDuration=numpy.zeros(steps)
476

    
477
    jobs=blocks*threads;
478

    
479
    iterationsCU=numpy.uint64(iterations/jobs)
480
    if iterations%jobs!=0:
481
        iterationsCU+=numpy.uint64(1)
482

    
483
    for i in range(steps):
484
        start_time=time.time()
485

    
486
        try:
487
            MetropolisHybridCU(circleCU,
488
                               numpy.uint64(iterationsCU),
489
                               numpy.uint32(Seeds[0]),
490
                               numpy.uint32(Seeds[1]),
491
                               grid=(blocks,1),block=(threads,1,1))
492
        except:
493
            print("Crash during CUDA call")
494

    
495
        elapsed = time.time()-start_time
496
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
497

    
498
        MyDuration[i]=elapsed
499

    
500
    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
501
    print(OutputCU)
502
    Context.pop()
503
    
504
    Context.detach()
505

    
506
    return(OutputCU)
507

    
508
def MetropolisOpenCL(InputCL):
509

    
510
    import pyopencl as cl
511

    
512
    iterations=InputCL['Iterations']
513
    steps=InputCL['Steps']
514
    blocks=InputCL['Blocks']
515
    threads=InputCL['Threads']
516
    Device=InputCL['Device']
517
    RNG=InputCL['RNG']  
518
    ValueType=InputCL['ValueType']
519
    TestType=InputCL['IfThen']
520
    Seeds=InputCL['Seeds']
521

    
522
    Marsaglia,Computing,Test=DictionariesAPI()
523

    
524
    # Initialisation des variables en les CASTant correctement
525
    Id=0
526
    HasXPU=False
527
    for platform in cl.get_platforms():
528
        for device in platform.get_devices():
529
            if Id==Device:
530
                XPU=device
531
                print("CPU/GPU selected: ",device.name.lstrip())
532
                HasXPU=True
533
            Id+=1
534
            # print(Id)
535

    
536
    if HasXPU==False:
537
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
538
        sys.exit()           
539

    
540
    # Je cree le contexte et la queue pour son execution
541
    try:
542
        ctx = cl.Context(devices=[XPU])
543
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
544
    except:
545
        print("Crash during context creation")
546

    
547
    # Je recupere les flag possibles pour les buffers
548
    mf = cl.mem_flags
549
        
550
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
551
    #    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
552
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY,circle.nbytes)
553
    
554
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))
555

    
556
    MyDuration=numpy.zeros(steps)
557
  
558
    jobs=blocks*threads;
559

    
560
    iterationsCL=numpy.uint64(iterations/jobs)
561
    if iterations%jobs!=0:
562
        iterationsCL+=1
563

    
564
    for i in range(steps):
565
        start_time=time.time()
566
        if threads == 1:
567
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
568
                                                 circleCL,
569
                                                 numpy.uint64(iterationsCL),
570
                                                 numpy.uint32(Seeds[0]),
571
                                                 numpy.uint32(Seeds[1]))
572
        else:
573
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
574
                                                 circleCL,
575
                                                 numpy.uint64(iterationsCL),
576
                                                 numpy.uint32(Seeds[0]),
577
                                                 numpy.uint32(Seeds[1]))
578

    
579
        CLLaunch.wait()
580
        cl.enqueue_copy(queue, circle, circleCL).wait()
581

    
582
        elapsed = time.time()-start_time
583
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
584
            
585
        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
586
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
587

    
588
        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
589
        MyDuration[i]=elapsed
590
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
591
        # MyPi[i]=numpy.median(AllPi)
592
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
593

    
594
    circleCL.release()
595

    
596
    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
597
    # print(OutputCL)
598
    return(OutputCL)
599

    
600

    
601
def FitAndPrint(N,D,Curves):
602

    
603
    from scipy.optimize import curve_fit
604
    import matplotlib.pyplot as plt
605

    
606
    try:
607
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
608

    
609
        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
610
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
611
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
612
        coeffs_Amdahl[0]=D[0]
613
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
614
    except:
615
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))
616

    
617
    try:
618
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
619

    
620
        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
621
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
622
        coeffs_AmdahlR[0]=D[0]
623
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
624

    
625
    except:
626
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
627

    
628
    try:
629
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
630

    
631
        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
632
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
633
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
634
        coeffs_Mylq[0]=D[0]
635
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
636
                                                                coeffs_Mylq[1],
637
                                                                coeffs_Mylq[3],
638
                                                                coeffs_Mylq[2]))
639
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
640
                    coeffs_Mylq[3])
641
    except:
642
        print("Impossible to fit for Mylq law : only %i elements" % len(D))
643

    
644
    try:
645
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
646

    
647
        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
648
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
649
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
650
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
651
        coeffs_Mylq2[0]=D[0]
652
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
653

    
654
    except:
655
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
656

    
657
    if Curves:
658
        plt.xlabel("Number of Threads/work Items")
659
        plt.ylabel("Total Elapsed Time")
660

    
661
        Experience,=plt.plot(N,D,'ro') 
662
    try:
663
        pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
664
        pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
665
    except:
666
        print("Fit curves seem not to be available")
667

    
668
    plt.legend()
669
    plt.show()
670

    
671
if __name__=='__main__':
672
    
673
    # Set defaults values
674
  
675
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
676
    GpuStyle='OpenCL'
677
    # Iterations is integer
678
    Iterations=1000000000
679
    # BlocksBlocks in first number of Blocks to explore
680
    BlocksBegin=1024
681
    # BlocksEnd is last number of Blocks to explore
682
    BlocksEnd=1024
683
    # BlocksStep is the step of Blocks to explore
684
    BlocksStep=1
685
    # ThreadsBlocks in first number of Blocks to explore
686
    ThreadsBegin=1
687
    # ThreadsEnd is last number of Blocks to explore
688
    ThreadsEnd=1
689
    # ThreadsStep is the step of Blocks to explore
690
    ThreadsStep=1
691
    # Redo is the times to redo the test to improve metrology
692
    Redo=1
693
    # OutMetrology is method for duration estimation : False is GPU inside
694
    OutMetrology=False
695
    Metrology='InMetro'
696
    # Curves is True to print the curves
697
    Curves=False
698
    # Fit is True to print the curves
699
    Fit=False
700
    # Inside based on If
701
    IfThen=False
702
    # Marsaglia RNG
703
    RNG='MWC'
704
    # Value type : INT32, INT64, FP32, FP64
705
    ValueType='FP32'
706
    # Seeds for RNG
707
    Seeds=110271,101008
708

    
709
    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
710
    
711
    try:
712
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
713
    except getopt.GetoptError:
714
        print(HowToUse % sys.argv[0])
715
        sys.exit(2)
716

    
717
    # List of Devices
718
    Devices=[]
719
    Alu={}
720
        
721
    for opt, arg in opts:
722
        if opt == '-h':
723
            print(HowToUse % sys.argv[0])
724

    
725
            print("\nInformations about devices detected under OpenCL API:")
726
            # For PyOpenCL import
727
            try:
728
                import pyopencl as cl
729
                Id=0
730
                for platform in cl.get_platforms():
731
                    for device in platform.get_devices():
732
                        #deviceType=cl.device_type.to_string(device.type)
733
                        deviceType="xPU"
734
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
735
                        Id=Id+1
736

    
737
            except:
738
                print("Your platform does not seem to support OpenCL")
739

    
740
            print("\nInformations about devices detected under CUDA API:")
741
            # For PyCUDA import
742
            try:
743
                import pycuda.driver as cuda
744
                cuda.init()
745
                for Id in range(cuda.Device.count()):
746
                    device=cuda.Device(Id)
747
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
748
                print
749
            except:
750
                print("Your platform does not seem to support CUDA")
751
        
752
            sys.exit()
753
        
754
        elif opt == '-o':
755
            OutMetrology=True
756
            Metrology='OutMetro'
757
        elif opt == '-c':
758
            Curves=True
759
        elif opt == '-k':
760
            IfThen=True
761
        elif opt in ("-d", "--device"):
762
            Devices.append(int(arg))
763
        elif opt in ("-g", "--gpustyle"):
764
            GpuStyle = arg
765
        elif opt in ("-m", "--marsaglia"):
766
            RNG = arg
767
        elif opt in ("-v", "--valuetype"):
768
            ValueType = arg
769
        elif opt in ("-i", "--iterations"):
770
            Iterations = numpy.uint64(arg)
771
        elif opt in ("-b", "--blocksbegin"):
772
            BlocksBegin = int(arg)
773
            BlocksEnd= BlocksBegin
774
        elif opt in ("-e", "--blocksend"):
775
            BlocksEnd = int(arg)
776
        elif opt in ("-s", "--blocksstep"):
777
            BlocksStep = int(arg)
778
        elif opt in ("-f", "--threadsfirst"):
779
            ThreadsBegin = int(arg)
780
            ThreadsEnd = ThreadsBegin
781
        elif opt in ("-l", "--threadslast"):
782
            ThreadsEnd = int(arg)
783
        elif opt in ("-t", "--threadsstep"):
784
            ThreadsStep = int(arg)
785
        elif opt in ("-r", "--redo"):
786
            Redo = int(arg)
787

    
788
    # If no device has been specified, take the first one!
789
    if len(Devices)==0:
790
        Devices.append(0)
791
        
792
    print("Devices Identification : %s" % Devices)
793
    print("GpuStyle used : %s" % GpuStyle)
794
    print("Iterations : %s" % Iterations)
795
    print("Number of Blocks on begin : %s" % BlocksBegin)
796
    print("Number of Blocks on end : %s" % BlocksEnd)
797
    print("Step on Blocks : %s" % BlocksStep)
798
    print("Number of Threads on begin : %s" % ThreadsBegin)
799
    print("Number of Threads on end : %s" % ThreadsEnd)
800
    print("Step on Threads : %s" % ThreadsStep)
801
    print("Number of redo : %s" % Redo)
802
    print("Metrology done out of XPU : %r" % OutMetrology)
803
    print("Type of Marsaglia RNG used : %s" % RNG)
804
    print("Type of variable : %s" % ValueType)
805

    
806
    if GpuStyle=='CUDA':
807
        try:
808
            # For PyCUDA import
809
            import pycuda.driver as cuda
810
            
811
            cuda.init()
812
            for Id in range(cuda.Device.count()):
813
                device=cuda.Device(Id)
814
                print("Device #%i of type GPU : %s" % (Id,device.name()))
815
                if Id in Devices:
816
                    Alu[Id]='GPU'
817
            
818
        except ImportError:
819
            print("Platform does not seem to support CUDA")
820

    
821
    if GpuStyle=='OpenCL':
822
        try:
823
            # For PyOpenCL import
824
            import pyopencl as cl
825
            Id=0
826
            for platform in cl.get_platforms():
827
                for device in platform.get_devices():
828
                    #deviceType=cl.device_type.to_string(device.type)
829
                    deviceType="xPU"
830
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
831

    
832
                    if Id in Devices:
833
                    # Set the Alu as detected Device Type
834
                        Alu[Id]=deviceType
835
                    Id=Id+1
836
        except ImportError:
837
            print("Platform does not seem to support OpenCL")
838

    
839
    #print(Devices,Alu)
840
            
841
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
842
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
843
    
844
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
845
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
846
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
847
    avgD=numpy.array([]).astype(numpy.float32)
848
    medD=numpy.array([]).astype(numpy.float32)
849
    stdD=numpy.array([]).astype(numpy.float32)
850
    minD=numpy.array([]).astype(numpy.float32)
851
    maxD=numpy.array([]).astype(numpy.float32)
852
    avgR=numpy.array([]).astype(numpy.float32)
853
    medR=numpy.array([]).astype(numpy.float32)
854
    stdR=numpy.array([]).astype(numpy.float32)
855
    minR=numpy.array([]).astype(numpy.float32)
856
    maxR=numpy.array([]).astype(numpy.float32)
857

    
858
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
859
        
860
        # print Blocks,Threads
861
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
862
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
863
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
864
        ExploredThreads=numpy.append(ExploredThreads,Threads)
865
        
866
        if OutMetrology: 
867
            DurationItem=numpy.array([]).astype(numpy.float32)
868
            Duration=numpy.array([]).astype(numpy.float32)
869
            Rate=numpy.array([]).astype(numpy.float32)
870
            for i in range(Redo):
871
                start=time.time()
872
                if GpuStyle=='CUDA':
873
                    try:
874
                        InputCU={}
875
                        InputCU['Iterations']=Iterations
876
                        InputCU['Steps']=1
877
                        InputCU['Blocks']=Blocks
878
                        InputCU['Threads']=Threads
879
                        InputCU['Device']=Devices[0]
880
                        InputCU['RNG']=RNG
881
                        InputCU['Seeds']=Seeds
882
                        InputCU['ValueType']=ValueType                    
883
                        InputCU['IfThen']=IfThen                    
884
                        OutputCU=MetropolisCuda(InputCU)
885
                        Inside=OutputCU['Circle']
886
                        NewIterations=OutputCU['NewIterations']
887
                        Duration=OutputCU['Duration']
888
                    except:
889
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
890
                elif GpuStyle=='OpenCL':
891
                    try:
892
                        InputCL={}
893
                        InputCL['Iterations']=Iterations
894
                        InputCL['Steps']=1
895
                        InputCL['Blocks']=Blocks
896
                        InputCL['Threads']=Threads
897
                        InputCL['Device']=Devices[0]
898
                        InputCL['RNG']=RNG
899
                        InputCL['Seeds']=Seeds
900
                        InputCL['ValueType']=ValueType
901
                        InputCL['IfThen']=IfThen                    
902
                        OutputCL=MetropolisOpenCL(InputCL)
903
                        Inside=OutputCL['Circle']
904
                        NewIterations=OutputCL['NewIterations']
905
                        Duration=OutputCL['Duration']
906
                    except:
907
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
908
                Duration=numpy.append(Duration,time.time()-start)
909
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
910
        else:
911
            if GpuStyle=='CUDA':
912
                try:
913
                    InputCU={}
914
                    InputCU['Iterations']=Iterations
915
                    InputCU['Steps']=Redo
916
                    InputCU['Blocks']=Blocks
917
                    InputCU['Threads']=Threads
918
                    InputCU['Device']=Devices[0]
919
                    InputCU['RNG']=RNG
920
                    InputCU['Seeds']=Seeds
921
                    InputCU['ValueType']=ValueType
922
                    InputCU['IfThen']=IfThen                    
923
                    OutputCU=MetropolisCuda(InputCU)
924
                    Inside=OutputCU['Inside']
925
                    NewIterations=OutputCU['NewIterations']
926
                    Duration=OutputCU['Duration']
927
                    pycuda.context.pop()
928
                except:
929
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
930
            elif GpuStyle=='OpenCL':
931
                try:
932
                    InputCL={}
933
                    InputCL['Iterations']=Iterations
934
                    InputCL['Steps']=Redo
935
                    InputCL['Blocks']=Blocks
936
                    InputCL['Threads']=Threads
937
                    InputCL['Device']=Devices[0]
938
                    InputCL['RNG']=RNG
939
                    InputCL['Seeds']=Seeds
940
                    InputCL['ValueType']=ValueType
941
                    InputCL['IfThen']=IfThen
942
                    OutputCL=MetropolisOpenCL(InputCL)
943
                    Inside=OutputCL['Inside']
944
                    NewIterations=OutputCL['NewIterations']
945
                    Duration=OutputCL['Duration']
946
                except:
947
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
948
            Rate=NewIterations/Duration[-1]
949
            print("Itops %i\nLogItops %.2f " % (int(Rate),numpy.log(Rate)/numpy.log(10)))
950
            print("Pi estimation %.8f" % (4./NewIterations*Inside))
951

    
952
        avgD=numpy.append(avgD,numpy.average(Duration))
953
        medD=numpy.append(medD,numpy.median(Duration))
954
        stdD=numpy.append(stdD,numpy.std(Duration))
955
        minD=numpy.append(minD,numpy.min(Duration))
956
        maxD=numpy.append(maxD,numpy.max(Duration))
957
        avgR=numpy.append(avgR,numpy.average(Rate))
958
        medR=numpy.append(medR,numpy.median(Rate))
959
        stdR=numpy.append(stdR,numpy.std(Rate))
960
        minR=numpy.append(minR,numpy.min(Rate))
961
        maxR=numpy.append(maxR,numpy.max(Rate))
962

    
963
        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
964
        
965
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
966
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
967
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
968

    
969
    if Fit:
970
        FitAndPrint(ExploredJobs,median,Curves)