Statistiques
| Révision :

root / Pi / XPU / PiXPU.py @ 187

Historique | Voir | Annoter | Télécharger (31,02 ko)

1
#!/usr/bin/env python3
2

    
3
#
4
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
5
#
6
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com> 
7
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
8
#
9
# Thanks to Andreas Klockner for PyCUDA:
10
# http://mathema.tician.de/software/pycuda
11
# Thanks to Andreas Klockner for PyOpenCL:
12
# http://mathema.tician.de/software/pyopencl
13
# 
14

    
15
# 2013-01-01 : problems with launch timeout
16
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
17
# Option "Interactive" "0" in /etc/X11/xorg.conf
18

    
19
# Common tools
20
import numpy
21
from numpy.random import randint as nprnd
22
import sys
23
import getopt
24
import time
25
import itertools
26
from socket import gethostname
27

    
28
class PenStacle:
29
    """Pentacle of Statistics from data"""
30
    Avg=0
31
    Med=0
32
    Std=0
33
    Min=0
34
    Max=0
35
    def __init__(self,Data):
36
        self.Avg=numpy.average(Data)
37
        self.Med=numpy.median(Data)
38
        self.Std=numpy.std(Data)
39
        self.Max=numpy.max(Data)
40
        self.Min=numpy.min(Data)
41
    def display(self):
42
        print("%s %s %s %s %s" % (self.Avg,self.Med,self.Std,self.Min,self.Max))
43

    
44
class Experience:
45
    """Metrology for experiences"""
46
    DeviceStyle=''
47
    DeviceId=0
48
    AvgD=0
49
    MedD=0
50
    StdD=0
51
    MinD=0
52
    MaxD=0
53
    AvgR=0
54
    MedR=0
55
    StdR=0
56
    MinR=0
57
    MaxR=0
58
    def __init__(self,DeviceStyle,DeviceId,Iterations):
59
        self.DeviceStyle=DeviceStyle
60
        self.DeviceId=DeviceId
61
        self.Iterations
62
        
63
    def Metrology(self,Data):
64
        Duration=PenStacle(Data)
65
        Rate=PenStacle(Iterations/Data)
66
        print("Duration %s" % Duration)
67
        print("Rate %s" % Rate)
68

    
69

    
70
        
71
def DictionariesAPI():
72
    Marsaglia={'CONG':0,'SHR3':1,'MWC':2,'KISS':3}
73
    Computing={'INT32':0,'INT64':1,'FP32':2,'FP64':3}
74
    Test={True:1,False:0}
75
    return(Marsaglia,Computing,Test)
76
    
77
# find prime factors of a number
78
# Get for WWW :
79
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
80
def PrimeFactors(x):
81

    
82
    factorlist=numpy.array([]).astype('uint32')
83
    loop=2
84
    while loop<=x:
85
        if x%loop==0:
86
            x/=loop
87
            factorlist=numpy.append(factorlist,[loop])
88
        else:
89
            loop+=1
90
    return factorlist
91

    
92
# Try to find the best thread number in Hybrid approach (Blocks&Threads)
93
# output is thread number
94
def BestThreadsNumber(jobs):
95
    factors=PrimeFactors(jobs)
96
    matrix=numpy.append([factors],[factors[::-1]],axis=0)
97
    threads=1
98
    for factor in matrix.transpose().ravel():
99
        threads=threads*factor
100
        if threads*threads>jobs or threads>512:
101
            break
102
    return(long(threads))
103

    
104
# Predicted Amdahl Law (Reduced with s=1-p)  
105
def AmdahlR(N, T1, p):
106
    return (T1*(1-p+p/N))
107

    
108
# Predicted Amdahl Law
109
def Amdahl(N, T1, s, p):
110
    return (T1*(s+p/N))
111

    
112
# Predicted Mylq Law with first order
113
def Mylq(N, T1,s,c,p):
114
    return (T1*(s+p/N)+c*N)
115

    
116
# Predicted Mylq Law with second order
117
def Mylq2(N, T1,s,c1,c2,p):
118
    return (T1*(s+p/N)+c1*N+c2*N*N)
119

    
120
def KernelCodeCuda():
121
    KERNEL_CODE_CUDA="""
122
#define TCONG 0
123
#define TSHR3 1
124
#define TMWC 2
125
#define TKISS 3
126

127
#define TINT32 0
128
#define TINT64 1
129
#define TFP32 2
130
#define TFP64 3
131

132
#define IFTHEN 1
133

134
// Marsaglia RNG very simple implementation
135

136
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
137
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
138
#define MWC   (znew+wnew)
139
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
140
#define CONG  (jcong=69069*jcong+1234567)
141
#define KISS  ((MWC^CONG)+SHR3)
142

143
#define MWCfp MWC * 2.328306435454494e-10f
144
#define KISSfp KISS * 2.328306435454494e-10f
145
#define SHR3fp SHR3 * 2.328306435454494e-10f
146
#define CONGfp CONG * 2.328306435454494e-10f
147

148
__device__ ulong MainLoop(ulong iterations,uint seed_w,uint seed_z,size_t work)
149
{
150

151
#if TRNG == TCONG
152
   uint jcong=seed_z+work;
153
#elif TRNG == TSHR3
154
   uint jsr=seed_w+work;
155
#elif TRNG == TMWC
156
   uint z=seed_z+work;
157
   uint w=seed_w+work;
158
#elif TRNG == TKISS
159
   uint jcong=seed_z+work;
160
   uint jsr=seed_w+work;
161
   uint z=seed_z-work;
162
   uint w=seed_w-work;
163
#endif
164

165
   ulong total=0;
166

167
   for (ulong i=0;i<iterations;i++) {
168

169
#if TYPE == TINT32
170
    #define THEONE 1073741824
171
    #if TRNG == TCONG
172
        uint x=CONG>>17 ;
173
        uint y=CONG>>17 ;
174
    #elif TRNG == TSHR3
175
        uint x=SHR3>>17 ;
176
        uint y=SHR3>>17 ;
177
    #elif TRNG == TMWC
178
        uint x=MWC>>17 ;
179
        uint y=MWC>>17 ;
180
    #elif TRNG == TKISS
181
        uint x=KISS>>17 ;
182
        uint y=KISS>>17 ;
183
    #endif
184
#elif TYPE == TINT64
185
    #define THEONE 4611686018427387904
186
    #if TRNG == TCONG
187
        ulong x=(ulong)(CONG>>1) ;
188
        ulong y=(ulong)(CONG>>1) ;
189
    #elif TRNG == TSHR3
190
        ulong x=(ulong)(SHR3>>1) ;
191
        ulong y=(ulong)(SHR3>>1) ;
192
    #elif TRNG == TMWC
193
        ulong x=(ulong)(MWC>>1) ;
194
        ulong y=(ulong)(MWC>>1) ;
195
    #elif TRNG == TKISS
196
        ulong x=(ulong)(KISS>>1) ;
197
        ulong y=(ulong)(KISS>>1) ;
198
    #endif
199
#elif TYPE == TFP32
200
    #define THEONE 1.0f
201
    #if TRNG == TCONG
202
        float x=CONGfp ;
203
        float y=CONGfp ;
204
    #elif TRNG == TSHR3
205
        float x=SHR3fp ;
206
        float y=SHR3fp ;
207
    #elif TRNG == TMWC
208
        float x=MWCfp ;
209
        float y=MWCfp ;
210
    #elif TRNG == TKISS
211
      float x=KISSfp ;
212
      float y=KISSfp ;
213
    #endif
214
#elif TYPE == TFP64
215
    #define THEONE 1.0f
216
    #if TRNG == TCONG
217
        double x=(double)CONGfp ;
218
        double y=(double)CONGfp ;
219
    #elif TRNG == TSHR3
220
        double x=(double)SHR3fp ;
221
        double y=(double)SHR3fp ;
222
    #elif TRNG == TMWC
223
        double x=(double)MWCfp ;
224
        double y=(double)MWCfp ;
225
    #elif TRNG == TKISS
226
        double x=(double)KISSfp ;
227
        double y=(double)KISSfp ;
228
    #endif
229
#endif
230

231
#if TEST == IFTHEN
232
      if ((x*x+y*y) <=THEONE) {
233
         total+=1;
234
      }
235
#else
236
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
237
      total+=inside;
238
#endif
239
   }
240

241
   return(total);
242
}
243

244
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
245
{
246
   ulong total=MainLoop(iterations,seed_z,seed_w,blockIdx.x);
247
   s[blockIdx.x]=total;
248
   __syncthreads();
249

250
}
251

252
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
253
{
254
   ulong total=MainLoop(iterations,seed_z,seed_w,threadIdx.x);
255
   s[threadIdx.x]=total;
256
   __syncthreads();
257

258
}
259

260
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
261
{
262
   ulong total=MainLoop(iterations,seed_z,seed_w,blockDim.x*blockIdx.x+threadIdx.x);
263
   s[blockDim.x*blockIdx.x+threadIdx.x]=total;
264
   __syncthreads();
265
}
266

267
"""
268
    return(KERNEL_CODE_CUDA)
269

    
270
def KernelCodeOpenCL():
271
    KERNEL_CODE_OPENCL="""
272
#define TCONG 0
273
#define TSHR3 1
274
#define TMWC 2
275
#define TKISS 3
276

277
#define TINT32 0
278
#define TINT64 1
279
#define TFP32 2
280
#define TFP64 3
281

282
#define IFTHEN 1
283

284
// Marsaglia RNG very simple implementation
285
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
286
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
287

288
#define MWC   (znew+wnew)
289
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
290
#define CONG  (jcong=69069*jcong+1234567)
291
#define KISS  ((MWC^CONG)+SHR3)
292

293
#define MWCfp MWC * 2.328306435454494e-10f
294
#define KISSfp KISS * 2.328306435454494e-10f
295
#define CONGfp CONG * 2.328306435454494e-10f
296
#define SHR3fp SHR3 * 2.328306435454494e-10f
297

298
ulong MainLoop(ulong iterations,uint seed_z,uint seed_w,size_t work)
299
{
300

301
#if TRNG == TCONG
302
   uint jcong=seed_z+work;
303
#elif TRNG == TSHR3
304
   uint jsr=seed_w+work;
305
#elif TRNG == TMWC
306
   uint z=seed_z+work;
307
   uint w=seed_w+work;
308
#elif TRNG == TKISS
309
   uint jcong=seed_z+work;
310
   uint jsr=seed_w+work;
311
   uint z=seed_z-work;
312
   uint w=seed_w-work;
313
#endif
314

315
   ulong total=0;
316

317
   for (ulong i=0;i<iterations;i++) {
318

319
#if TYPE == TINT32
320
    #define THEONE 1073741824
321
    #if TRNG == TCONG
322
        uint x=CONG>>17 ;
323
        uint y=CONG>>17 ;
324
    #elif TRNG == TSHR3
325
        uint x=SHR3>>17 ;
326
        uint y=SHR3>>17 ;
327
    #elif TRNG == TMWC
328
        uint x=MWC>>17 ;
329
        uint y=MWC>>17 ;
330
    #elif TRNG == TKISS
331
        uint x=KISS>>17 ;
332
        uint y=KISS>>17 ;
333
    #endif
334
#elif TYPE == TINT64
335
    #define THEONE 4611686018427387904
336
    #if TRNG == TCONG
337
        ulong x=(ulong)(CONG>>1) ;
338
        ulong y=(ulong)(CONG>>1) ;
339
    #elif TRNG == TSHR3
340
        ulong x=(ulong)(SHR3>>1) ;
341
        ulong y=(ulong)(SHR3>>1) ;
342
    #elif TRNG == TMWC
343
        ulong x=(ulong)(MWC>>1) ;
344
        ulong y=(ulong)(MWC>>1) ;
345
    #elif TRNG == TKISS
346
        ulong x=(ulong)(KISS>>1) ;
347
        ulong y=(ulong)(KISS>>1) ;
348
    #endif
349
#elif TYPE == TFP32
350
    #define THEONE 1.0f
351
    #if TRNG == TCONG
352
        float x=CONGfp ;
353
        float y=CONGfp ;
354
    #elif TRNG == TSHR3
355
        float x=SHR3fp ;
356
        float y=SHR3fp ;
357
    #elif TRNG == TMWC
358
        float x=MWCfp ;
359
        float y=MWCfp ;
360
    #elif TRNG == TKISS
361
      float x=KISSfp ;
362
      float y=KISSfp ;
363
    #endif
364
#elif TYPE == TFP64
365
#pragma OPENCL EXTENSION cl_khr_fp64: enable
366
    #define THEONE 1.0f
367
    #if TRNG == TCONG
368
        double x=(double)CONGfp ;
369
        double y=(double)CONGfp ;
370
    #elif TRNG == TSHR3
371
        double x=(double)SHR3fp ;
372
        double y=(double)SHR3fp ;
373
    #elif TRNG == TMWC
374
        double x=(double)MWCfp ;
375
        double y=(double)MWCfp ;
376
    #elif TRNG == TKISS
377
        double x=(double)KISSfp ;
378
        double y=(double)KISSfp ;
379
    #endif
380
#endif
381

382
#if TEST == IFTHEN
383
      if ((x*x+y*y) <= THEONE) {
384
         total+=1;
385
      }
386
#else
387
      ulong inside=((x*x+y*y) <= THEONE) ? 1:0;
388
      total+=inside;
389
#endif
390
   }
391
   
392
   return(total);
393
}
394

395
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
396
{
397
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
398
   barrier(CLK_GLOBAL_MEM_FENCE);
399
   s[get_global_id(0)]=total;     
400
}
401

402
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
403
{
404
   ulong total=MainLoop(iterations,seed_z,seed_w,get_local_id(0));
405
   barrier(CLK_LOCAL_MEM_FENCE);
406
   s[get_local_id(0)]=total;
407
}
408

409
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
410
{
411
   ulong total=MainLoop(iterations,seed_z,seed_w,get_global_id(0));
412
   barrier(CLK_GLOBAL_MEM_FENCE || CLK_LOCAL_MEM_FENCE);
413
   s[get_global_id(0)]=total;
414
}
415

416
"""
417
    return(KERNEL_CODE_OPENCL)
418
    
419
def MetropolisCuda(InputCU):
420

    
421
    print("Inside ",InputCU)
422
    
423
    iterations=InputCU['Iterations']
424
    steps=InputCU['Steps']
425
    blocks=InputCU['Blocks']
426
    threads=InputCU['Threads']
427
    Device=InputCU['Device']
428
    RNG=InputCU['RNG']
429
    ValueType=InputCU['ValueType']
430
    TestType=InputCU['IfThen']
431
    
432
    Marsaglia,Computing,Test=DictionariesAPI()
433
    
434
    try:
435
        # For PyCUDA import
436
        import pycuda.driver as cuda
437
        from pycuda.compiler import SourceModule
438
        
439
        cuda.init()
440
        for Id in range(cuda.Device.count()):
441
            if Id==Device:
442
                XPU=cuda.Device(Id)
443
                print("GPU selected %s" % XPU.name())
444
        print
445

    
446
    except ImportError:
447
        print("Platform does not seem to support CUDA")
448

    
449
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)  
450
    circleCU = cuda.InOut(circle)
451
    #circleCU = cuda.mem_alloc(circle.size*circle.dtype.itemize)
452
    #cuda.memcpy_htod(circleCU, circle)
453

    
454
    Context=XPU.make_context()
455

    
456
    try:
457
        #mod = SourceModule(KernelCodeCuda(),options=['--compiler-options','-DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
458
        #mod = SourceModule(KernelCodeCuda(),nvcc='nvcc',keep=True)
459
        # Needed to set the compiler via ccbin for CUDA9 implementation
460
        mod = SourceModule(KernelCodeCuda(),options=['-ccbin','clang-3.8','--compiler-options','-DTRNG=%i' % Marsaglia[RNG],'-DTYPE=%s' % Computing[ValueType],'-DTEST=%s' % Test[TestType]],keep=True)
461
    except:
462
        print("Compilation seems to break")
463
 
464
    MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
465
    MetropolisThreadsCU=mod.get_function("MainLoopThreads")
466
    MetropolisHybridCU=mod.get_function("MainLoopHybrid")
467

    
468
    MyDuration=numpy.zeros(steps)
469

    
470
    jobs=blocks*threads;
471

    
472
    iterationsCU=numpy.uint64(iterations/jobs)
473
    if iterations%jobs!=0:
474
        iterationsCU+=numpy.uint64(1)
475

    
476
    for i in range(steps):
477
        start_time=time.time()
478

    
479
        try:
480
            MetropolisHybridCU(circleCU,
481
                               numpy.uint64(iterationsCU),
482
                               numpy.uint32(nprnd(2**32)),
483
                               numpy.uint32(nprnd(2**32)),
484
                               grid=(blocks,1),block=(threads,1,1))
485
        except:
486
            print("Crash during CUDA call")
487

    
488
        elapsed = time.time()-start_time
489
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
490

    
491
        MyDuration[i]=elapsed
492

    
493
    OutputCU={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCU*jobs),'Duration':MyDuration}
494
    print(OutputCU)
495
    Context.pop()
496
    
497
    Context.detach()
498
    return(OutputCU)
499

    
500
def MetropolisOpenCL(InputCL):
501

    
502
    import pyopencl as cl
503

    
504
    print("Inside ",InputCL)
505
    
506
    iterations=InputCL['Iterations']
507
    steps=InputCL['Steps']
508
    blocks=InputCL['Blocks']
509
    threads=InputCL['Threads']
510
    Device=InputCL['Device']
511
    RNG=InputCL['RNG']  
512
    ValueType=InputCL['ValueType']
513
    TestType=InputCL['IfThen']
514

    
515
    Marsaglia,Computing,Test=DictionariesAPI()
516
    
517
    # Initialisation des variables en les CASTant correctement
518
    Id=0
519
    HasXPU=False
520
    for platform in cl.get_platforms():
521
        for device in platform.get_devices():
522
            if Id==Device:
523
                XPU=device
524
                print("CPU/GPU selected: ",device.name.lstrip())
525
                HasXPU=True
526
            Id+=1
527

    
528
    if HasXPU==False:
529
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
530
        sys.exit()              
531

    
532
    # Je cree le contexte et la queue pour son execution
533
    try:
534
        ctx = cl.Context([XPU])
535
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
536
    except:
537
        print("Crash during context creation")
538

    
539
    # Je recupere les flag possibles pour les buffers
540
    mf = cl.mem_flags
541
        
542
    circle=numpy.zeros(blocks*threads).astype(numpy.uint64)
543
    circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
544
    
545
    MetropolisCL = cl.Program(ctx,KernelCodeOpenCL()).build( options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s -DTEST=%s" % (Marsaglia[RNG],Computing[ValueType],Test[TestType]))
546

    
547
    MyDuration=numpy.zeros(steps)
548
  
549
    jobs=blocks*threads;
550

    
551
    iterationsCL=numpy.uint64(iterations/jobs)
552
    if iterations%jobs!=0:
553
        iterationsCL+=1
554

    
555
    for i in range(steps):
556
        start_time=time.time()
557
        if threads == 1:
558
            CLLaunch=MetropolisCL.MainLoopGlobal(queue,(blocks,),None,
559
                                                 circleCL,
560
                                                 numpy.uint64(iterationsCL),
561
                                                 numpy.uint32(nprnd(2**32)),
562
                                                 numpy.uint32(nprnd(2**32)))
563
        else:
564
            CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
565
                                                 circleCL,
566
                                                 numpy.uint64(iterationsCL),
567
                                                 numpy.uint32(nprnd(2**32)),
568
                                                 numpy.uint32(nprnd(2**32)))
569

    
570
        CLLaunch.wait()
571
        cl.enqueue_copy(queue, circle, circleCL).wait()
572

    
573
        elapsed = time.time()-start_time
574
        print("(Blocks/Threads)=(%i,%i) method done in %.2f s..." % (blocks,threads,elapsed))
575
            
576
        # Elapsed method based on CLLaunch doesn't work for Beignet OpenCL
577
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
578

    
579
        # print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
580
        MyDuration[i]=elapsed
581
        # AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
582
        # MyPi[i]=numpy.median(AllPi)
583
        # print MyPi[i],numpy.std(AllPi),MyDuration[i]
584

    
585
    circleCL.release()
586

    
587
    OutputCL={'Inside':sum(circle),'NewIterations':numpy.uint64(iterationsCL*jobs),'Duration':MyDuration}
588
    print(OutputCL)
589
    return(OutputCL)
590

    
591

    
592
def FitAndPrint(N,D,Curves):
593

    
594
    from scipy.optimize import curve_fit
595
    import matplotlib.pyplot as plt
596

    
597
    try:
598
        coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
599

    
600
        D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
601
        coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
602
        coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
603
        coeffs_Amdahl[0]=D[0]
604
        print("Amdahl Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2]))
605
    except:
606
        print("Impossible to fit for Amdahl law : only %i elements" % len(D))
607

    
608
    try:
609
        coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
610

    
611
        D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
612
        coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
613
        coeffs_AmdahlR[0]=D[0]
614
        print("Amdahl Reduced Normalized: T=%.2f(%.6f+%.6f/N)" % (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1]))
615

    
616
    except:
617
        print("Impossible to fit for Reduced Amdahl law : only %i elements" % len(D))
618

    
619
    try:
620
        coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
621

    
622
        coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
623
        # coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
624
        coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
625
        coeffs_Mylq[0]=D[0]
626
        print("Mylq Normalized : T=%.2f(%.6f+%.6f/N)+%.6f*N" % (coeffs_Mylq[0],
627
                                                                coeffs_Mylq[1],
628
                                                                coeffs_Mylq[3],
629
                                                                coeffs_Mylq[2]))
630
        D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
631
                    coeffs_Mylq[3])
632
    except:
633
        print("Impossible to fit for Mylq law : only %i elements" % len(D))
634

    
635
    try:
636
        coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
637

    
638
        coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
639
        # coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
640
        # coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
641
        coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
642
        coeffs_Mylq2[0]=D[0]
643
        print("Mylq 2nd order Normalized: T=%.2f(%.6f+%.6f/N)+%.6f*N+%.6f*N^2" % (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[4],coeffs_Mylq2[2],coeffs_Mylq2[3]))
644

    
645
    except:
646
        print("Impossible to fit for 2nd order Mylq law : only %i elements" % len(D))
647

    
648
    if Curves:
649
        plt.xlabel("Number of Threads/work Items")
650
        plt.ylabel("Total Elapsed Time")
651

    
652
        Experience,=plt.plot(N,D,'ro') 
653
    try:
654
        pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
655
        pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
656
    except:
657
        print("Fit curves seem not to be available")
658

    
659
    plt.legend()
660
    plt.show()
661

    
662
if __name__=='__main__':
663
    
664
    # Set defaults values
665
  
666
    # Id of Device : 1 is for first find !
667
    Device=1
668
    # GPU style can be Cuda (Nvidia implementation) or OpenCL
669
    GpuStyle='OpenCL'
670
    # Iterations is integer
671
    Iterations=10000000
672
    # BlocksBlocks in first number of Blocks to explore
673
    BlocksBegin=1
674
    # BlocksEnd is last number of Blocks to explore
675
    BlocksEnd=1
676
    # BlocksStep is the step of Blocks to explore
677
    BlocksStep=1
678
    # ThreadsBlocks in first number of Blocks to explore
679
    ThreadsBegin=1
680
    # ThreadsEnd is last number of Blocks to explore
681
    ThreadsEnd=1
682
    # ThreadsStep is the step of Blocks to explore
683
    ThreadsStep=1
684
    # Redo is the times to redo the test to improve metrology
685
    Redo=1
686
    # OutMetrology is method for duration estimation : False is GPU inside
687
    OutMetrology=False
688
    Metrology='InMetro'
689
    # Curves is True to print the curves
690
    Curves=False
691
    # Fit is True to print the curves
692
    Fit=False
693
    # Inside based on If
694
    IfThen=False
695
    # Marsaglia RNG
696
    RNG='MWC'
697
    # Value type : INT32, INT64, FP32, FP64
698
    ValueType='FP32'
699

    
700
    HowToUse='%s -o (Out of Core Metrology) -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
701
    
702
    try:
703
        opts, args = getopt.getopt(sys.argv[1:],"hockg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
704
    except getopt.GetoptError:
705
        print(HowToUse % sys.argv[0])
706
        sys.exit(2)
707

    
708
    # List of Devices
709
    Devices=[]
710
    Alu={}
711
        
712
    for opt, arg in opts:
713
        if opt == '-h':
714
            print(HowToUse % sys.argv[0])
715

    
716
            print("\nInformations about devices detected under OpenCL API:")
717
            # For PyOpenCL import
718
            try:
719
                import pyopencl as cl
720
                Id=0
721
                for platform in cl.get_platforms():
722
                    for device in platform.get_devices():
723
                        #deviceType=cl.device_type.to_string(device.type)
724
                        deviceType="xPU"
725
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
726
                        Id=Id+1
727

    
728
            except:
729
                print("Your platform does not seem to support OpenCL")
730

    
731
            print("\nInformations about devices detected under CUDA API:")
732
            # For PyCUDA import
733
            try:
734
                import pycuda.driver as cuda
735
                cuda.init()
736
                for Id in range(cuda.Device.count()):
737
                    device=cuda.Device(Id)
738
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
739
                print
740
            except:
741
                print("Your platform does not seem to support CUDA")
742
        
743
            sys.exit()
744
        
745
        
746
        elif opt == '-o':
747
            OutMetrology=True
748
            Metrology='OutMetro'
749
        elif opt == '-c':
750
            Curves=True
751
        elif opt == '-k':
752
            IfThen=True
753
        elif opt in ("-d", "--device"):
754
            Devices.append(int(arg))
755
        elif opt in ("-g", "--gpustyle"):
756
            GpuStyle = arg
757
        elif opt in ("-m", "--marsaglia"):
758
            RNG = arg
759
        elif opt in ("-v", "--valuetype"):
760
            ValueType = arg
761
        elif opt in ("-i", "--iterations"):
762
            Iterations = numpy.uint64(arg)
763
        elif opt in ("-b", "--blocksbegin"):
764
            BlocksBegin = int(arg)
765
        elif opt in ("-e", "--blocksend"):
766
            BlocksEnd = int(arg)
767
        elif opt in ("-s", "--blocksstep"):
768
            BlocksStep = int(arg)
769
        elif opt in ("-f", "--threadsfirst"):
770
            ThreadsBegin = int(arg)
771
        elif opt in ("-l", "--threadslast"):
772
            ThreadsEnd = int(arg)
773
        elif opt in ("-t", "--threadsstep"):
774
            ThreadsStep = int(arg)
775
        elif opt in ("-r", "--redo"):
776
            Redo = int(arg)
777

    
778
    print("Devices Identification : %s" % Devices)
779
    print("GpuStyle used : %s" % GpuStyle)
780
    print("Iterations : %s" % Iterations)
781
    print("Number of Blocks on begin : %s" % BlocksBegin)
782
    print("Number of Blocks on end : %s" % BlocksEnd)
783
    print("Step on Blocks : %s" % BlocksStep)
784
    print("Number of Threads on begin : %s" % ThreadsBegin)
785
    print("Number of Threads on end : %s" % ThreadsEnd)
786
    print("Step on Threads : %s" % ThreadsStep)
787
    print("Number of redo : %s" % Redo)
788
    print("Metrology done out of XPU : %r" % OutMetrology)
789
    print("Type of Marsaglia RNG used : %s" % RNG)
790
    print("Type of variable : %s" % ValueType)
791

    
792
    if GpuStyle=='CUDA':
793
        try:
794
            # For PyCUDA import
795
            import pycuda.driver as cuda
796
            
797
            cuda.init()
798
            for Id in range(cuda.Device.count()):
799
                device=cuda.Device(Id)
800
                print("Device #%i of type GPU : %s" % (Id,device.name()))
801
                if Id in Devices:
802
                    Alu[Id]='GPU'
803
            
804
        except ImportError:
805
            print("Platform does not seem to support CUDA")
806

    
807
    if GpuStyle=='OpenCL':
808
        try:
809
            # For PyOpenCL import
810
            import pyopencl as cl
811
            Id=0
812
            for platform in cl.get_platforms():
813
                for device in platform.get_devices():
814
                    #deviceType=cl.device_type.to_string(device.type)
815
                    deviceType="xPU"
816
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
817

    
818
                    if Id in Devices:
819
                    # Set the Alu as detected Device Type
820
                        Alu[Id]=deviceType
821
                    Id=Id+1
822
        except ImportError:
823
            print("Platform does not seem to support OpenCL")
824

    
825
    print(Devices,Alu)
826
            
827
    BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
828
    ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
829
    
830
    ExploredJobs=numpy.array([]).astype(numpy.uint32)
831
    ExploredBlocks=numpy.array([]).astype(numpy.uint32)
832
    ExploredThreads=numpy.array([]).astype(numpy.uint32)
833
    avgD=numpy.array([]).astype(numpy.float32)
834
    medD=numpy.array([]).astype(numpy.float32)
835
    stdD=numpy.array([]).astype(numpy.float32)
836
    minD=numpy.array([]).astype(numpy.float32)
837
    maxD=numpy.array([]).astype(numpy.float32)
838
    avgR=numpy.array([]).astype(numpy.float32)
839
    medR=numpy.array([]).astype(numpy.float32)
840
    stdR=numpy.array([]).astype(numpy.float32)
841
    minR=numpy.array([]).astype(numpy.float32)
842
    maxR=numpy.array([]).astype(numpy.float32)
843

    
844
    for Blocks,Threads in itertools.product(BlocksList,ThreadsList):
845
        
846
        # print Blocks,Threads
847
        circle=numpy.zeros(Blocks*Threads).astype(numpy.uint64)
848
        ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
849
        ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
850
        ExploredThreads=numpy.append(ExploredThreads,Threads)
851
        
852
        if OutMetrology: 
853
            DurationItem=numpy.array([]).astype(numpy.float32)
854
            Duration=numpy.array([]).astype(numpy.float32)
855
            Rate=numpy.array([]).astype(numpy.float32)
856
            for i in range(Redo):
857
                start=time.time()
858
                if GpuStyle=='CUDA':
859
                    try:
860
                        InputCU={}
861
                        InputCU['Iterations']=Iterations
862
                        InputCU['Steps']=1
863
                        InputCU['Blocks']=Blocks
864
                        InputCU['Threads']=Threads
865
                        InputCU['Device']=Devices[0]
866
                        InputCU['RNG']=RNG
867
                        InputCU['ValueType']=ValueType                    
868
                        InputCU['IfThen']=IfThen                    
869
                        OutputCU=MetropolisCuda(InputCU)
870
                        Inside=OutputCU['Circle']
871
                        NewIterations=OutputCU['NewIterations']
872
                        Duration=OutputCU['Duration']
873
                    except:
874
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
875
                elif GpuStyle=='OpenCL':
876
                    try:
877
                        InputCL={}
878
                        InputCL['Iterations']=Iterations
879
                        InputCL['Steps']=1
880
                        InputCL['Blocks']=Blocks
881
                        InputCL['Threads']=Threads
882
                        InputCL['Device']=Devices[0]
883
                        InputCL['RNG']=RNG
884
                        InputCL['ValueType']=ValueType
885
                        InputCL['IfThen']=IfThen                    
886
                        OutputCL=MetropolisOpenCL(InputCL)
887
                        Inside=OutputCL['Circle']
888
                        NewIterations=OutputCL['NewIterations']
889
                        Duration=OutputCL['Duration']
890
                    except:
891
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
892
                Duration=numpy.append(Duration,time.time()-start)
893
                Rate=numpy.append(Rate,NewIterations/Duration[-1])
894
        else:
895
            if GpuStyle=='CUDA':
896
                try:
897
                    InputCU={}
898
                    InputCU['Iterations']=Iterations
899
                    InputCU['Steps']=Redo
900
                    InputCU['Blocks']=Blocks
901
                    InputCU['Threads']=Threads
902
                    InputCU['Device']=Devices[0]
903
                    InputCU['RNG']=RNG
904
                    InputCU['ValueType']=ValueType
905
                    InputCU['IfThen']=IfThen                    
906
                    OutputCU=MetropolisCuda(InputCU)
907
                    Inside=OutputCU['Inside']
908
                    NewIterations=OutputCU['NewIterations']
909
                    Duration=OutputCU['Duration']
910
                except:
911
                    print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
912
                try:
913
                    pycuda.context.pop()
914
                except:
915
                    pass
916
            elif GpuStyle=='OpenCL':
917
                try:
918
                    InputCL={}
919
                    InputCL['Iterations']=Iterations
920
                    InputCL['Steps']=Redo
921
                    InputCL['Blocks']=Blocks
922
                    InputCL['Threads']=Threads
923
                    InputCL['Device']=Devices[0]
924
                    InputCL['RNG']=RNG
925
                    InputCL['ValueType']=ValueType
926
                    InputCL['IfThen']=IfThen
927
                    OutputCL=MetropolisOpenCL(InputCL)
928
                    Inside=OutputCL['Inside']
929
                    NewIterations=OutputCL['NewIterations']
930
                    Duration=OutputCL['Duration']
931
                except:
932
                    print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
933
            Rate=NewIterations/Duration
934
            print("Pi estimation %.8f" % (4./NewIterations*Inside))
935

    
936
            
937
        avgD=numpy.append(avgD,numpy.average(Duration))
938
        medD=numpy.append(medD,numpy.median(Duration))
939
        stdD=numpy.append(stdD,numpy.std(Duration))
940
        minD=numpy.append(minD,numpy.min(Duration))
941
        maxD=numpy.append(maxD,numpy.max(Duration))
942
        avgR=numpy.append(avgR,numpy.average(Rate))
943
        medR=numpy.append(medR,numpy.median(Rate))
944
        stdR=numpy.append(stdR,numpy.std(Rate))
945
        minR=numpy.append(minR,numpy.min(Rate))
946
        maxR=numpy.append(maxR,numpy.max(Rate))
947

    
948
        print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))
949
        
950
        numpy.savez("Pi_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
951
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
952
        numpy.savetxt("Pi_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
953

    
954
    if Fit:
955
        FitAndPrint(ExploredJobs,median,Curves)