Statistiques
| Révision :

root / Pi / GPU / Pi-GPU.py-20130205 @ 48

Historique | Voir | Annoter | Télécharger (15,79 ko)

1
#!/usr/bin/env python
2
#
3
# Pi-by-MC using PyCUDA
4
#
5
# CC BY-NC-SA 2011 : <emmanuel.quemener@ens-lyon.fr> 
6
#
7
# Thanks to Andreas Klockner for PyCUDA:
8
# http://mathema.tician.de/software/pycuda
9
# 
10

    
11
# 2013-01-01 : problems with launch timeout
12
# nvidia-smi -c 1 : ko
13
# nvidia-smi -c 3 : ko
14
# nvidia-smi --gom=1 : not supported
15
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
16
# Option "Interactive" "0" in /etc/X11/xorg.conf
17

    
18
# Common tools
19
import numpy
20
from numpy.random import randint as nprnd
21
import sys
22
import getopt
23
import time
24
import matplotlib.pyplot as plt
25
from scipy.optimize import curve_fit
26
from socket import gethostname
27

    
28
# Predicted Amdahl Law (Reduced with s=1-p)  
29
def AmdahlR(N, T1, p):
30
  return (T1*(1-p+p/N))
31

    
32
# Predicted Amdahl Law
33
def Amdahl(N, T1, s, p):
34
  return (T1*(s+p/N))
35

    
36
# Predicted Mylq Law with first order
37
def Mylq(N, T1,s,c,p):
38
  return (T1*(s+c*N+p/N))
39

    
40
# Predicted Mylq Law with second order
41
def Mylq2(N, T1,s,c1,c2,p):
42
  return (T1*(s+c1*N+c2*N*N+p/N))
43

    
44
KERNEL_CODE_CUDA="""
45

    
46
// Marsaglia RNG very simple implementation
47

    
48
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
49
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
50
#define MWC   (znew+wnew)
51
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
52
#define CONG  (jcong=69069*jcong+1234567)
53
#define KISS  ((MWC^CONG)+SHR3)
54

    
55
//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
56
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
57
#define MWCfp MWC * 2.328306435454494e-10f
58
#define KISSfp KISS * 2.328306435454494e-10f
59

    
60
__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
61
{
62
   uint z=seed_z/(blockIdx.x+1);
63
   uint w=seed_w/(blockIdx.x+1);
64
   // uint jsr=123456789;
65
   // uint jcong=380116160;
66

    
67
   int total=0;
68

    
69
   for (uint i=0;i<iterations;i++) {
70

    
71
      float x=MWCfp ;
72
      float y=MWCfp ;
73

    
74
      // Matching test
75
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
76
      total+=inside;
77

    
78
   }
79

    
80
   s[blockIdx.x]=total;
81
   __syncthreads();
82

    
83
}
84

    
85
__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
86
{
87
   uint z=seed_z/(threadIdx.x+1);
88
   uint w=seed_w/(threadIdx.x+1);
89
   // uint jsr=123456789;
90
   // uint jcong=380116160;
91

    
92
   int total=0;
93

    
94
   for (uint i=0;i<iterations;i++) {
95

    
96
      float x=MWCfp ;
97
      float y=MWCfp ;
98

    
99
      // Matching test
100
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
101
      total+=inside;
102

    
103
   }
104

    
105
   s[threadIdx.x]=total;
106
   __syncthreads();
107

    
108
}
109
"""
110

    
111
KERNEL_CODE_OPENCL="""
112

    
113
// Marsaglia RNG very simple implementation
114
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
115
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
116
#define MWC   (znew+wnew)
117
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
118
#define CONG  (jcong=69069*jcong+1234567)
119
#define KISS  ((MWC^CONG)+SHR3)
120

    
121
//#define MWCfp (MWC + 2147483648.0f) * 2.328306435454494e-10f
122
//#define KISSfp (KISS + 2147483648.0f) * 2.328306435454494e-10f
123
#define MWCfp MWC * 2.328306435454494e-10f
124
#define KISSfp KISS * 2.328306435454494e-10f
125

    
126
__kernel void MainLoopGlobal(__global int *s,uint iterations,uint seed_w,uint seed_z)
127
{
128
   uint z=seed_z/(get_global_id(0)+1);
129
   uint w=seed_w/(get_global_id(0)+1);
130
   // uint jsr=123456789;
131
   // uint jcong=380116160;
132

    
133
   int total=0;
134

    
135
   for (uint i=0;i<iterations;i++) {
136

    
137
      float x=MWCfp ;
138
      float y=MWCfp ;
139

    
140
      // Matching test
141
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
142
      total+=inside;
143
   }
144
   s[get_global_id(0)]=total;
145
   barrier(CLK_GLOBAL_MEM_FENCE);
146
      
147
}
148

    
149
__kernel void MainLoopLocal(__global int *s,uint iterations,uint seed_w,uint seed_z)
150
{
151
   uint z=seed_z/(get_local_id(0)+1);
152
   uint w=seed_w/(get_local_id(0)+1);
153
   // uint jsr=123456789;
154
   // uint jcong=380116160;
155

    
156
   int total=0;
157

    
158
   for (uint i=0;i<iterations;i++) {
159

    
160
      float x=MWCfp ;
161
      float y=MWCfp ;
162

    
163
      // Matching test
164
      int inside=((x*x+y*y) < 1.0f) ? 1:0;
165
      total+=inside;
166
   }
167
   s[get_local_id(0)]=total;
168
   barrier(CLK_LOCAL_MEM_FENCE);
169
      
170
}
171
"""
172

    
173
def MetropolisCuda(circle,iterations,steps,threads,ParaStyle):
174

    
175
  # Avec PyCUDA autoinit, rien a faire !
176
  
177
  circleCU = cuda.InOut(circle)
178
  
179
  mod = SourceModule(KERNEL_CODE_CUDA)
180

    
181
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
182
  MetropolisThreadsCU=mod.get_function("MainLoopThreads")
183
  
184
  start = pycuda.driver.Event()
185
  stop = pycuda.driver.Event()
186
  
187
  MyPi=numpy.zeros(steps)
188
  MyDuration=numpy.zeros(steps)
189
  
190
  if iterations%threads==0:
191
    iterationsCL=numpy.uint32(iterations/threads+1)
192
    iterationsNew=iterationsCL*threads
193
  else:
194
    iterationsCL=numpy.uint32(iterations/threads)
195
    iterationsNew=iterations
196

    
197
  for i in range(steps):
198
    start.record()
199
    start.synchronize()
200
    if ParaStyle=='Blocks':
201
      MetropolisBlocksCU(circleCU,
202
                         numpy.uint32(iterationsCL),
203
                         numpy.uint32(nprnd(2**32/threads)),
204
                         numpy.uint32(nprnd(2**32/threads)),
205
                         grid=(threads,1),
206
                         block=(1,1,1))
207
    else:
208
      MetropolisThreadsCU(circleCU,
209
                          numpy.uint32(iterationsCL),
210
                          numpy.uint32(nprnd(2**32/threads)),
211
                          numpy.uint32(nprnd(2**32/threads)),
212
                          grid=(1,1),
213
                          block=(threads,1,1))
214
    print "GPU done"
215
    stop.record()
216
    stop.synchronize()
217
                
218
    #elapsed = stop.time_since(start)*1e-3
219
    elapsed = start.time_till(stop)*1e-3
220

    
221
    #print circle,float(numpy.sum(circle))
222
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsCL)
223
    MyDuration[i]=elapsed
224
    #print MyPi[i],MyDuration[i]
225
    #time.sleep(1)
226

    
227
    print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
228

    
229
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
230

    
231

    
232
def MetropolisOpenCL(circle,iterations,steps,threads,ParaStyle,Alu):
233
	
234
  # Initialisation des variables en les CASTant correctement
235
    
236
  # Je detecte un peripherique GPU dans la liste des peripheriques
237
  # for platform in cl.get_platforms():
238
  # 	for device in platform.get_devices():
239
  # 		if cl.device_type.to_string(device.type)=='GPU':
240
  # 			GPU=device
241
  #print "GPU detected: ",device.name
242

    
243
  HasGPU=False
244
  # Device selection based on choice (default is GPU)
245
  for platform in cl.get_platforms():
246
    for device in platform.get_devices():
247
      if not HasGPU:
248
        deviceType=cl.device_type.to_string(device.type)
249
        if deviceType=="GPU" and Alu=="GPU":
250
          GPU=device
251
          print "GPU selected: ",device.name
252
          HasGPU=True
253
        if deviceType=="CPU" and Alu=="CPU":	    
254
          GPU=device
255
          print "CPU selected: ",device.name
256
          HasGPU=True	
257
				
258
  # Je cree le contexte et la queue pour son execution
259
  #ctx = cl.create_some_context()
260
  ctx = cl.Context([GPU])
261
  queue = cl.CommandQueue(ctx,
262
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
263

    
264
  # Je recupere les flag possibles pour les buffers
265
  mf = cl.mem_flags
266
	
267
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)
268

    
269
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
270
    options = "-cl-mad-enable -cl-fast-relaxed-math")
271

    
272
  #MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build()
273

    
274
  i=0
275

    
276
  MyPi=numpy.zeros(steps)
277
  MyDuration=numpy.zeros(steps)
278
  
279
  if iterations%threads==0:
280
    iterationsCL=numpy.uint32(iterations/threads+1)
281
    iterationsNew=iterationsCL*threads
282
  else:
283
    iterationsCL=numpy.uint32(iterations/threads)
284
    iterationsNew=iterations
285

    
286
  for i in range(steps):
287
		
288
    if ParaStyle=='Blocks':
289
      # Call OpenCL kernel
290
      # (1,) is Global work size (only 1 work size)
291
      # (1,) is local work size
292
      # circleCL is lattice translated in CL format
293
      # SeedZCL is lattice translated in CL format
294
      # SeedWCL is lattice translated in CL format
295
      # step is number of iterations
296
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(threads,),(1,),
297
                                           circleCL,
298
                                           numpy.uint32(iterationsCL),
299
                                           numpy.uint32(nprnd(2**32/threads)),
300
                                           numpy.uint32(nprnd(2**32/threads)))
301
      print "%s with %s done" % (Alu,ParaStyle)
302
    else:
303
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
304
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(threads,),(threads,),
305
                                          circleCL,
306
                                          numpy.uint32(iterationsCL),
307
                                          numpy.uint32(nprnd(2**32/threads)),
308
                                          numpy.uint32(nprnd(2**32/threads)))
309
      print "%s with %s done" % (Alu,ParaStyle)
310

    
311
    CLLaunch.wait()
312
    cl.enqueue_copy(queue, circle, circleCL).wait()
313

    
314
    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
315

    
316
    #print circle,float(numpy.sum(circle))
317
    MyPi[i]=4.*float(numpy.sum(circle))/float(iterationsNew)
318
    MyDuration[i]=elapsed
319
    #print MyPi[i],MyDuration[i]
320

    
321
  circleCL.release()
322

    
323
  #print threads,numpy.mean(MyPi),numpy.median(MyPi),numpy.std(MyPi)
324
  print threads,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
325
	
326
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
327

    
328

    
329
def FitAndPrint(N,D,Curves):
330

    
331
  try:
332
    coeffs_Amdahl, matcov_Amdahl = curve_fit(Amdahl, N, D)
333

    
334
    D_Amdahl=Amdahl(N,coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
335
    coeffs_Amdahl[1]=coeffs_Amdahl[1]*coeffs_Amdahl[0]/D[0]
336
    coeffs_Amdahl[2]=coeffs_Amdahl[2]*coeffs_Amdahl[0]/D[0]
337
    coeffs_Amdahl[0]=D[0]
338
    print "Amdahl Normalized: T=%.2f(%.5f+%.5f/N)" % \
339
        (coeffs_Amdahl[0],coeffs_Amdahl[1],coeffs_Amdahl[2])
340
  except:
341
    print "Impossible to fit for Amdahl law : only %i elements" % len(D) 
342

    
343
  try:
344
    coeffs_AmdahlR, matcov_AmdahlR = curve_fit(AmdahlR, N, D)
345

    
346
    D_AmdahlR=AmdahlR(N,coeffs_AmdahlR[0],coeffs_AmdahlR[1])
347
    coeffs_AmdahlR[1]=coeffs_AmdahlR[1]*coeffs_AmdahlR[0]/D[0]
348
    coeffs_AmdahlR[0]=D[0]
349
    print "Amdahl Reduced Normalized: T=%.2f(%.5f+%.5f/N)" % \
350
        (coeffs_AmdahlR[0],1-coeffs_AmdahlR[1],coeffs_AmdahlR[1])
351

    
352
  except:
353
    print "Impossible to fit for Reduced Amdahl law : only %i elements" % len(D) 
354

    
355
  try:
356
    coeffs_Mylq, matcov_Mylq = curve_fit(Mylq, N, D)
357

    
358
    coeffs_Mylq[1]=coeffs_Mylq[1]*coeffs_Mylq[0]/D[0]
359
    coeffs_Mylq[2]=coeffs_Mylq[2]*coeffs_Mylq[0]/D[0]
360
    coeffs_Mylq[3]=coeffs_Mylq[3]*coeffs_Mylq[0]/D[0]
361
    coeffs_Mylq[0]=D[0]
362
    print "Mylq Normalized : T=%.2f(%.5f+%.5f*N+%.5f/N)" % (coeffs_Mylq[0],
363
                                                            coeffs_Mylq[1],
364
                                                            coeffs_Mylq[2],
365
                                                            coeffs_Mylq[3])
366
    D_Mylq=Mylq(N,coeffs_Mylq[0],coeffs_Mylq[1],coeffs_Mylq[2],
367
                coeffs_Mylq[3])
368
  except:
369
    print "Impossible to fit for Mylq law : only %i elements" % len(D) 
370

    
371
  try:
372
    coeffs_Mylq2, matcov_Mylq2 = curve_fit(Mylq2, N, D)
373

    
374
    coeffs_Mylq2[1]=coeffs_Mylq2[1]*coeffs_Mylq2[0]/D[0]
375
    coeffs_Mylq2[2]=coeffs_Mylq2[2]*coeffs_Mylq2[0]/D[0]
376
    coeffs_Mylq2[3]=coeffs_Mylq2[3]*coeffs_Mylq2[0]/D[0]
377
    coeffs_Mylq2[4]=coeffs_Mylq2[4]*coeffs_Mylq2[0]/D[0]
378
    coeffs_Mylq2[0]=D[0]
379
    print "Mylq 2nd order Normalized: T=%.2f(%.5f+%.5f*N+%.5f*N^2+%.5f/N)" % \
380
        (coeffs_Mylq2[0],coeffs_Mylq2[1],coeffs_Mylq2[2],coeffs_Mylq2[3],
381
         coeffs_Mylq2[4])
382

    
383
  except:
384
    print "Impossible to fit for 2nd order Mylq law : only %i elements" % len(D) 
385

    
386
  if Curves:
387
    plt.xlabel("Number of Threads/work Items")
388
    plt.ylabel("Total Elapsed Time")
389

    
390
    Experience,=plt.plot(N,D,'ro') 
391
    try:
392
      pAmdahl,=plt.plot(N,D_Amdahl,label="Loi de Amdahl")    
393
      pMylq,=plt.plot(N,D_Mylq,label="Loi de Mylq")
394
    except:
395
      print "Fit curves seem not to be available"
396

    
397
    plt.legend()
398
    plt.show()
399

    
400
if __name__=='__main__':
401

    
402
  # Set defaults values
403
  # Alu can be CPU or GPU
404
  Alu='CPU'
405
  # GPU style can be Cuda (Nvidia implementation) or OpenCL
406
  GpuStyle='OpenCL'
407
  # Parallel distribution can be on Threads or Blocks
408
  ParaStyle='Threads'
409
  # Iterations is integer
410
  Iterations=1000000000
411
  # ThreadStart in first number of Threads to explore
412
  ThreadStart=1
413
  # ThreadEnd is last number of Threads to explore
414
  ThreadEnd=512
415
  # Redo is the times to redo the test to improve metrology
416
  Redo=1
417
  # OutMetrology is method for duration estimation : False is GPU inside
418
  OutMetrology=False
419
  # Curves is True to print the curves
420
  Curves=False
421

    
422
  try:
423
    opts, args = getopt.getopt(sys.argv[1:],"hoca:g:p:i:s:e:r:",["alu=","gpustyle=","parastyle=","iterations=","threadstart=","threadend=","redo="])
424
  except getopt.GetoptError:
425
    print '%s -o -a <CPU/GPU> -g <CUDA/OpenCL> -p <ParaStyle> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
426
    sys.exit(2)
427
    
428
  for opt, arg in opts:
429
    if opt == '-h':
430
      print '%s -o (Out of Core Metrology) -c (Print Curves) -a <CPU/GPU> -g <CUDA/OpenCL> -p <Threads/Blocks> -i <Iterations> -s <ThreadStart> -e <ThreadEnd> -r <RedoToImproveStats>' % sys.argv[0]
431
      sys.exit()
432
    elif opt == '-o':
433
      OutMetrology=True
434
    elif opt == '-c':
435
      Curves=True
436
    elif opt in ("-a", "--alu"):
437
      Alu = arg
438
    elif opt in ("-g", "--gpustyle"):
439
      GpuStyle = arg
440
    elif opt in ("-p", "--parastyle"):
441
      ParaStyle = arg
442
    elif opt in ("-i", "--iterations"):
443
      Iterations = numpy.uint32(arg)
444
    elif opt in ("-s", "--threadstart"):
445
      ThreadStart = int(arg)
446
    elif opt in ("-e", "--threadend"):
447
      ThreadEnd = int(arg)
448
    elif opt in ("-r", "--redo"):
449
      Redo = int(arg)
450

    
451
  if Alu=='CPU' and GpuStyle=='CUDA':
452
    print "Alu can't be CPU for CUDA, set Alu to GPU"
453
    Alu='GPU'
454

    
455
  if ParaStyle not in ('Blocks','Threads'):
456
    print "%s not exists, ParaStyle set as Threads !" % ParaStyle
457
    ParaStyle='Threads'
458

    
459
  print "Compute unit : %s" % Alu
460
  print "GpuStyle used : %s" % GpuStyle
461
  print "Parallel Style used : %s" % ParaStyle
462
  print "Iterations : %s" % Iterations
463
  print "Number of threads on start : %s" % ThreadStart
464
  print "Number of threads on end : %s" % ThreadEnd
465
  print "Number of redo : %s" % Redo
466
  print "Metrology done out of CPU/GPU : %r" % OutMetrology
467

    
468
  if GpuStyle=='CUDA':
469
    # For PyCUDA import
470
    import pycuda.driver as cuda
471
    import pycuda.gpuarray as gpuarray
472
    import pycuda.autoinit
473
    from pycuda.compiler import SourceModule
474

    
475
  if GpuStyle=='OpenCL':
476
    # For PyOpenCL import
477
    import pyopencl as cl
478

    
479
  average=numpy.array([]).astype(numpy.float32)
480
  median=numpy.array([]).astype(numpy.float32)
481
  stddev=numpy.array([]).astype(numpy.float32)
482

    
483
  ExploredThreads=numpy.array([]).astype(numpy.uint32)
484

    
485
  Threads=ThreadStart
486

    
487
  while Threads <= ThreadEnd:
488
    ExploredThreads=numpy.append(ExploredThreads,Threads)
489
    circle=numpy.zeros(Threads).astype(numpy.uint32)
490

    
491
    if OutMetrology:
492
      duration=numpy.array([]).astype(numpy.float32)
493
      for i in range(Redo):
494
        start=time.time()
495
        if GpuStyle=='CUDA':
496
          MetropolisCuda(circle,Iterations,1,Threads,ParaStyle)
497
        elif GpuStyle=='OpenCL':
498
          MetropolisOpenCL(circle,Iterations,1,Threads,ParaStyle,Alu)
499
        duration=numpy.append(duration,time.time()-start)
500
      avg=numpy.mean(duration)
501
      med=numpy.median(duration)
502
      std=numpy.std(duration)
503
    else:
504
      if GpuStyle=='CUDA':
505
        avg,med,std=MetropolisCuda(circle,Iterations,Redo,Threads,ParaStyle)
506
      elif GpuStyle=='OpenCL':
507
        avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Threads,
508
                                     ParaStyle,Alu)
509
    print "avg,med,std",avg,med,std
510
    average=numpy.append(average,avg)
511
    median=numpy.append(median,med)
512
    stddev=numpy.append(stddev,std)
513
    #THREADS*=2
514
    Threads+=1
515

    
516
  numpy.savez("Pi_%s_%s_%s_%s_%i_%.8i_%s" % (Alu,GpuStyle,ParaStyle,ThreadStart,ThreadEnd,Iterations,gethostname()),(ExploredThreads,average,median,stddev))
517

    
518
  FitAndPrint(ExploredThreads,median,Curves)
519