Revision 17

Pi/GPU/Pi-GPU.py (revision 17)
 from scipy.optimize import curve_fit
 from socket import gethostname
 
+# Find the prime factors of a number
+# From the WWW:
+# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
+def PrimeFactors(x):
+  factorlist=numpy.array([]).astype('uint32')
+  loop=2
+  while loop<=x:
+    if x%loop==0:
+      x/=loop
+      factorlist=numpy.append(factorlist,[loop])
+    else:
+      loop+=1
+  return factorlist
+
+# Try to find the best thread number for the Hybrid approach (Blocks&Threads)
+# Output is the thread number
+def BestThreadsNumber(jobs):
+  factors=PrimeFactors(jobs)
+  matrix=numpy.append([factors],[factors[::-1]],axis=0)
+  threads=1
+  for factor in matrix.transpose().ravel():
+    threads=threads*factor
+    if threads*threads>jobs:
+      break
+  return(long(threads))
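A quick check of what the new helper returns (a sketch outside the file, in the script's own Python 2 style; the job counts are made-up):

    print BestThreadsNumber(1024)   # prints 64: the factor product whose square first exceeds 1024
    print BestThreadsNumber(120)    # prints 20: walks the interleaved factor list 2,5,2,3,...

The loop multiplies prime factors taken alternately from both ends of the factor list and stops just past sqrt(jobs), so the resulting block/thread split stays close to square.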
+
 # Predicted Amdahl Law (Reduced with s=1-p)
 def AmdahlR(N, T1, p):
   return (T1*(1-p+p/N))
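With the serial fraction fixed to s = 1 - p, the model this function fits is, in LaTeX notation:

    T(N) = T_1 \left( (1 - p) + \frac{p}{N} \right)

so the predicted speedup T_1/T(N) tends to 1/(1-p) as N grows.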
......
 #define MWCfp MWC * 2.328306435454494e-10f
 #define KISSfp KISS * 2.328306435454494e-10f
 
-__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
+__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
 {
    uint z=seed_z/(blockIdx.x+1);
    uint w=seed_w/(blockIdx.x+1);
 
-   int total=0;
+   ulong total=0;
 
-   for (uint i=0;i<iterations;i++) {
+   for (ulong i=0;i<iterations;i++) {
 
       float x=MWCfp ;
       float y=MWCfp ;
 
       // Matching test
-      int inside=((x*x+y*y) < 1.0f) ? 1:0;
+      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
       total+=inside;
 
    }
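This int-to-ulong widening is the point of the revision: a signed 32-bit total wraps at 2^31-1, and since about pi/4 (roughly 78.5%) of the random points land inside the quarter disc, the old counter would overflow after about 2.7e9 iterations per block. A back-of-envelope check in the host script's own Python 2 (math is imported elsewhere in the file):

    print (2**31-1)/(math.pi/4)   # ~2.73e9 iterations before a 32-bit signed counter wraps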
......
 
 }
 
-__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
+__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
 {
    uint z=seed_z/(threadIdx.x+1);
    uint w=seed_w/(threadIdx.x+1);
 
-   int total=0;
+   ulong total=0;
 
-   for (uint i=0;i<iterations;i++) {
+   for (ulong i=0;i<iterations;i++) {
 
       float x=MWCfp ;
       float y=MWCfp ;
 
       // Matching test
-      int inside=((x*x+y*y) < 1.0f) ? 1:0;
+      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
       total+=inside;
 
    }
......
 
 }
 
-__global__ void MainLoopHybrid(uint *s,uint iterations,uint seed_w,uint seed_z)
+__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
 {
    uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1);
    uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1);
 
-   int total=0;
+   ulong total=0;
 
-   for (uint i=0;i<iterations;i++) {
+   for (ulong i=0;i<iterations;i++) {
 
       float x=MWCfp ;
       float y=MWCfp ;
 
       // Matching test
-      int inside=((x*x+y*y) < 1.0f) ? 1:0;
+      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
       total+=inside;
 
    }
......
 #define MWCfp MWC * 2.328306435454494e-10f
 #define KISSfp KISS * 2.328306435454494e-10f
 
-__kernel void MainLoopGlobal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
+__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
 {
    uint z=seed_z/(get_global_id(0)+1);
    uint w=seed_w/(get_global_id(0)+1);
 
-   int total=0;
+   ulong total=0;
 
-   for (uint i=0;i<iterations;i++) {
+   for (ulong i=0;i<iterations;i++) {
 
       float x=MWCfp ;
       float y=MWCfp ;
 
       // Matching test
-      int inside=((x*x+y*y) < 1.0f) ? 1:0;
+      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
       total+=inside;
    }
    s[get_global_id(0)]=total;
......
 
 }
 
-__kernel void MainLoopLocal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
+__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
 {
    uint z=seed_z/(get_local_id(0)+1);
    uint w=seed_w/(get_local_id(0)+1);
 
-   int total=0;
+   ulong total=0;
 
-   for (uint i=0;i<iterations;i++) {
+   for (ulong i=0;i<iterations;i++) {
 
       float x=MWCfp ;
       float y=MWCfp ;
 
       // Matching test
-      int inside=((x*x+y*y) < 1.0f) ? 1:0;
+      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
       total+=inside;
    }
    s[get_local_id(0)]=total;
......
 
 }
 
-__kernel void MainLoopHybrid(__global uint *s,uint iterations,uint seed_w,uint seed_z)
+__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
 {
    uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
    uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1);
-
-   // uint jsr=123456789;
-   // uint jcong=380116160;
 
-   int total=0;
+   ulong total=0;
 
    for (uint i=0;i<iterations;i++) {
 
......
      float y=MWCfp ;
 
       // Matching test
-      int inside=((x*x+y*y) < 1.0f) ? 1:0;
+      ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
       total+=inside;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
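One line worth noting in this last kernel: the loop header for (uint i=0;i<iterations;i++) { is left as unchanged context, so unlike the kernels above its counter stays 32-bit even though iterations is now ulong; per-item iteration counts beyond 2^32-1 would still wrap here.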
......
   MyDuration=numpy.zeros(steps)
 
   if iterations%jobs==0:
-    iterationsCL=numpy.uint32(iterations/jobs)
+    iterationsCL=numpy.uint64(iterations/jobs)
     iterationsNew=iterationsCL*jobs
   else:
-    iterationsCL=numpy.uint32(iterations/jobs+1)
+    iterationsCL=numpy.uint64(iterations/jobs+1)
     iterationsNew=iterations
 
   for i in range(steps):
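This split gives each of the jobs work items an equal share of the iteration count, rounding up when jobs does not divide iterations so that no work is dropped. The same arithmetic in isolation (made-up values):

    iterations,jobs=1000000,64
    if iterations%jobs==0:
      iterationsCL=numpy.uint64(iterations/jobs)    # 15625 iterations per work item, exact split
    else:
      iterationsCL=numpy.uint64(iterations/jobs+1)  # one extra iteration each to cover the remainder
    print iterationsCL*jobs                         # iterations actually performed, >= the request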
......
     start.synchronize()
     if ParaStyle=='Blocks':
       MetropolisBlocksCU(circleCU,
-                         numpy.uint32(iterationsCL),
+                         numpy.uint64(iterationsCL),
                          numpy.uint32(nprnd(2**30/jobs)),
                          numpy.uint32(nprnd(2**30/jobs)),
                          grid=(jobs,1),
                          block=(1,1,1))
-      print "GPU with %i %s done" % (jobs,ParaStyle)
+      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
+            (Alu,jobs,1,ParaStyle)
     elif ParaStyle=='Hybrid':
-      blocks=jobs/int(math.sqrt(float(jobs)))
+      threads=BestThreadsNumber(jobs)
       MetropolisHybridCU(circleCU,
-                          numpy.uint32(iterationsCL),
+                          numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
-                          grid=(blocks,1),
-                          block=(jobs/blocks,1,1))
-      print "GPU with (blocks,jobs)=(%i,%i) %s done" % (blocks,jobs/blocks,ParaStyle)
+                          grid=(jobs/threads,1),
+                          block=(threads,1,1))
+      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
+            (Alu,jobs/threads,threads,ParaStyle)
     else:
       MetropolisJobsCU(circleCU,
-                          numpy.uint32(iterationsCL),
+                          numpy.uint64(iterationsCL),
                           numpy.uint32(nprnd(2**30/jobs)),
                           numpy.uint32(nprnd(2**30/jobs)),
                           grid=(1,1),
                           block=(jobs,1,1))
-      print "GPU with %i %s done" % (jobs,ParaStyle)
+      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
+            (Alu,jobs,1,ParaStyle)
     stop.record()
     stop.synchronize()
 
......
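In PyCUDA the grid tuple counts blocks and the block tuple counts threads per block, so a launch executes grid[0]*block[0] kernel instances; covering exactly jobs result slots with threads threads per block therefore takes jobs/threads blocks, which is the pair the new print reports. A sketch of the shape computation (made-up job count):

    jobs=1024
    threads=BestThreadsNumber(jobs)   # 64, from the helper added above
    blocks=jobs/threads               # 16 blocks of 64 threads = 1024 work items in total
    # launch shape: grid=(blocks,1), block=(threads,1,1)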
 
   HasGPU=False
   Id=1
-  # Device selection based on choice (default is GPU)
+  # Primary Device selection based on Device Id
   for platform in cl.get_platforms():
     for device in platform.get_devices():
-      if not HasGPU:
-        deviceType=cl.device_type.to_string(device.type)
-        if deviceType=="GPU" and Alu=="GPU" and Id==Device:
-          GPU=device
-          print "GPU selected: ",device.name
-          HasGPU=True
-        if deviceType=="CPU" and Alu=="CPU":
-          GPU=device
-          print "CPU selected: ",device.name
-          HasGPU=True
+      deviceType=cl.device_type.to_string(device.type)
+      if Id==Device and not HasGPU:
+        GPU=device
+        print "CPU/GPU selected: ",device.name
+        HasGPU=True
       Id=Id+1
+  # Default Device selection based on ALU Type
+  for platform in cl.get_platforms():
+    for device in platform.get_devices():
+      deviceType=cl.device_type.to_string(device.type)
+      if deviceType=="GPU" and Alu=="GPU" and not HasGPU:
+        GPU=device
+        print "GPU selected: ",device.name
+        HasGPU=True
+      if deviceType=="CPU" and Alu=="CPU" and not HasGPU:
+        GPU=device
+        print "CPU selected: ",device.name
+        HasGPU=True
 
   # Create the context and the queue for its execution
   #ctx = cl.create_some_context()
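The selection now runs in two passes: the first honours an explicit Device id regardless of type, the second falls back to the first device matching the requested ALU kind. To find out which id to pass, a standalone enumeration in the same PyOpenCL style (a sketch, one line per device):

    import pyopencl as cl
    Id=1
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        print Id,device.name,cl.device_type.to_string(device.type)
        Id=Id+1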
......
     iterationsCL=numpy.uint32(iterations/jobs+1)
     iterationsNew=iterations
 
-  blocks=int(math.sqrt(jobs))
-
   for i in range(steps):
 
     if ParaStyle=='Blocks':
......
       # step is number of iterations
       CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                            circleCL,
-                                           numpy.uint32(iterationsCL),
+                                           numpy.uint64(iterationsCL),
                                            numpy.uint32(nprnd(2**30/jobs)),
                                            numpy.uint32(nprnd(2**30/jobs)))
-      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
+      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
+            (Alu,jobs,1,ParaStyle)
     elif ParaStyle=='Hybrid':
+      threads=BestThreadsNumber(jobs)
       # In OpenCL, the global_id must be set identical to the local_id
-      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(blocks*blocks,),(blocks,),
+      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                           circleCL,
-                                          numpy.uint32(iterationsCL),
+                                          numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
-      print "%s with (Blocks,Threads)=(%i,%i) %s done" % (Alu,blocks,blocks,ParaStyle)
+      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
+            (Alu,jobs/threads,threads,ParaStyle)
     else:
       # In OpenCL, the global_id must be set identical to the local_id
       CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                           circleCL,
-                                          numpy.uint32(iterationsCL),
+                                          numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
       print "%s with %i %s done" % (Alu,jobs,ParaStyle)
......
   # Set default values
   # Alu can be CPU or GPU
   Alu='CPU'
-  # Id of GPU
-  Device=1
+  # Id of GPU: 0 means take the first device found
+  Device=0
   # GPU style can be Cuda (Nvidia implementation) or OpenCL
   GpuStyle='OpenCL'
   # Parallel distribution can be on Threads or Blocks
......
   while Jobs <= JobEnd:
     avg,med,std=0,0,0
     ExploredJobs=numpy.append(ExploredJobs,Jobs)
-    circle=numpy.zeros(Jobs).astype(numpy.uint32)
+    circle=numpy.zeros(Jobs).astype(numpy.uint64)
 
     if OutMetrology:
       duration=numpy.array([]).astype(numpy.float32)
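The host-side buffer is widened to match the kernels: OpenCL's ulong is 64-bit, so the numpy array receiving the per-item totals must be uint64 or the readback would misread the bytes. A minimal sketch of the pairing (assuming a context ctx as created earlier in the file):

    circle=numpy.zeros(Jobs).astype(numpy.uint64)   # one 64-bit counter slot per work item
    circleCL=cl.Buffer(ctx,cl.mem_flags.WRITE_ONLY|cl.mem_flags.COPY_HOST_PTR,hostbuf=circle)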
