Révision 17 Pi/GPU/Pi-GPU.py
Pi-GPU.py (revision 17) | ||
---|---|---|
24 | 24 |
from scipy.optimize import curve_fit |
25 | 25 |
from socket import gethostname |
26 | 26 |
|
27 |
# find prime factors of a number |
|
28 |
# Get for WWW : |
|
29 |
# http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/ |
|
30 |
def PrimeFactors(x): |
|
31 |
factorlist=numpy.array([]).astype('uint32') |
|
32 |
loop=2 |
|
33 |
while loop<=x: |
|
34 |
if x%loop==0: |
|
35 |
x/=loop |
|
36 |
factorlist=numpy.append(factorlist,[loop]) |
|
37 |
else: |
|
38 |
loop+=1 |
|
39 |
return factorlist |
|
40 |
|
|
41 |
# Try to find the best thread number in Hybrid approach (Blocks&Threads) |
|
42 |
# output is thread number |
|
43 |
def BestThreadsNumber(jobs): |
|
44 |
factors=PrimeFactors(jobs) |
|
45 |
matrix=numpy.append([factors],[factors[::-1]],axis=0) |
|
46 |
threads=1 |
|
47 |
for factor in matrix.transpose().ravel(): |
|
48 |
threads=threads*factor |
|
49 |
if threads*threads>jobs: |
|
50 |
break |
|
51 |
return(long(threads)) |
|
52 |
|
|
27 | 53 |
# Predicted Amdahl Law (Reduced with s=1-p) |
28 | 54 |
def AmdahlR(N, T1, p): |
29 | 55 |
return (T1*(1-p+p/N)) |
... | ... | |
54 | 80 |
#define MWCfp MWC * 2.328306435454494e-10f |
55 | 81 |
#define KISSfp KISS * 2.328306435454494e-10f |
56 | 82 |
|
57 |
__global__ void MainLoopBlocks(uint *s,uint iterations,uint seed_w,uint seed_z)
|
|
83 |
__global__ void MainLoopBlocks(ulong *s,ulong iterations,uint seed_w,uint seed_z)
|
|
58 | 84 |
{ |
59 | 85 |
uint z=seed_z/(blockIdx.x+1); |
60 | 86 |
uint w=seed_w/(blockIdx.x+1); |
61 | 87 |
|
62 |
int total=0;
|
|
88 |
ulong total=0;
|
|
63 | 89 |
|
64 |
for (uint i=0;i<iterations;i++) {
|
|
90 |
for (ulong i=0;i<iterations;i++) {
|
|
65 | 91 |
|
66 | 92 |
float x=MWCfp ; |
67 | 93 |
float y=MWCfp ; |
68 | 94 |
|
69 | 95 |
// Matching test |
70 |
int inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
96 |
ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
71 | 97 |
total+=inside; |
72 | 98 |
|
73 | 99 |
} |
... | ... | |
77 | 103 |
|
78 | 104 |
} |
79 | 105 |
|
80 |
__global__ void MainLoopThreads(uint *s,uint iterations,uint seed_w,uint seed_z)
|
|
106 |
__global__ void MainLoopThreads(ulong *s,ulong iterations,uint seed_w,uint seed_z)
|
|
81 | 107 |
{ |
82 | 108 |
uint z=seed_z/(threadIdx.x+1); |
83 | 109 |
uint w=seed_w/(threadIdx.x+1); |
84 | 110 |
|
85 |
int total=0;
|
|
111 |
ulong total=0;
|
|
86 | 112 |
|
87 |
for (uint i=0;i<iterations;i++) {
|
|
113 |
for (ulong i=0;i<iterations;i++) {
|
|
88 | 114 |
|
89 | 115 |
float x=MWCfp ; |
90 | 116 |
float y=MWCfp ; |
91 | 117 |
|
92 | 118 |
// Matching test |
93 |
int inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
119 |
ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
94 | 120 |
total+=inside; |
95 | 121 |
|
96 | 122 |
} |
... | ... | |
100 | 126 |
|
101 | 127 |
} |
102 | 128 |
|
103 |
__global__ void MainLoopHybrid(uint *s,uint iterations,uint seed_w,uint seed_z)
|
|
129 |
__global__ void MainLoopHybrid(ulong *s,ulong iterations,uint seed_w,uint seed_z)
|
|
104 | 130 |
{ |
105 | 131 |
uint z=seed_z/(blockDim.x*blockIdx.x+threadIdx.x+1); |
106 | 132 |
uint w=seed_w/(blockDim.x*blockIdx.x+threadIdx.x+1); |
107 | 133 |
|
108 |
int total=0;
|
|
134 |
ulong total=0;
|
|
109 | 135 |
|
110 |
for (uint i=0;i<iterations;i++) {
|
|
136 |
for (ulong i=0;i<iterations;i++) {
|
|
111 | 137 |
|
112 | 138 |
float x=MWCfp ; |
113 | 139 |
float y=MWCfp ; |
114 | 140 |
|
115 | 141 |
// Matching test |
116 |
int inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
142 |
ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
117 | 143 |
total+=inside; |
118 | 144 |
|
119 | 145 |
} |
... | ... | |
137 | 163 |
#define MWCfp MWC * 2.328306435454494e-10f |
138 | 164 |
#define KISSfp KISS * 2.328306435454494e-10f |
139 | 165 |
|
140 |
__kernel void MainLoopGlobal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
|
|
166 |
__kernel void MainLoopGlobal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
|
|
141 | 167 |
{ |
142 | 168 |
uint z=seed_z/(get_global_id(0)+1); |
143 | 169 |
uint w=seed_w/(get_global_id(0)+1); |
144 | 170 |
|
145 |
int total=0;
|
|
171 |
ulong total=0;
|
|
146 | 172 |
|
147 |
for (uint i=0;i<iterations;i++) {
|
|
173 |
for (ulong i=0;i<iterations;i++) {
|
|
148 | 174 |
|
149 | 175 |
float x=MWCfp ; |
150 | 176 |
float y=MWCfp ; |
151 | 177 |
|
152 | 178 |
// Matching test |
153 |
int inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
179 |
ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
154 | 180 |
total+=inside; |
155 | 181 |
} |
156 | 182 |
s[get_global_id(0)]=total; |
... | ... | |
158 | 184 |
|
159 | 185 |
} |
160 | 186 |
|
161 |
__kernel void MainLoopLocal(__global uint *s,uint iterations,uint seed_w,uint seed_z)
|
|
187 |
__kernel void MainLoopLocal(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
|
|
162 | 188 |
{ |
163 | 189 |
uint z=seed_z/(get_local_id(0)+1); |
164 | 190 |
uint w=seed_w/(get_local_id(0)+1); |
165 | 191 |
|
166 |
int total=0;
|
|
192 |
ulong total=0;
|
|
167 | 193 |
|
168 |
for (uint i=0;i<iterations;i++) {
|
|
194 |
for (ulong i=0;i<iterations;i++) {
|
|
169 | 195 |
|
170 | 196 |
float x=MWCfp ; |
171 | 197 |
float y=MWCfp ; |
172 | 198 |
|
173 | 199 |
// Matching test |
174 |
int inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
200 |
ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
175 | 201 |
total+=inside; |
176 | 202 |
} |
177 | 203 |
s[get_local_id(0)]=total; |
... | ... | |
179 | 205 |
|
180 | 206 |
} |
181 | 207 |
|
182 |
__kernel void MainLoopHybrid(__global uint *s,uint iterations,uint seed_w,uint seed_z)
|
|
208 |
__kernel void MainLoopHybrid(__global ulong *s,ulong iterations,uint seed_w,uint seed_z)
|
|
183 | 209 |
{ |
184 | 210 |
uint z=seed_z/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1); |
185 | 211 |
uint w=seed_w/(get_group_id(0)*get_num_groups(0)+get_local_id(0)+1); |
186 |
|
|
187 |
// uint jsr=123456789; |
|
188 |
// uint jcong=380116160; |
|
189 | 212 |
|
190 |
int total=0;
|
|
213 |
ulong total=0;
|
|
191 | 214 |
|
192 | 215 |
for (uint i=0;i<iterations;i++) { |
193 | 216 |
|
... | ... | |
195 | 218 |
float y=MWCfp ; |
196 | 219 |
|
197 | 220 |
// Matching test |
198 |
int inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
221 |
ulong inside=((x*x+y*y) < 1.0f) ? 1:0;
|
|
199 | 222 |
total+=inside; |
200 | 223 |
} |
201 | 224 |
barrier(CLK_LOCAL_MEM_FENCE); |
... | ... | |
223 | 246 |
MyDuration=numpy.zeros(steps) |
224 | 247 |
|
225 | 248 |
if iterations%jobs==0: |
226 |
iterationsCL=numpy.uint32(iterations/jobs)
|
|
249 |
iterationsCL=numpy.uint64(iterations/jobs)
|
|
227 | 250 |
iterationsNew=iterationsCL*jobs |
228 | 251 |
else: |
229 |
iterationsCL=numpy.uint32(iterations/jobs+1)
|
|
252 |
iterationsCL=numpy.uint64(iterations/jobs+1)
|
|
230 | 253 |
iterationsNew=iterations |
231 | 254 |
|
232 | 255 |
for i in range(steps): |
... | ... | |
234 | 257 |
start.synchronize() |
235 | 258 |
if ParaStyle=='Blocks': |
236 | 259 |
MetropolisBlocksCU(circleCU, |
237 |
numpy.uint32(iterationsCL),
|
|
260 |
numpy.uint64(iterationsCL),
|
|
238 | 261 |
numpy.uint32(nprnd(2**30/jobs)), |
239 | 262 |
numpy.uint32(nprnd(2**30/jobs)), |
240 | 263 |
grid=(jobs,1), |
241 | 264 |
block=(1,1,1)) |
242 |
print "GPU with %i %s done" % (jobs,ParaStyle) |
|
265 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
|
266 |
(Alu,jobs,1,ParaStyle) |
|
243 | 267 |
elif ParaStyle=='Hybrid': |
244 |
blocks=jobs/int(math.sqrt(float(jobs)))
|
|
268 |
threads=BestThreadsNumber(jobs)
|
|
245 | 269 |
MetropolisHybridCU(circleCU, |
246 |
numpy.uint32(iterationsCL),
|
|
270 |
numpy.uint64(iterationsCL),
|
|
247 | 271 |
numpy.uint32(nprnd(2**30/jobs)), |
248 | 272 |
numpy.uint32(nprnd(2**30/jobs)), |
249 |
grid=(blocks,1), |
|
250 |
block=(jobs/blocks,1,1)) |
|
251 |
print "GPU with (blocks,jobs)=(%i,%i) %s done" % (blocks,jobs/blocks,ParaStyle) |
|
273 |
grid=(jobs,1), |
|
274 |
block=(threads,1,1)) |
|
275 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
|
276 |
(Alu,jobs/threads,threads,ParaStyle) |
|
252 | 277 |
else: |
253 | 278 |
MetropolisJobsCU(circleCU, |
254 |
numpy.uint32(iterationsCL),
|
|
279 |
numpy.uint64(iterationsCL),
|
|
255 | 280 |
numpy.uint32(nprnd(2**30/jobs)), |
256 | 281 |
numpy.uint32(nprnd(2**30/jobs)), |
257 | 282 |
grid=(1,1), |
258 | 283 |
block=(jobs,1,1)) |
259 |
print "GPU with %i %s done" % (jobs,ParaStyle) |
|
284 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
|
285 |
(Alu,jobs,1,ParaStyle) |
|
260 | 286 |
stop.record() |
261 | 287 |
stop.synchronize() |
262 | 288 |
|
... | ... | |
287 | 313 |
|
288 | 314 |
HasGPU=False |
289 | 315 |
Id=1 |
290 |
# Device selection based on choice (default is GPU)
|
|
316 |
# Primary Device selection based on Device Id
|
|
291 | 317 |
for platform in cl.get_platforms(): |
292 | 318 |
for device in platform.get_devices(): |
293 |
if not HasGPU: |
|
294 |
deviceType=cl.device_type.to_string(device.type) |
|
295 |
if deviceType=="GPU" and Alu=="GPU" and Id==Device: |
|
296 |
GPU=device |
|
297 |
print "GPU selected: ",device.name |
|
298 |
HasGPU=True |
|
299 |
if deviceType=="CPU" and Alu=="CPU": |
|
300 |
GPU=device |
|
301 |
print "CPU selected: ",device.name |
|
302 |
HasGPU=True |
|
319 |
deviceType=cl.device_type.to_string(device.type) |
|
320 |
if Id==Device and not HasGPU: |
|
321 |
GPU=device |
|
322 |
print "CPU/GPU selected: ",device.name |
|
323 |
HasGPU=True |
|
303 | 324 |
Id=Id+1 |
325 |
# Default Device selection based on ALU Type |
|
326 |
for platform in cl.get_platforms(): |
|
327 |
for device in platform.get_devices(): |
|
328 |
deviceType=cl.device_type.to_string(device.type) |
|
329 |
if deviceType=="GPU" and Alu=="GPU" and not HasGPU: |
|
330 |
GPU=device |
|
331 |
print "GPU selected: ",device.name |
|
332 |
HasGPU=True |
|
333 |
if deviceType=="CPU" and Alu=="CPU" and not HasGPU: |
|
334 |
GPU=device |
|
335 |
print "CPU selected: ",device.name |
|
336 |
HasGPU=True |
|
304 | 337 |
|
305 | 338 |
# Je cree le contexte et la queue pour son execution |
306 | 339 |
#ctx = cl.create_some_context() |
... | ... | |
330 | 363 |
iterationsCL=numpy.uint32(iterations/jobs+1) |
331 | 364 |
iterationsNew=iterations |
332 | 365 |
|
333 |
blocks=int(math.sqrt(jobs)) |
|
334 |
|
|
335 | 366 |
for i in range(steps): |
336 | 367 |
|
337 | 368 |
if ParaStyle=='Blocks': |
... | ... | |
344 | 375 |
# step is number of iterations |
345 | 376 |
CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None, |
346 | 377 |
circleCL, |
347 |
numpy.uint32(iterationsCL),
|
|
378 |
numpy.uint64(iterationsCL),
|
|
348 | 379 |
numpy.uint32(nprnd(2**30/jobs)), |
349 | 380 |
numpy.uint32(nprnd(2**30/jobs))) |
350 |
print "%s with %i %s done" % (Alu,jobs,ParaStyle) |
|
381 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
|
382 |
(Alu,jobs,1,ParaStyle) |
|
351 | 383 |
elif ParaStyle=='Hybrid': |
384 |
threads=BestThreadsNumber(jobs) |
|
352 | 385 |
# en OpenCL, necessaire de mettre un Global_id identique au local_id |
353 |
CLLaunch=MetropolisCL.MainLoopHybrid(queue,(blocks*blocks,),(blocks,),
|
|
386 |
CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
|
|
354 | 387 |
circleCL, |
355 |
numpy.uint32(iterationsCL),
|
|
388 |
numpy.uint64(iterationsCL),
|
|
356 | 389 |
numpy.uint32(nprnd(2**30/jobs)), |
357 | 390 |
numpy.uint32(nprnd(2**30/jobs))) |
358 |
print "%s with (Blocks,Threads)=(%i,%i) %s done" % (Alu,blocks,blocks,ParaStyle) |
|
391 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
|
392 |
(Alu,jobs/threads,threads,ParaStyle) |
|
359 | 393 |
else: |
360 | 394 |
# en OpenCL, necessaire de mettre un Global_id identique au local_id |
361 | 395 |
CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,), |
362 | 396 |
circleCL, |
363 |
numpy.uint32(iterationsCL),
|
|
397 |
numpy.uint64(iterationsCL),
|
|
364 | 398 |
numpy.uint32(nprnd(2**30/jobs)), |
365 | 399 |
numpy.uint32(nprnd(2**30/jobs))) |
366 | 400 |
print "%s with %i %s done" % (Alu,jobs,ParaStyle) |
... | ... | |
459 | 493 |
# Set defaults values |
460 | 494 |
# Alu can be CPU or GPU |
461 | 495 |
Alu='CPU' |
462 |
# Id of GPU |
|
463 |
Device=1
|
|
496 |
# Id of GPU : 0 is for first find !
|
|
497 |
Device=0
|
|
464 | 498 |
# GPU style can be Cuda (Nvidia implementation) or OpenCL |
465 | 499 |
GpuStyle='OpenCL' |
466 | 500 |
# Parallel distribution can be on Threads or Blocks |
... | ... | |
555 | 589 |
while Jobs <= JobEnd: |
556 | 590 |
avg,med,std=0,0,0 |
557 | 591 |
ExploredJobs=numpy.append(ExploredJobs,Jobs) |
558 |
circle=numpy.zeros(Jobs).astype(numpy.uint32)
|
|
592 |
circle=numpy.zeros(Jobs).astype(numpy.uint64)
|
|
559 | 593 |
|
560 | 594 |
if OutMetrology: |
561 | 595 |
duration=numpy.array([]).astype(numpy.float32) |
Formats disponibles : Unified diff