Révision 63
Splutter/GPU/SplutterGPU.py (revision 63) | ||
---|---|---|
70 | 70 |
|
71 | 71 |
#define znew ((z=36969*(z&65535)+(z>>16))<<16) |
72 | 72 |
#define wnew ((w=18000*(w&65535)+(w>>16))&65535) |
73 |
|
|
73 | 74 |
#define MWC (znew+wnew) |
74 | 75 |
#define SHR3 (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5)) |
75 | 76 |
#define CONG (jcong=69069*jcong+1234567) |
76 | 77 |
#define KISS ((MWC^CONG)+SHR3) |
77 | 78 |
|
79 |
#define CONGfp CONG * 2.328306435454494e-10f |
|
80 |
#define SHR3fp SHR3 * 2.328306435454494e-10f |
|
78 | 81 |
#define MWCfp MWC * 2.328306435454494e-10f |
79 | 82 |
#define KISSfp KISS * 2.328306435454494e-10f |
80 | 83 |
|
81 |
#define MAX 4294967296 |
|
84 |
#define MAX (ulong)4294967296
|
|
82 | 85 |
|
83 |
uint rotl(uint value, int shift) { |
|
84 |
return (value << shift) | (value >> (sizeof(value) * 8 - shift)); |
|
86 |
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
87 |
{ |
|
88 |
const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x); |
|
89 |
const ulong size=(ulong)(gridDim.x*blockDim.x); |
|
90 |
const ulong block=(ulong)space/(ulong)size; |
|
91 |
|
|
92 |
uint z=seed_z-(uint)id; |
|
93 |
uint w=seed_w+(uint)id; |
|
94 |
|
|
95 |
uint jsr=seed_z; |
|
96 |
uint jcong=seed_w; |
|
97 |
|
|
98 |
for ( ulong i=0;i<iterations;i++) { |
|
99 |
|
|
100 |
// Dense version |
|
101 |
uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX ); |
|
102 |
|
|
103 |
s[position]++; |
|
104 |
} |
|
105 |
|
|
106 |
__syncthreads(); |
|
85 | 107 |
} |
86 |
|
|
87 |
uint rotr(uint value, int shift) { |
|
88 |
return (value >> shift) | (value << (sizeof(value) * 8 - shift)); |
|
108 |
|
|
109 |
__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
110 |
{ |
|
111 |
const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x); |
|
112 |
const ulong size=(ulong)(gridDim.x*blockDim.x); |
|
113 |
const ulong block=(ulong)space/(ulong)size; |
|
114 |
|
|
115 |
uint z=seed_z-(uint)id; |
|
116 |
uint w=seed_w+(uint)id; |
|
117 |
|
|
118 |
uint jsr=seed_z; |
|
119 |
uint jcong=seed_w; |
|
120 |
|
|
121 |
for ( ulong i=0;i<iterations;i++) { |
|
122 |
|
|
123 |
// Sparse version |
|
124 |
uint position=(uint)( (ulong)MWC*block/MAX*size+id ); |
|
125 |
|
|
126 |
s[position]++; |
|
127 |
} |
|
128 |
|
|
129 |
__syncthreads(); |
|
89 | 130 |
} |
90 | 131 |
|
91 |
__global__ void MainLoopBlocks(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
|
|
132 |
__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
|
|
92 | 133 |
{ |
93 |
// uint z=rotl(seed_z,blockIdx.x); |
|
94 |
// uint w=rotr(seed_w,blockIdx.x); |
|
134 |
const ulong id=(ulong)(threadIdx.x); |
|
135 |
const ulong size=(ulong)(blockDim.x); |
|
136 |
const ulong block=(ulong)space/(ulong)size; |
|
137 |
|
|
138 |
uint z=seed_z-(uint)id; |
|
139 |
uint w=seed_w+(uint)id; |
|
95 | 140 |
|
96 |
// uint jsr=rotl(seed_z,blockIdx.x);
|
|
97 |
// uint jcong=rotr(seed_w,blockIdx.x);
|
|
141 |
uint jsr=seed_z;
|
|
142 |
uint jcong=seed_w;
|
|
98 | 143 |
|
99 |
uint z=seed_z/(blockIdx.x+1); |
|
100 |
uint w=seed_w%(blockIdx.x+1); |
|
144 |
for ( ulong i=0;i<iterations;i++) { |
|
101 | 145 |
|
102 |
uint jsr=seed_z/(blockIdx.x+1);
|
|
103 |
uint jcong=seed_w%(blockIdx.x+1);
|
|
146 |
// Dense version
|
|
147 |
size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
|
|
104 | 148 |
|
105 |
for (ulong i=0;i<iterations;i++) { |
|
149 |
s[position]++; |
|
150 |
} |
|
106 | 151 |
|
107 |
s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1; |
|
108 | 152 |
|
109 |
}
|
|
110 |
__threadfence_block(); |
|
153 |
__syncthreads();
|
|
154 |
|
|
111 | 155 |
} |
112 | 156 |
|
113 |
__global__ void MainLoopThreads(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z) |
|
114 |
{ |
|
115 |
// uint z=rotl(seed_z,threadIdx.x); |
|
116 |
// uint w=rotr(seed_w,threadIdx.x); |
|
157 |
__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
158 |
{ |
|
159 |
const ulong id=(ulong)threadIdx.x; |
|
160 |
const ulong size=(ulong)blockDim.x; |
|
161 |
const ulong block=(ulong)space/(ulong)size; |
|
162 |
|
|
163 |
uint z=seed_z-(uint)id; |
|
164 |
uint w=seed_w+(uint)id; |
|
117 | 165 |
|
118 |
// uint jsr=rotl(seed_z,threadIdx.x);
|
|
119 |
// uint jcong=rotr(seed_w,threadIdx.x);
|
|
166 |
uint jsr=seed_z;
|
|
167 |
uint jcong=seed_w;
|
|
120 | 168 |
|
121 |
uint z=seed_z; |
|
122 |
uint w=seed_w; |
|
169 |
for ( ulong i=0;i<iterations;i++) { |
|
123 | 170 |
|
124 |
uint jsr=seed_z;
|
|
125 |
uint jcong=seed_w;
|
|
171 |
// Sparse version
|
|
172 |
size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
|
|
126 | 173 |
|
127 |
for (ulong i=0;i<iterations;i++) { |
|
128 |
|
|
129 |
s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1; |
|
174 |
s[position]++; |
|
130 | 175 |
} |
131 | 176 |
|
132 | 177 |
__syncthreads(); |
178 |
|
|
133 | 179 |
} |
134 | 180 |
|
135 |
__global__ void MainLoopHybrid(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
|
|
181 |
__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
|
|
136 | 182 |
{ |
137 |
uint z=seed_z; |
|
138 |
uint w=seed_w; |
|
183 |
const ulong id=(ulong)(blockIdx.x); |
|
184 |
const ulong size=(ulong)(gridDim.x); |
|
185 |
const ulong block=(ulong)space/(ulong)size; |
|
186 |
|
|
187 |
uint z=seed_z-(uint)id; |
|
188 |
uint w=seed_w+(uint)id; |
|
139 | 189 |
|
140 |
uint jsr=seed_z; |
|
141 |
uint jcong=seed_w; |
|
190 |
uint jsr=seed_z;
|
|
191 |
uint jcong=seed_w;
|
|
142 | 192 |
|
143 |
for (ulong i=0;i<iterations;i++) { |
|
193 |
for ( ulong i=0;i<iterations;i++) {
|
|
144 | 194 |
|
145 |
s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1; |
|
195 |
// Dense version |
|
196 |
size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX ); |
|
197 |
|
|
198 |
s[position]++; |
|
146 | 199 |
} |
200 |
|
|
201 |
__syncthreads(); |
|
202 |
} |
|
147 | 203 |
|
204 |
__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
205 |
{ |
|
206 |
const ulong id=(ulong)(blockIdx.x); |
|
207 |
const ulong size=(ulong)(gridDim.x); |
|
208 |
const ulong block=(ulong)space/(ulong)size; |
|
209 |
|
|
210 |
uint z=seed_z-(uint)id; |
|
211 |
uint w=seed_w+(uint)id; |
|
212 |
|
|
213 |
uint jsr=seed_z; |
|
214 |
uint jcong=seed_w; |
|
215 |
|
|
216 |
for ( ulong i=0;i<iterations;i++) { |
|
217 |
|
|
218 |
// Sparse version |
|
219 |
size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id ); |
|
220 |
|
|
221 |
s[position]++; |
|
222 |
|
|
223 |
} |
|
224 |
|
|
225 |
//s[blockIdx.x]=blockIdx.x; |
|
148 | 226 |
__syncthreads(); |
227 |
} |
|
149 | 228 |
|
150 |
} |
|
151 | 229 |
""" |
152 | 230 |
|
153 | 231 |
KERNEL_CODE_OPENCL=""" |
... | ... | |
175 | 253 |
return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift)); |
176 | 254 |
} |
177 | 255 |
|
178 |
__kernel void MainLoopGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
|
|
256 |
__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
|
|
179 | 257 |
{ |
180 |
//__private const float id=(float)get_global_id(0); |
|
181 |
//__private const float size=(float)get_global_size(0); |
|
182 |
//__private const float block=space/size; |
|
183 |
|
|
184 | 258 |
__private const ulong id=(ulong)get_global_id(0); |
185 | 259 |
__private const ulong size=(ulong)get_global_size(0); |
186 | 260 |
__private const ulong block=(ulong)space/(ulong)size; |
... | ... | |
193 | 267 |
|
194 | 268 |
for (__private ulong i=0;i<iterations;i++) { |
195 | 269 |
|
196 |
// Standard version does not work for several processes (some lost!) memory unconsistent |
|
197 |
//__private size_t position=(size_t)(((ulong)space*(ulong)MWC)/(ulong)MAX); |
|
198 |
|
|
199 | 270 |
// Dense version |
200 |
//__private size_t position=(size_t)( ((ulong)MWC+(ulong)id*(ulong)MAX)*(ulong)block/(ulong)MAX );
|
|
271 |
__private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
|
|
201 | 272 |
|
273 |
s[position]++; |
|
274 |
} |
|
275 |
|
|
276 |
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); |
|
277 |
|
|
278 |
} |
|
279 |
|
|
280 |
__kernel void SplutterGlobalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
281 |
{ |
|
282 |
__private const ulong id=(ulong)get_global_id(0); |
|
283 |
__private const ulong size=(ulong)get_global_size(0); |
|
284 |
__private const ulong block=(ulong)space/(ulong)size; |
|
285 |
|
|
286 |
__private uint z=seed_z-(uint)id; |
|
287 |
__private uint w=seed_w+(uint)id; |
|
288 |
|
|
289 |
__private uint jsr=seed_z; |
|
290 |
__private uint jcong=seed_w; |
|
291 |
|
|
292 |
for (__private ulong i=0;i<iterations;i++) { |
|
293 |
|
|
202 | 294 |
// Sparse version |
203 |
//__private size_t position=(size_t)( ((ulong)MWC+(ulong)id*(ulong)MAX)*(ulong)block/(ulong)MAX ); |
|
204 |
//__private size_t position=(size_t)( ((ulong)MWC*(block)+(ulong)id*(ulong)MAX)/(ulong)MAX ); |
|
205 |
// First |
|
206 |
//__private size_t position=(size_t)( (ulong)(0)*(ulong)size+(ulong)id ); |
|
207 |
// Last |
|
208 |
//__private size_t position=(size_t)( (ulong)(block-1)*(ulong)size+(ulong)id ); |
|
209 |
// General |
|
210 |
__private size_t position=(size_t)( (ulong)MWC*(ulong)(block)/(ulong)MAX*(ulong)size+(ulong)id ); |
|
295 |
__private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id ); |
|
211 | 296 |
|
212 |
// Float version seems to be the best...
|
|
213 |
//__private uint position=(uint)( block*(CONGfp+id) );
|
|
297 |
s[position]++;
|
|
298 |
}
|
|
214 | 299 |
|
300 |
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); |
|
301 |
|
|
302 |
} |
|
303 |
|
|
304 |
__kernel void SplutterLocalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
305 |
{ |
|
306 |
__private const ulong id=(ulong)get_local_id(0); |
|
307 |
__private const ulong size=(ulong)get_local_size(0); |
|
308 |
__private const ulong block=(ulong)space/(ulong)size; |
|
309 |
|
|
310 |
__private uint z=seed_z-(uint)id; |
|
311 |
__private uint w=seed_w+(uint)id; |
|
312 |
|
|
313 |
__private uint jsr=seed_z; |
|
314 |
__private uint jcong=seed_w; |
|
315 |
|
|
316 |
for (__private ulong i=0;i<iterations;i++) { |
|
317 |
|
|
318 |
// Dense version |
|
319 |
__private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX ); |
|
320 |
|
|
215 | 321 |
s[position]++; |
216 | 322 |
} |
217 | 323 |
|
... | ... | |
219 | 325 |
|
220 | 326 |
} |
221 | 327 |
|
222 |
__kernel void MainLoopLocal(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
|
|
328 |
__kernel void SplutterLocalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
|
|
223 | 329 |
{ |
224 |
uint z=rotl(seed_z,get_local_id(0)); |
|
225 |
uint w=rotr(seed_w,get_local_id(0)); |
|
330 |
__private const ulong id=(ulong)get_local_id(0); |
|
331 |
__private const ulong size=(ulong)get_local_size(0); |
|
332 |
__private const ulong block=(ulong)space/(ulong)size; |
|
333 |
|
|
334 |
__private uint z=seed_z-(uint)id; |
|
335 |
__private uint w=seed_w+(uint)id; |
|
226 | 336 |
|
227 |
uint jsr=rotl(seed_z,get_local_id(0));
|
|
228 |
uint jcong=rotr(seed_w,get_local_id(0));
|
|
337 |
__private uint jsr=seed_z;
|
|
338 |
__private uint jcong=seed_w;
|
|
229 | 339 |
|
230 |
for (ulong i=0;i<iterations;i++) { |
|
340 |
for (__private ulong i=0;i<iterations;i++) {
|
|
231 | 341 |
|
232 |
s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=(uint)1; |
|
342 |
// Sparse version |
|
343 |
__private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id ); |
|
344 |
|
|
345 |
s[position]++; |
|
233 | 346 |
} |
234 | 347 |
|
348 |
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); |
|
235 | 349 |
|
236 |
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); |
|
237 | 350 |
} |
238 | 351 |
|
239 |
__kernel void MainLoopHybrid(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
|
|
352 |
__kernel void SplutterHybridDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
|
|
240 | 353 |
{ |
241 |
uint z=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0)); |
|
242 |
uint w=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0)); |
|
354 |
__private const ulong id=(ulong)(get_global_id(0)); |
|
355 |
__private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0)); |
|
356 |
__private const ulong block=(ulong)space/(ulong)size; |
|
357 |
|
|
358 |
__private uint z=seed_z-(uint)id; |
|
359 |
__private uint w=seed_w+(uint)id; |
|
243 | 360 |
|
244 |
uint jsr=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
|
|
245 |
uint jcong=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
|
|
361 |
__private uint jsr=seed_z;
|
|
362 |
__private uint jcong=seed_w;
|
|
246 | 363 |
|
247 |
for (ulong i=0;i<iterations;i++) { |
|
364 |
for (__private ulong i=0;i<iterations;i++) {
|
|
248 | 365 |
|
249 |
s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1; |
|
366 |
// Dense version |
|
367 |
__private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX ); |
|
250 | 368 |
|
251 |
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
|
|
369 |
s[position]++;
|
|
252 | 370 |
} |
371 |
|
|
372 |
} |
|
253 | 373 |
|
374 |
__kernel void SplutterHybridSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
375 |
{ |
|
376 |
__private const ulong id=(ulong)(get_global_id(0)); |
|
377 |
__private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0)); |
|
378 |
__private const ulong block=(ulong)space/(ulong)size; |
|
379 |
|
|
380 |
__private uint z=seed_z-(uint)id; |
|
381 |
__private uint w=seed_w+(uint)id; |
|
382 |
|
|
383 |
__private uint jsr=seed_z; |
|
384 |
__private uint jcong=seed_w; |
|
385 |
|
|
386 |
for (__private ulong i=0;i<iterations;i++) { |
|
387 |
|
|
388 |
// Sparse version |
|
389 |
__private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id ); |
|
390 |
|
|
391 |
s[position]++; |
|
392 |
} |
|
254 | 393 |
|
255 | 394 |
} |
395 |
|
|
256 | 396 |
""" |
257 | 397 |
|
258 |
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle): |
|
398 |
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Dense):
|
|
259 | 399 |
|
260 | 400 |
# Avec PyCUDA autoinit, rien a faire ! |
261 | 401 |
|
262 | 402 |
circleCU = cuda.InOut(circle) |
263 | 403 |
|
404 |
print "prout" |
|
405 |
|
|
264 | 406 |
mod = SourceModule(KERNEL_CODE_CUDA) |
265 | 407 |
|
266 |
MetropolisBlocksCU=mod.get_function("MainLoopBlocks") |
|
267 |
MetropolisJobsCU=mod.get_function("MainLoopThreads") |
|
268 |
MetropolisHybridCU=mod.get_function("MainLoopHybrid") |
|
408 |
print "prout 2" |
|
409 |
|
|
410 |
if Dense: |
|
411 |
MetropolisBlocksCU=mod.get_function("SplutterGlobalDense") |
|
412 |
MetropolisThreadsCU=mod.get_function("SplutterLocalDense") |
|
413 |
MetropolisHybridCU=mod.get_function("SplutterHybridDense") |
|
414 |
else: |
|
415 |
MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse") |
|
416 |
MetropolisThreadsCU=mod.get_function("SplutterLocalSparse") |
|
417 |
MetropolisHybridCU=mod.get_function("SplutterHybridSparse") |
|
269 | 418 |
|
419 |
print "prout 3" |
|
420 |
|
|
270 | 421 |
start = pycuda.driver.Event() |
271 | 422 |
stop = pycuda.driver.Event() |
272 | 423 |
|
... | ... | |
280 | 431 |
|
281 | 432 |
iterationsNew=iterationsCL*jobs |
282 | 433 |
|
434 |
Splutter=numpy.zeros(jobs*16).astype(numpy.uint32) |
|
435 |
|
|
283 | 436 |
for i in range(steps): |
284 | 437 |
|
285 |
Splutter=numpy.zeros(1024).astype(numpy.uint32)
|
|
438 |
Splutter[:]=0
|
|
286 | 439 |
|
287 |
print Splutter |
|
440 |
print Splutter,len(Splutter)
|
|
288 | 441 |
|
289 | 442 |
SplutterCU = cuda.InOut(Splutter) |
290 | 443 |
|
... | ... | |
313 | 466 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
314 | 467 |
(Alu,jobs/threads,threads,ParaStyle) |
315 | 468 |
else: |
316 |
MetropolisJobsCU(SplutterCU,
|
|
469 |
MetropolisThreadsCU(SplutterCU,
|
|
317 | 470 |
numpy.uint32(len(Splutter)), |
318 | 471 |
numpy.uint64(iterationsCL), |
319 | 472 |
numpy.uint32(nprnd(2**30/jobs)), |
... | ... | |
321 | 474 |
grid=(1,1), |
322 | 475 |
block=(jobs,1,1)) |
323 | 476 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
324 |
(Alu,jobs,1,ParaStyle)
|
|
477 |
(Alu,1,jobs,ParaStyle)
|
|
325 | 478 |
stop.record() |
326 | 479 |
stop.synchronize() |
327 | 480 |
|
... | ... | |
343 | 496 |
return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)) |
344 | 497 |
|
345 | 498 |
|
346 |
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device): |
|
499 |
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,Dense):
|
|
347 | 500 |
|
348 | 501 |
# Initialisation des variables en les CASTant correctement |
349 | 502 |
|
... | ... | |
401 | 554 |
|
402 | 555 |
# Je cree le contexte et la queue pour son execution |
403 | 556 |
ctx = cl.Context([XPU]) |
404 |
queue = cl.CommandQueue(ctx, |
|
405 |
properties=cl.command_queue_properties.PROFILING_ENABLE) |
|
406 |
|
|
557 |
queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE) |
|
558 |
|
|
407 | 559 |
# Je recupere les flag possibles pour les buffers |
408 | 560 |
mf = cl.mem_flags |
409 |
|
|
561 |
|
|
410 | 562 |
MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math") |
411 |
|
|
563 |
|
|
412 | 564 |
MyDuration=numpy.zeros(steps) |
413 | 565 |
|
414 | 566 |
if iterations%jobs==0: |
415 | 567 |
iterationsCL=numpy.uint64(iterations/jobs) |
416 | 568 |
else: |
417 | 569 |
iterationsCL=numpy.uint64(iterations/jobs+1) |
418 |
|
|
570 |
|
|
419 | 571 |
iterationsNew=numpy.uint64(iterationsCL*jobs) |
420 | 572 |
|
421 | 573 |
MySplutter=numpy.zeros(steps) |
... | ... | |
453 | 605 |
# numpy.uint64(iterationsCL), |
454 | 606 |
# numpy.uint32(nprnd(2**30/jobs)), |
455 | 607 |
# numpy.uint32(nprnd(2**30/jobs))) |
456 |
CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None, |
|
457 |
SplutterCL, |
|
458 |
numpy.uint32(len(Splutter)), |
|
459 |
numpy.uint64(iterationsCL), |
|
460 |
numpy.uint32(521288629), |
|
461 |
numpy.uint32(362436069)) |
|
608 |
if Dense: |
|
609 |
CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None, |
|
610 |
SplutterCL, |
|
611 |
numpy.uint32(len(Splutter)), |
|
612 |
numpy.uint64(iterationsCL), |
|
613 |
numpy.uint32(521288629), |
|
614 |
numpy.uint32(362436069)) |
|
615 |
else: |
|
616 |
CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None, |
|
617 |
SplutterCL, |
|
618 |
numpy.uint32(len(Splutter)), |
|
619 |
numpy.uint64(iterationsCL), |
|
620 |
numpy.uint32(521288629), |
|
621 |
numpy.uint32(362436069)) |
|
622 |
|
|
462 | 623 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
463 | 624 |
(Alu,jobs,1,ParaStyle) |
464 | 625 |
elif ParaStyle=='Hybrid': |
465 | 626 |
threads=BestThreadsNumber(jobs) |
466 | 627 |
# en OpenCL, necessaire de mettre un Global_id identique au local_id |
467 |
CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,), |
|
468 |
SplutterCL, |
|
469 |
numpy.uint32(len(Splutter)), |
|
470 |
numpy.uint64(iterationsCL), |
|
471 |
numpy.uint32(nprnd(2**30/jobs)), |
|
472 |
numpy.uint32(nprnd(2**30/jobs))) |
|
628 |
if Dense: |
|
629 |
CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,), |
|
630 |
SplutterCL, |
|
631 |
numpy.uint32(len(Splutter)), |
|
632 |
numpy.uint64(iterationsCL), |
|
633 |
numpy.uint32(nprnd(2**30/jobs)), |
|
634 |
numpy.uint32(nprnd(2**30/jobs))) |
|
635 |
else: |
|
636 |
CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,), |
|
637 |
SplutterCL, |
|
638 |
numpy.uint32(len(Splutter)), |
|
639 |
numpy.uint64(iterationsCL), |
|
640 |
numpy.uint32(nprnd(2**30/jobs)), |
|
641 |
numpy.uint32(nprnd(2**30/jobs))) |
|
473 | 642 |
|
474 | 643 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
475 | 644 |
(Alu,jobs/threads,threads,ParaStyle) |
476 | 645 |
else: |
477 | 646 |
# en OpenCL, necessaire de mettre un Global_id identique au local_id |
478 |
CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,), |
|
479 |
SplutterCL, |
|
480 |
numpy.uint32(len(Splutter)), |
|
481 |
numpy.uint64(iterationsCL), |
|
482 |
numpy.uint32(nprnd(2**30/jobs)), |
|
483 |
numpy.uint32(nprnd(2**30/jobs))) |
|
647 |
if Dense: |
|
648 |
CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,), |
|
649 |
SplutterCL, |
|
650 |
numpy.uint32(len(Splutter)), |
|
651 |
numpy.uint64(iterationsCL), |
|
652 |
numpy.uint32(nprnd(2**30/jobs)), |
|
653 |
numpy.uint32(nprnd(2**30/jobs))) |
|
654 |
else: |
|
655 |
CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,), |
|
656 |
SplutterCL, |
|
657 |
numpy.uint32(len(Splutter)), |
|
658 |
numpy.uint64(iterationsCL), |
|
659 |
numpy.uint32(nprnd(2**30/jobs)), |
|
660 |
numpy.uint32(nprnd(2**30/jobs))) |
|
661 |
|
|
484 | 662 |
print "%s with %i %s done" % (Alu,jobs,ParaStyle) |
485 | 663 |
|
486 | 664 |
CLLaunch.wait() |
... | ... | |
603 | 781 |
Curves=False |
604 | 782 |
# Fit is True to print the curves |
605 | 783 |
Fit=False |
784 |
# Spluttering is Dense by default |
|
785 |
Dense=True |
|
606 | 786 |
|
607 | 787 |
try: |
608 |
opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
|
|
788 |
opts, args = getopt.getopt(sys.argv[1:],"hocfvwa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
|
|
609 | 789 |
except getopt.GetoptError: |
610 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0] |
|
790 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
|
|
611 | 791 |
sys.exit(2) |
612 | 792 |
|
613 | 793 |
for opt, arg in opts: |
614 | 794 |
if opt == '-h': |
615 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0] |
|
795 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
|
|
616 | 796 |
|
617 | 797 |
print "\nInformations about devices detected under OpenCL:" |
618 | 798 |
# For PyOpenCL import |
... | ... | |
636 | 816 |
Metrology='OutMetro' |
637 | 817 |
elif opt == '-c': |
638 | 818 |
Curves=True |
819 |
elif opt == '-v': |
|
820 |
Dense=True |
|
821 |
elif opt == '-w': |
|
822 |
Dense=False |
|
639 | 823 |
elif opt == '-f': |
640 | 824 |
Fit=True |
641 | 825 |
elif opt in ("-a", "--alu"): |
... | ... | |
669 | 853 |
print "Device Identification : %s" % Device |
670 | 854 |
print "GpuStyle used : %s" % GpuStyle |
671 | 855 |
print "Parallel Style used : %s" % ParaStyle |
856 |
print "Dense (or Sparse) Spluttering : %r" % Dense |
|
672 | 857 |
print "Iterations : %s" % Iterations |
673 | 858 |
print "Number of threads on start : %s" % JobStart |
674 | 859 |
print "Number of threads on end : %s" % JobEnd |
... | ... | |
720 | 905 |
start=time.time() |
721 | 906 |
if GpuStyle=='CUDA': |
722 | 907 |
try: |
723 |
a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle) |
|
908 |
print "toto" |
|
909 |
a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Dense) |
|
724 | 910 |
except: |
725 | 911 |
print "Problem with %i // computations on Cuda" % Jobs |
726 | 912 |
elif GpuStyle=='OpenCL': |
727 | 913 |
try: |
728 | 914 |
a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle, |
729 |
Alu,Device) |
|
915 |
Alu,Device,Dense)
|
|
730 | 916 |
except: |
731 | 917 |
print "Problem with %i // computations on OpenCL" % Jobs |
732 | 918 |
duration=numpy.append(duration,time.time()-start) |
... | ... | |
739 | 925 |
else: |
740 | 926 |
if GpuStyle=='CUDA': |
741 | 927 |
try: |
742 |
avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle) |
|
928 |
avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,Dense)
|
|
743 | 929 |
except: |
744 | 930 |
print "Problem with %i // computations on Cuda" % Jobs |
745 | 931 |
elif GpuStyle=='OpenCL': |
746 | 932 |
try: |
747 |
avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device) |
|
933 |
avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,Dense)
|
|
748 | 934 |
except: |
749 | 935 |
print "Problem with %i // computations on OpenCL" % Jobs |
750 | 936 |
|
Formats disponibles : Unified diff