Révision 63

Splutter/GPU/SplutterGPU.py (revision 63)
70 70

  
71 71
#define znew  ((z=36969*(z&65535)+(z>>16))<<16)
72 72
#define wnew  ((w=18000*(w&65535)+(w>>16))&65535)
73

  
73 74
#define MWC   (znew+wnew)
74 75
#define SHR3  (jsr=(jsr=(jsr=jsr^(jsr<<17))^(jsr>>13))^(jsr<<5))
75 76
#define CONG  (jcong=69069*jcong+1234567)
76 77
#define KISS  ((MWC^CONG)+SHR3)
77 78

  
79
#define CONGfp CONG * 2.328306435454494e-10f
80
#define SHR3fp SHR3 * 2.328306435454494e-10f
78 81
#define MWCfp MWC * 2.328306435454494e-10f
79 82
#define KISSfp KISS * 2.328306435454494e-10f
80 83

  
81
#define MAX 4294967296
84
#define MAX (ulong)4294967296
82 85

  
83
uint rotl(uint value, int shift) {
84
   return (value << shift) | (value >> (sizeof(value) * 8 - shift));
86
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
87
{
88
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
89
    const ulong size=(ulong)(gridDim.x*blockDim.x);
90
    const ulong block=(ulong)space/(ulong)size;
91
   
92
    uint z=seed_z-(uint)id;
93
    uint w=seed_w+(uint)id;
94

  
95
    uint jsr=seed_z;
96
    uint jcong=seed_w;
97

  
98
   for ( ulong i=0;i<iterations;i++) {
99

  
100
      // Dense version
101
       uint position=(uint)( ((ulong)MWC+id*MAX)*block/MAX );
102

  
103
      s[position]++;
104
   }
105

  
106
   __syncthreads();
85 107
}
86
 
87
uint rotr(uint value, int shift) {
88
   return (value >> shift) | (value << (sizeof(value) * 8 - shift));
108

  
109
__global__ void SplutterGlobalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
110
{ 
111
    const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x);
112
    const ulong size=(ulong)(gridDim.x*blockDim.x);
113
    const ulong block=(ulong)space/(ulong)size;
114
   
115
    uint z=seed_z-(uint)id;
116
    uint w=seed_w+(uint)id;
117

  
118
    uint jsr=seed_z;
119
    uint jcong=seed_w;
120

  
121
   for ( ulong i=0;i<iterations;i++) {
122

  
123
      // Sparse version
124
       uint position=(uint)( (ulong)MWC*block/MAX*size+id );
125

  
126
      s[position]++;
127
   }
128

  
129
   __syncthreads();
89 130
}
90 131

  
91
__global__ void MainLoopBlocks(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
132
__global__ void SplutterLocalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
92 133
{
93
   // uint z=rotl(seed_z,blockIdx.x);
94
   // uint w=rotr(seed_w,blockIdx.x);
134
    const ulong id=(ulong)(threadIdx.x);
135
    const ulong size=(ulong)(blockDim.x);
136
    const ulong block=(ulong)space/(ulong)size;
137
   
138
    uint z=seed_z-(uint)id;
139
    uint w=seed_w+(uint)id;
95 140

  
96
   // uint jsr=rotl(seed_z,blockIdx.x);
97
   // uint jcong=rotr(seed_w,blockIdx.x);
141
    uint jsr=seed_z;
142
    uint jcong=seed_w;
98 143

  
99
   uint z=seed_z/(blockIdx.x+1);
100
   uint w=seed_w%(blockIdx.x+1);
144
   for ( ulong i=0;i<iterations;i++) {
101 145

  
102
   uint jsr=seed_z/(blockIdx.x+1);
103
   uint jcong=seed_w%(blockIdx.x+1);
146
      // Dense version
147
       size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
104 148

  
105
   for (ulong i=0;i<iterations;i++) {
149
      s[position]++;
150
   }
106 151

  
107
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
108 152

  
109
   }
110
   __threadfence_block();
153
   __syncthreads();
154

  
111 155
}
112 156

  
113
__global__ void MainLoopThreads(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
114
{ 
115
   // uint z=rotl(seed_z,threadIdx.x);
116
   // uint w=rotr(seed_w,threadIdx.x);
157
__global__ void SplutterLocalSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
158
{
159
    const ulong id=(ulong)threadIdx.x;
160
    const ulong size=(ulong)blockDim.x;
161
    const ulong block=(ulong)space/(ulong)size;
162
   
163
    uint z=seed_z-(uint)id;
164
    uint w=seed_w+(uint)id;
117 165

  
118
   // uint jsr=rotl(seed_z,threadIdx.x);
119
   // uint jcong=rotr(seed_w,threadIdx.x);
166
    uint jsr=seed_z;
167
    uint jcong=seed_w;
120 168

  
121
   uint z=seed_z;
122
   uint w=seed_w;
169
   for ( ulong i=0;i<iterations;i++) {
123 170

  
124
   uint jsr=seed_z;
125
   uint jcong=seed_w;
171
      // Sparse version
172
       size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
126 173

  
127
   for (ulong i=0;i<iterations;i++) {
128

  
129
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
174
      s[position]++;
130 175
   }
131 176

  
132 177
   __syncthreads();
178

  
133 179
}
134 180

  
135
__global__ void MainLoopHybrid(uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
181
__global__ void SplutterHybridDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
136 182
{
137
   uint z=seed_z;
138
   uint w=seed_w;
183
    const ulong id=(ulong)(blockIdx.x);
184
    const ulong size=(ulong)(gridDim.x);
185
    const ulong block=(ulong)space/(ulong)size;
186
   
187
    uint z=seed_z-(uint)id;
188
    uint w=seed_w+(uint)id;
139 189

  
140
   uint jsr=seed_z;
141
   uint jcong=seed_w;
190
    uint jsr=seed_z;
191
    uint jcong=seed_w;
142 192

  
143
   for (ulong i=0;i<iterations;i++) {
193
   for ( ulong i=0;i<iterations;i++) {
144 194

  
145
      s[(uint)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
195
      // Dense version
196
      size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
197

  
198
      s[position]++;
146 199
   }
200
      
201
   __syncthreads();
202
}
147 203

  
204
__global__ void SplutterHybridSparse(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
205
{
206
    const ulong id=(ulong)(blockIdx.x);
207
    const ulong size=(ulong)(gridDim.x);
208
    const ulong block=(ulong)space/(ulong)size;
209
   
210
    uint z=seed_z-(uint)id;
211
    uint w=seed_w+(uint)id;
212

  
213
    uint jsr=seed_z;
214
    uint jcong=seed_w;
215

  
216
   for ( ulong i=0;i<iterations;i++) {
217

  
218
      // Sparse version
219
      size_t position=(size_t)( (((ulong)MWC*block)/MAX)*size+id );
220

  
221
      s[position]++;
222

  
223
   }
224

  
225
   //s[blockIdx.x]=blockIdx.x;
148 226
   __syncthreads();
227
}
149 228

  
150
}
151 229
"""
152 230

  
153 231
KERNEL_CODE_OPENCL="""
......
175 253
    return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift));
176 254
}
177 255

  
178
__kernel void MainLoopGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
256
__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
179 257
{
180
   //__private const float id=(float)get_global_id(0);
181
   //__private const float size=(float)get_global_size(0);
182
   //__private const float block=space/size;
183

  
184 258
   __private const ulong id=(ulong)get_global_id(0);
185 259
   __private const ulong size=(ulong)get_global_size(0);
186 260
   __private const ulong block=(ulong)space/(ulong)size;
......
193 267

  
194 268
   for (__private ulong i=0;i<iterations;i++) {
195 269

  
196
      // Standard version does not work for several processes (some lost!) memory unconsistent
197
      //__private size_t position=(size_t)(((ulong)space*(ulong)MWC)/(ulong)MAX);
198
      
199 270
      // Dense version
200
      //__private size_t position=(size_t)( ((ulong)MWC+(ulong)id*(ulong)MAX)*(ulong)block/(ulong)MAX );
271
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
201 272

  
273
      s[position]++;
274
   }
275

  
276
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
277

  
278
}
279

  
280
__kernel void SplutterGlobalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
281
{
282
   __private const ulong id=(ulong)get_global_id(0);
283
   __private const ulong size=(ulong)get_global_size(0);
284
   __private const ulong block=(ulong)space/(ulong)size;
285
   
286
   __private uint z=seed_z-(uint)id;
287
   __private uint w=seed_w+(uint)id;
288

  
289
   __private uint jsr=seed_z;
290
   __private uint jcong=seed_w;
291

  
292
   for (__private ulong i=0;i<iterations;i++) {
293

  
202 294
      // Sparse version
203
      //__private size_t position=(size_t)( ((ulong)MWC+(ulong)id*(ulong)MAX)*(ulong)block/(ulong)MAX );
204
      //__private size_t position=(size_t)( ((ulong)MWC*(block)+(ulong)id*(ulong)MAX)/(ulong)MAX );
205
      // First
206
      //__private size_t position=(size_t)( (ulong)(0)*(ulong)size+(ulong)id );
207
      // Last
208
      //__private size_t position=(size_t)( (ulong)(block-1)*(ulong)size+(ulong)id );
209
      // General
210
      __private size_t position=(size_t)( (ulong)MWC*(ulong)(block)/(ulong)MAX*(ulong)size+(ulong)id );
295
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
211 296

  
212
      // Float version seems to be the best...
213
      //__private uint position=(uint)( block*(CONGfp+id) );
297
      s[position]++;
298
   }
214 299

  
300
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
301

  
302
}
303

  
304
__kernel void SplutterLocalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
305
{
306
   __private const ulong id=(ulong)get_local_id(0);
307
   __private const ulong size=(ulong)get_local_size(0);
308
   __private const ulong block=(ulong)space/(ulong)size;
309
   
310
   __private uint z=seed_z-(uint)id;
311
   __private uint w=seed_w+(uint)id;
312

  
313
   __private uint jsr=seed_z;
314
   __private uint jcong=seed_w;
315

  
316
   for (__private ulong i=0;i<iterations;i++) {
317

  
318
      // Dense version
319
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
320

  
215 321
      s[position]++;
216 322
   }
217 323

  
......
219 325

  
220 326
}
221 327

  
222
__kernel void MainLoopLocal(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
328
__kernel void SplutterLocalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
223 329
{
224
   uint z=rotl(seed_z,get_local_id(0));
225
   uint w=rotr(seed_w,get_local_id(0));
330
   __private const ulong id=(ulong)get_local_id(0);
331
   __private const ulong size=(ulong)get_local_size(0);
332
   __private const ulong block=(ulong)space/(ulong)size;
333
   
334
   __private uint z=seed_z-(uint)id;
335
   __private uint w=seed_w+(uint)id;
226 336

  
227
   uint jsr=rotl(seed_z,get_local_id(0));
228
   uint jcong=rotr(seed_w,get_local_id(0));
337
   __private uint jsr=seed_z;
338
   __private uint jcong=seed_w;
229 339

  
230
   for (ulong i=0;i<iterations;i++) {
340
   for (__private ulong i=0;i<iterations;i++) {
231 341

  
232
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=(uint)1;
342
      // Sparse version
343
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
344

  
345
      s[position]++;
233 346
   }
234 347

  
348
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
235 349

  
236
   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
237 350
}
238 351

  
239
__kernel void MainLoopHybrid(__global uint *s,uint size,ulong iterations,uint seed_w,uint seed_z)
352
__kernel void SplutterHybridDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
240 353
{
241
   uint z=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
242
   uint w=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
354
   __private const ulong id=(ulong)(get_global_id(0));
355
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
356
   __private const ulong block=(ulong)space/(ulong)size;
357
   
358
   __private uint z=seed_z-(uint)id;
359
   __private uint w=seed_w+(uint)id;
243 360

  
244
   uint jsr=rotl(seed_z,get_group_id(0)*get_num_groups(0)+get_local_id(0));
245
   uint jcong=rotr(seed_w,get_group_id(0)*get_num_groups(0)+get_local_id(0));
361
   __private uint jsr=seed_z;
362
   __private uint jcong=seed_w;
246 363

  
247
   for (ulong i=0;i<iterations;i++) {
364
   for (__private ulong i=0;i<iterations;i++) {
248 365

  
249
      s[(int)(((ulong)size*(ulong)CONG)/(ulong)MAX)]+=1;
366
      // Dense version
367
      __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
250 368

  
251
      barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
369
      s[position]++;
252 370
   }
371
      
372
}
253 373

  
374
__kernel void SplutterHybridSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
375
{
376
   __private const ulong id=(ulong)(get_global_id(0));
377
   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
378
   __private const ulong block=(ulong)space/(ulong)size;
379
   
380
   __private uint z=seed_z-(uint)id;
381
   __private uint w=seed_w+(uint)id;
382

  
383
   __private uint jsr=seed_z;
384
   __private uint jcong=seed_w;
385

  
386
   for (__private ulong i=0;i<iterations;i++) {
387

  
388
      // Sparse version
389
      __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
390

  
391
      s[position]++;
392
   }
254 393
      
255 394
}
395

  
256 396
"""
257 397

  
258
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle):
398
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Dense):
259 399

  
260 400
  # Avec PyCUDA autoinit, rien a faire !
261 401
  
262 402
  circleCU = cuda.InOut(circle)
263 403
  
404
  print "prout"
405

  
264 406
  mod = SourceModule(KERNEL_CODE_CUDA)
265 407

  
266
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
267
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
268
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
408
  print "prout 2"
409

  
410
  if Dense:
411
    MetropolisBlocksCU=mod.get_function("SplutterGlobalDense")
412
    MetropolisThreadsCU=mod.get_function("SplutterLocalDense")
413
    MetropolisHybridCU=mod.get_function("SplutterHybridDense")
414
  else:
415
    MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse")
416
    MetropolisThreadsCU=mod.get_function("SplutterLocalSparse")
417
    MetropolisHybridCU=mod.get_function("SplutterHybridSparse")
269 418
  
419
  print "prout 3"
420

  
270 421
  start = pycuda.driver.Event()
271 422
  stop = pycuda.driver.Event()
272 423
  
......
280 431
    
281 432
  iterationsNew=iterationsCL*jobs
282 433

  
434
  Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
435

  
283 436
  for i in range(steps):
284 437

  
285
    Splutter=numpy.zeros(1024).astype(numpy.uint32)
438
    Splutter[:]=0
286 439
    
287
    print Splutter
440
    print Splutter,len(Splutter)
288 441

  
289 442
    SplutterCU = cuda.InOut(Splutter)
290 443

  
......
313 466
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
314 467
            (Alu,jobs/threads,threads,ParaStyle)
315 468
    else:
316
      MetropolisJobsCU(SplutterCU,
469
      MetropolisThreadsCU(SplutterCU,
317 470
                       numpy.uint32(len(Splutter)),
318 471
                       numpy.uint64(iterationsCL),
319 472
                       numpy.uint32(nprnd(2**30/jobs)),
......
321 474
                       grid=(1,1),
322 475
                       block=(jobs,1,1))
323 476
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
324
            (Alu,jobs,1,ParaStyle)
477
            (Alu,1,jobs,ParaStyle)
325 478
    stop.record()
326 479
    stop.synchronize()
327 480
                
......
343 496
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
344 497

  
345 498

  
346
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device):
499
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,Dense):
347 500
	
348 501
  # Initialisation des variables en les CASTant correctement
349 502

  
......
401 554

  
402 555
  # Je cree le contexte et la queue pour son execution
403 556
  ctx = cl.Context([XPU])
404
  queue = cl.CommandQueue(ctx,
405
                          properties=cl.command_queue_properties.PROFILING_ENABLE)
406

  
557
  queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
558
  
407 559
  # Je recupere les flag possibles pour les buffers
408 560
  mf = cl.mem_flags
409
	
561

  
410 562
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build(options = "-cl-mad-enable -cl-fast-relaxed-math")
411
  
563
      
412 564
  MyDuration=numpy.zeros(steps)
413 565
  
414 566
  if iterations%jobs==0:
415 567
    iterationsCL=numpy.uint64(iterations/jobs)
416 568
  else:
417 569
    iterationsCL=numpy.uint64(iterations/jobs+1)
418

  
570
    
419 571
  iterationsNew=numpy.uint64(iterationsCL*jobs)
420 572

  
421 573
  MySplutter=numpy.zeros(steps)
......
453 605
      #                                      numpy.uint64(iterationsCL),
454 606
      #                                      numpy.uint32(nprnd(2**30/jobs)),
455 607
      #                                      numpy.uint32(nprnd(2**30/jobs)))
456
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
457
                                           SplutterCL,
458
                                           numpy.uint32(len(Splutter)),
459
                                           numpy.uint64(iterationsCL),
460
                                           numpy.uint32(521288629),
461
                                           numpy.uint32(362436069))
608
      if Dense:
609
        CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None,
610
                                                  SplutterCL,
611
                                                  numpy.uint32(len(Splutter)),
612
                                                  numpy.uint64(iterationsCL),
613
                                                  numpy.uint32(521288629),
614
                                                  numpy.uint32(362436069))
615
      else:
616
        CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None,
617
                                                   SplutterCL,
618
                                                   numpy.uint32(len(Splutter)),
619
                                                   numpy.uint64(iterationsCL),
620
                                                   numpy.uint32(521288629),
621
                                                   numpy.uint32(362436069))
622
        
462 623
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
463 624
            (Alu,jobs,1,ParaStyle)
464 625
    elif ParaStyle=='Hybrid':
465 626
      threads=BestThreadsNumber(jobs)
466 627
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
467
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
468
                                           SplutterCL,
469
                                           numpy.uint32(len(Splutter)),
470
                                           numpy.uint64(iterationsCL),
471
                                           numpy.uint32(nprnd(2**30/jobs)),
472
                                           numpy.uint32(nprnd(2**30/jobs)))
628
      if Dense:
629
        CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,),
630
                                                  SplutterCL,
631
                                                  numpy.uint32(len(Splutter)),
632
                                                  numpy.uint64(iterationsCL),
633
                                                  numpy.uint32(nprnd(2**30/jobs)),
634
                                                  numpy.uint32(nprnd(2**30/jobs)))
635
      else:
636
        CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,),
637
                                                   SplutterCL,
638
                                                   numpy.uint32(len(Splutter)),
639
                                                   numpy.uint64(iterationsCL),
640
                                                   numpy.uint32(nprnd(2**30/jobs)),
641
                                                   numpy.uint32(nprnd(2**30/jobs)))
473 642
        
474 643
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
475 644
            (Alu,jobs/threads,threads,ParaStyle)
476 645
    else:
477 646
      # en OpenCL, necessaire de mettre un Global_id identique au local_id
478
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
479
                                          SplutterCL,
480
                                          numpy.uint32(len(Splutter)),
481
                                          numpy.uint64(iterationsCL),
482
                                          numpy.uint32(nprnd(2**30/jobs)),
483
                                          numpy.uint32(nprnd(2**30/jobs)))
647
      if Dense:
648
        CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,),
649
                                                 SplutterCL,
650
                                                 numpy.uint32(len(Splutter)),
651
                                                 numpy.uint64(iterationsCL),
652
                                                 numpy.uint32(nprnd(2**30/jobs)),
653
                                                 numpy.uint32(nprnd(2**30/jobs)))
654
      else:
655
        CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,),
656
                                                  SplutterCL,
657
                                                  numpy.uint32(len(Splutter)),
658
                                                  numpy.uint64(iterationsCL),
659
                                                  numpy.uint32(nprnd(2**30/jobs)),
660
                                                  numpy.uint32(nprnd(2**30/jobs)))
661
        
484 662
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)
485 663

  
486 664
    CLLaunch.wait()
......
603 781
  Curves=False
604 782
  # Fit is True to print the curves
605 783
  Fit=False
784
  # Spluttering is Dense by default
785
  Dense=True
606 786

  
607 787
  try:
608
    opts, args = getopt.getopt(sys.argv[1:],"hoclfa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
788
    opts, args = getopt.getopt(sys.argv[1:],"hocfvwa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
609 789
  except getopt.GetoptError:
610
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
790
    print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
611 791
    sys.exit(2)
612 792
    
613 793
  for opt, arg in opts:
614 794
    if opt == '-h':
615
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
795
      print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law)  -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
616 796

  
617 797
      print "\nInformations about devices detected under OpenCL:"
618 798
      # For PyOpenCL import
......
636 816
      Metrology='OutMetro'
637 817
    elif opt == '-c':
638 818
      Curves=True
819
    elif opt == '-v':
820
      Dense=True
821
    elif opt == '-w':
822
      Dense=False
639 823
    elif opt == '-f':
640 824
      Fit=True
641 825
    elif opt in ("-a", "--alu"):
......
669 853
  print "Device Identification : %s" % Device
670 854
  print "GpuStyle used : %s" % GpuStyle
671 855
  print "Parallel Style used : %s" % ParaStyle
856
  print "Dense (or Sparse) Spluttering : %r" % Dense
672 857
  print "Iterations : %s" % Iterations
673 858
  print "Number of threads on start : %s" % JobStart
674 859
  print "Number of threads on end : %s" % JobEnd
......
720 905
        start=time.time()
721 906
        if GpuStyle=='CUDA':
722 907
          try:
723
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle)
908
            print "toto"
909
            a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Dense)
724 910
          except:
725 911
            print "Problem with %i // computations on Cuda" % Jobs
726 912
        elif GpuStyle=='OpenCL':
727 913
          try:
728 914
            a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
729
                                   Alu,Device)
915
                                   Alu,Device,Dense)
730 916
          except:
731 917
            print "Problem with %i // computations on OpenCL" % Jobs            
732 918
        duration=numpy.append(duration,time.time()-start)
......
739 925
    else:
740 926
      if GpuStyle=='CUDA':
741 927
        try:
742
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle)
928
          avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,Dense)
743 929
        except:
744 930
          print "Problem with %i // computations on Cuda" % Jobs
745 931
      elif GpuStyle=='OpenCL':
746 932
        try:
747
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device)
933
          avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,Dense)
748 934
        except:
749 935
          print "Problem with %i // computations on OpenCL" % Jobs            
750 936

  

Formats disponibles : Unified diff