Revision 68
Splutter/GPU/SplutterGPU.py (revision 68)

 old |  new | line content (blank "old" column = line added in revision 68, blank "new" column = line removed)
-----+------+---------------------------------------------------------------------------------------------
  25 |   25 | import math
  26 |   26 | from socket import gethostname
  27 |   27 | 
     |   28 | # find prime factors of a number
     |   29 | # Get for WWW :
     |   30 | # http://pythonism.wordpress.com/2008/05/17/looking-at-factorisation-in-python/
     |   31 | def PrimeFactors(x):
     |   32 |     factorlist=numpy.array([]).astype('uint32')
     |   33 |     loop=2
     |   34 |     while loop<=x:
     |   35 |         if x%loop==0:
     |   36 |             x/=loop
     |   37 |             factorlist=numpy.append(factorlist,[loop])
     |   38 |         else:
     |   39 |             loop+=1
     |   40 |     return factorlist
     |   41 | 
  28 |   42 | # Try to find the best thread number in Hybrid approach (Blocks&Threads)
  29 |   43 | # output is thread number
  30 |   44 | def BestThreadsNumber(jobs):
 ... |  ... |
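Revision 68 introduces PrimeFactors just above BestThreadsNumber, whose body lies outside this hunk. As a rough illustration only, the helper decomposes a work-item count into its prime factors, which a thread-picking heuristic could then recombine; the recombination below is a hypothetical sketch, not the actual BestThreadsNumber code (the numpy import is added here for self-containment).

    import numpy

    # Copy of the helper added in this revision: prime factorisation into a numpy array.
    def PrimeFactors(x):
        factorlist=numpy.array([]).astype('uint32')
        loop=2
        while loop<=x:
            if x%loop==0:
                x/=loop
                factorlist=numpy.append(factorlist,[loop])
            else:
                loop+=1
        return factorlist

    print(PrimeFactors(360))            # [2 2 2 3 3 5]

    # Hypothetical recombination: greedily build the largest divisor of `jobs`
    # that stays below a work-group size bound.
    def LargestDivisorBelow(jobs,bound=32):
        best=1
        for f in PrimeFactors(jobs):
            if best*int(f)<=bound:
                best*=int(f)
        return best

    print(LargestDivisorBelow(360))     # 24 for these illustrative inputs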
  53 |   67 | def Mylq2(N, T1,s,c1,c2,p):
  54 |   68 |     return (T1*(s+p/N)+c1*N+c2*N*N)
  55 |   69 | 
  56 |      | prout="""
  57 |      | 
  58 |      | """
  59 |      | 
  60 |      | 
  61 |   70 | KERNEL_CODE_CUDA="""
  62 |   71 | 
  63 |   72 | // Marsaglia RNG very simple implementation
 ... |  ... |
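Mylq2 reads as an Amdahl-style runtime model: a serial fraction s, a parallel fraction p divided across N workers, plus linear (c1) and quadratic (c2) parallelisation overheads. That reading of the parameters is inferred from the formula, not stated in the file. A minimal evaluation sketch of the model's shape, relevant when the script's -f (Fit to Amdahl Law) option is used to compare measured timings against such laws:

    import numpy

    # Same formula as Mylq2; parameter interpretation is an assumption.
    def Mylq2(N, T1, s, c1, c2, p):
        return T1*(s + p/N) + c1*N + c2*N*N

    N = numpy.array([1, 2, 4, 8, 16, 32, 64], dtype=numpy.float64)
    print(Mylq2(N, 10.0, 0.1, 0.02, 0.001, 0.9))
    # Runtime first drops as the parallel part is split over more workers,
    # then rises again once the N and N^2 overhead terms dominate.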
 275 |  284 | __kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
 276 |  285 | {
 277 |  286 |   __private const ulong id=(ulong)get_global_id(0);
 278 |      |   __private const ulong size=(ulong)get_global_size(0);
 279 |      |   __private const ulong block=(ulong)space/(ulong)size;
 280 |  287 | 
 281 |  288 |   __private uint z=seed_z-(uint)id;
 282 |  289 |   __private uint w=seed_w+(uint)id;
 ... |  ... |
 287 |  294 |   for (__private ulong i=0;i<iterations;i++) {
 288 |  295 | 
 289 |  296 |     // Dense version
 290 |      |     __private size_t position=(size_t)( ((ulong)MWC*(ulong)space)/MAX );
     |  297 |     __private size_t position=(size_t)( MWC%space );
 291 |  298 | 
 292 |  299 |     atomic_inc(&s[position]);
 293 |  300 |   }
 ... |  ... |
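The addressing of the scatter vector changes in this hunk: the removed line rescaled the 32-bit MWC draw into [0,space) with a multiply-and-divide by MAX, while the new line simply takes the draw modulo space. MAX is assumed here to be the full 32-bit generator range, based on how it is used. A host-side sketch of the two mappings:

    import numpy

    MAX   = numpy.uint64(2**32)     # assumed range of the 32-bit MWC draw
    space = numpy.uint64(1024)      # size of the scatter vector

    draws = numpy.random.randint(0, 2**32, size=8, dtype=numpy.uint64)

    old_position = (draws * space) // MAX   # old: ((ulong)MWC*(ulong)space)/MAX
    new_position = draws % space            # new: MWC%space

    print(old_position)
    print(new_position)

Both mappings land in [0,space); the modulo form is cheaper per iteration but slightly biased whenever space does not divide the generator range evenly.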
 296 |  303 | 
 297 |  304 | }
 298 |  305 | 
 299 |      | __kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
     |  306 | __kernel void SplutterLocal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
 300 |  307 | {
 301 |      |   __private const ulong id=(ulong)get_global_id(0);
 302 |      |   __private const ulong size=(ulong)get_global_size(0);
 303 |      |   __private const ulong block=(ulong)space/(ulong)size;
 304 |      | 
 305 |      |   __private uint z=seed_z-(uint)id;
 306 |      |   __private uint w=seed_w+(uint)id;
 307 |      | 
 308 |      |   __private uint jsr=seed_z;
 309 |      |   __private uint jcong=seed_w;
 310 |      | 
 311 |      |   for (__private ulong i=0;i<iterations;i++) {
 312 |      | 
 313 |      |     // Dense version
 314 |      |     __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
 315 |      | 
 316 |      |     s[position]++;
 317 |      |   }
 318 |      | 
 319 |      |   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
 320 |      | 
 321 |      | }
 322 |      | 
 323 |      | __kernel void SplutterGlobalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
 324 |      | {
 325 |      |   __private const ulong id=(ulong)get_global_id(0);
 326 |      |   __private const ulong size=(ulong)get_global_size(0);
 327 |      |   __private const ulong block=(ulong)space/(ulong)size;
 328 |      | 
 329 |      |   __private uint z=seed_z-(uint)id;
 330 |      |   __private uint w=seed_w+(uint)id;
 331 |      | 
 332 |      |   __private uint jsr=seed_z;
 333 |      |   __private uint jcong=seed_w;
 334 |      | 
 335 |      |   for (__private ulong i=0;i<iterations;i++) {
 336 |      | 
 337 |      |     // Sparse version
 338 |      |     __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
 339 |      | 
 340 |      |     s[position]++;
 341 |      |   }
 342 |      | 
 343 |      |   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
 344 |      | 
 345 |      | }
 346 |      | 
 347 |      | __kernel void SplutterLocalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
 348 |      | {
 349 |  308 |   __private const ulong id=(ulong)get_local_id(0);
 350 |      |   __private const ulong size=(ulong)get_local_size(0);
 351 |      |   __private const ulong block=(ulong)space/(ulong)size;
 352 |  309 | 
 353 |  310 |   __private uint z=seed_z-(uint)id;
 354 |  311 |   __private uint w=seed_w+(uint)id;
 ... |  ... |
 359 |  316 |   for (__private ulong i=0;i<iterations;i++) {
 360 |  317 | 
 361 |  318 |     // Dense version
 362 |      |     __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
     |  319 |     //__private size_t position=(size_t)( (MWC+id*block)%space );
     |  320 |     __private size_t position=(size_t)( MWC%space );
 363 |  321 | 
 364 |      |     s[position]++;
     |  322 |     atomic_inc(&s[position]);
 365 |  323 |   }
 366 |  324 | 
 367 |  325 |   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
 368 |  326 | 
 369 |  327 | }
 370 |  328 | 
 371 |      | __kernel void SplutterLocalSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
     |  329 | __kernel void SplutterHybrid(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
 372 |  330 | {
 373 |      |   __private const ulong id=(ulong)get_local_id(0);
 374 |      |   __private const ulong size=(ulong)get_local_size(0);
 375 |      |   __private const ulong block=(ulong)space/(ulong)size;
     |  331 |   __private const ulong id=(ulong)(get_global_id(0)+get_local_id(0));
 376 |  332 | 
 377 |  333 |   __private uint z=seed_z-(uint)id;
 378 |  334 |   __private uint w=seed_w+(uint)id;
 ... |  ... |
 382 |  338 | 
 383 |  339 |   for (__private ulong i=0;i<iterations;i++) {
 384 |  340 | 
 385 |      |     // Sparse version
 386 |      |     __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
 387 |      | 
 388 |      |     s[position]++;
 389 |      |   }
 390 |      | 
 391 |      |   barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
 392 |      | 
 393 |      | }
 394 |      | 
 395 |      | __kernel void SplutterHybridDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
 396 |      | {
 397 |      |   __private const ulong id=(ulong)(get_global_id(0));
 398 |      |   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
 399 |      |   __private const ulong block=(ulong)space/(ulong)size;
 400 |      | 
 401 |      |   __private uint z=seed_z-(uint)id;
 402 |      |   __private uint w=seed_w+(uint)id;
 403 |      | 
 404 |      |   __private uint jsr=seed_z;
 405 |      |   __private uint jcong=seed_w;
 406 |      | 
 407 |      |   for (__private ulong i=0;i<iterations;i++) {
 408 |      | 
 409 |  341 |     // Dense version
 410 |      |     __private size_t position=(size_t)( ((ulong)MWC+id*MAX)*block/MAX );
     |  342 |     __private size_t position=(size_t)( MWC%space );
 411 |  343 | 
 412 |      |     s[position]++;
     |  344 |     atomic_inc(&s[position]);
 413 |  345 |   }
 414 |  346 | 
 415 |  347 | }
 416 |  348 | 
 417 |      | __kernel void SplutterHybridSparse(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z)
 418 |      | {
 419 |      |   __private const ulong id=(ulong)(get_global_id(0));
 420 |      |   __private const ulong size=(ulong)(get_local_size(0)*get_num_groups(0));
 421 |      |   __private const ulong block=(ulong)space/(ulong)size;
 422 |      | 
 423 |      |   __private uint z=seed_z-(uint)id;
 424 |      |   __private uint w=seed_w+(uint)id;
 425 |      | 
 426 |      |   __private uint jsr=seed_z;
 427 |      |   __private uint jcong=seed_w;
 428 |      | 
 429 |      |   for (__private ulong i=0;i<iterations;i++) {
 430 |      | 
 431 |      |     // Sparse version
 432 |      |     __private size_t position=(size_t)( (ulong)MWC*block/MAX*size+id );
 433 |      | 
 434 |      |     s[position]++;
 435 |      |   }
 436 |      | 
 437 |      | }
 438 |      | 
 439 |  349 | """
 440 |  350 | 
 441 |      | def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density):
     |  351 | def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density,Memory):
 442 |  352 | 
 443 |  353 |     # Avec PyCUDA autoinit, rien a faire !
 444 |  354 | 
 ... |  ... |
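From this point the host code is reworked around the new Memory parameter: it takes the place of the Density selection in the call chain and, as the later hunks show, directly sizes the scatter vector whose length is then passed to the kernels as `space`. A minimal sketch of that relationship, based on the allocation and launch arguments further down in this diff:

    import numpy

    Memory=1024                                        # default set later in this revision
    Splutter=numpy.zeros(Memory).astype(numpy.uint32)  # new allocation (was jobs*16)
    space=numpy.uint32(len(Splutter))                  # value handed to the kernels
    print(space)                                       # 1024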
 536 |  446 | 
 537 |  447 | 
 538 |  448 | def MetropolisOpenCL(circle,iterations,steps,jobs,
 539 |      |                      ParaStyle,Alu,Device,Density):
     |  449 |                      ParaStyle,Alu,Device,Memory):
 540 |  450 | 
 541 |  451 |     # Initialisation des variables en les CASTant correctement
 542 |  452 | 
 ... |  ... |
 616 |  526 |     print MaxWorks,2**(int)(numpy.log2(MemoryXPU))
 617 |  527 | 
 618 |  528 |     #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
 619 |      |     Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
     |  529 |     #Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
     |  530 |     Splutter=numpy.zeros(Memory).astype(numpy.uint32)
 620 |  531 | 
 621 |  532 |     for i in range(steps):
 622 |  533 | 
 ... |  ... |
 645 |  556 |             # numpy.uint64(iterationsCL),
 646 |  557 |             # numpy.uint32(nprnd(2**30/jobs)),
 647 |  558 |             # numpy.uint32(nprnd(2**30/jobs)))
 648 |      |             if Density=='Dense':
 649 |      |                 CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None,
 650 |      |                     SplutterCL,
 651 |      |                     numpy.uint32(len(Splutter)),
 652 |      |                     numpy.uint64(iterationsCL),
 653 |      |                     numpy.uint32(521288629),
 654 |      |                     numpy.uint32(362436069))
 655 |      |             elif Density=='Sparse':
 656 |      |                 CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None,
 657 |      |                     SplutterCL,
 658 |      |                     numpy.uint32(len(Splutter)),
 659 |      |                     numpy.uint64(iterationsCL),
 660 |      |                     numpy.uint32(521288629),
 661 |      |                     numpy.uint32(362436069))
     |  559 |             CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
     |  560 |                 SplutterCL,
     |  561 |                 numpy.uint32(len(Splutter)),
     |  562 |                 numpy.uint64(iterationsCL),
     |  563 |                 numpy.uint32(nprnd(2**30/jobs)),
     |  564 |                 numpy.uint32(nprnd(2**30/jobs)))
 662 |  565 | 
 663 |      |             else:
 664 |      |                 CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None,
 665 |      |                     SplutterCL,
 666 |      |                     numpy.uint32(len(Splutter)),
 667 |      |                     numpy.uint64(iterationsCL),
 668 |      |                     numpy.uint32(521288629),
 669 |      |                     numpy.uint32(362436069))
 670 |      | 
 671 |  566 |             print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
 672 |  567 |                   (Alu,jobs,1,ParaStyle)
 673 |  568 |         elif ParaStyle=='Hybrid':
 674 |      |             threads=BestThreadsNumber(jobs)
     |  569 |             #threads=BestThreadsNumber(jobs)
     |  570 |             threads=BestThreadsNumber(256)
     |  571 |             print "print",threads
 675 |  572 |             # en OpenCL, necessaire de mettre un Global_id identique au local_id
 676 |      |             if Density=='Dense':
 677 |      |                 CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,),
 678 |      |                     SplutterCL,
 679 |      |                     numpy.uint32(len(Splutter)),
 680 |      |                     numpy.uint64(iterationsCL),
 681 |      |                     numpy.uint32(nprnd(2**30/jobs)),
 682 |      |                     numpy.uint32(nprnd(2**30/jobs)))
 683 |      |             elif Density=='Sparse':
 684 |      |                 CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,),
 685 |      |                     SplutterCL,
 686 |      |                     numpy.uint32(len(Splutter)),
 687 |      |                     numpy.uint64(iterationsCL),
 688 |      |                     numpy.uint32(nprnd(2**30/jobs)),
 689 |      |                     numpy.uint32(nprnd(2**30/jobs)))
     |  573 |             CLLaunch=MetropolisCL.SplutterHybrid(queue,(jobs,),(threads,),
     |  574 |                 SplutterCL,
     |  575 |                 numpy.uint32(len(Splutter)),
     |  576 |                 numpy.uint64(iterationsCL),
     |  577 |                 numpy.uint32(nprnd(2**30/jobs)),
     |  578 |                 numpy.uint32(nprnd(2**30/jobs)))
 690 |  579 | 
 691 |  580 |             print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
 692 |  581 |                   (Alu,jobs/threads,threads,ParaStyle)
 693 |  582 |         else:
 694 |  583 |             # en OpenCL, necessaire de mettre un global_id identique au local_id
 695 |      |             if Density=='Dense':
 696 |      |                 CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,),
 697 |      |                     SplutterCL,
 698 |      |                     numpy.uint32(len(Splutter)),
 699 |      |                     numpy.uint64(iterationsCL),
 700 |      |                     numpy.uint32(nprnd(2**30/jobs)),
 701 |      |                     numpy.uint32(nprnd(2**30/jobs)))
 702 |      |             elif Density=='Sparse':
 703 |      |                 CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,),
 704 |      |                     SplutterCL,
 705 |      |                     numpy.uint32(len(Splutter)),
 706 |      |                     numpy.uint64(iterationsCL),
 707 |      |                     numpy.uint32(nprnd(2**30/jobs)),
 708 |      |                     numpy.uint32(nprnd(2**30/jobs)))
     |  584 |             CLLaunch=MetropolisCL.SplutterLocal(queue,(jobs,),(jobs,),
     |  585 |                 SplutterCL,
     |  586 |                 numpy.uint32(len(Splutter)),
     |  587 |                 numpy.uint64(iterationsCL),
     |  588 |                 numpy.uint32(nprnd(2**30/jobs)),
     |  589 |                 numpy.uint32(nprnd(2**30/jobs)))
 709 |  590 | 
 710 |  591 | 
 711 |  592 |         print "%s with %i %s done" % (Alu,jobs,ParaStyle)
 ... |  ... |
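The three surviving launch paths differ only in NDRange geometry: the first branch launches SplutterGlobal over (jobs,) work-items and lets the runtime pick the local size (None), the Hybrid branch splits (jobs,) into work-groups of `threads`, and the last branch runs SplutterLocal with all work-items in a single work-group. The mapping of these branches onto the -p Blocks/Hybrid/Threads styles is inferred from the surrounding print statements, and the concrete numbers below are illustrative only:

    jobs=1024
    threads=32     # whatever BestThreadsNumber(256) returns; value assumed here

    geometries={
        'Blocks (SplutterGlobal)': (jobs,None),     # local size left to the runtime
        'Hybrid (SplutterHybrid)': (jobs,threads),  # jobs/threads work-groups
        'Threads (SplutterLocal)': (jobs,jobs),     # one work-group of jobs items
    }
    for style,(glob,loc) in geometries.items():
        groups='runtime-defined' if loc is None else glob//loc
        print("%s: global=%s local=%s work-groups=%s" % (style,glob,loc,groups))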
 719 |  600 |         print Splutter,sum(Splutter)
 720 |  601 |         #MySplutter[i]=numpy.median(Splutter)
 721 |  602 |         #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)
     |  603 | 
     |  604 |     SplutterCL.release()
 722 |  605 | 
 723 |      |     SplutterCL.release()
 724 |      | 
 725 |  606 |     print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)
 726 |  607 | 
 727 |  608 |     return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration))
 ... |  ... |
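The debug print of Splutter and sum(Splutter) doubles as a correctness check: with atomic_inc every draw is recorded exactly once, so one launch adds exactly work-items times per-item iterations to the histogram. A sketch of that check written as a helper (names illustrative, not part of the file):

    import numpy

    # Illustrative host-side check run after one launch of a Splutter kernel.
    def CheckSplutter(Splutter,work_items,iterations_per_item):
        total=int(numpy.sum(Splutter))
        expected=work_items*iterations_per_item
        print("sum(Splutter)=%i expected=%i" % (total,expected))
        return total==expected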
 830 |  711 | Curves=False
 831 |  712 | # Fit is True to print the curves
 832 |  713 | Fit=False
 833 |      | # Spluttering is Dense by default
 834 |      | Density='All'
     |  714 | # Memory of vector explored
     |  715 | Memory=1024
 835 |  716 | 
 836 |  717 | try:
 837 |      |     opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:y:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","density="])
     |  718 |     opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:m:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
 838 |  719 | except getopt.GetoptError:
 839 |      |     print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
     |  720 |     print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]
 840 |  721 |     sys.exit(2)
 841 |  722 | 
 842 |  723 | for opt, arg in opts:
 843 |  724 |     if opt == '-h':
 844 |      |         print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
     |  725 |         print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> -m <MemoryRaw>' % sys.argv[0]
 845 |  726 | 
 846 |  727 |         print "\nInformations about devices detected under OpenCL:"
 847 |  728 |         # For PyOpenCL import
 ... |  ... |
 865 |  746 |         Metrology='OutMetro'
 866 |  747 |     elif opt == '-c':
 867 |  748 |         Curves=True
 868 |      |     elif opt in ("-y", "--density"):
 869 |      |         Density = arg
 870 |  749 |     elif opt == '-f':
 871 |  750 |         Fit=True
 872 |  751 |     elif opt in ("-a", "--alu"):
 ... |  ... |
 887 |  766 |         JobStep = int(arg)
 888 |  767 |     elif opt in ("-r", "--redo"):
 889 |  768 |         Redo = int(arg)
     |  769 |     elif opt in ("-m", "--memory"):
     |  770 |         Memory = int(arg)
 890 |  771 | 
 891 |  772 | print "Toto %s" % Alu
 892 |  773 | 
 ... |  ... |
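The -y/--density option is removed and a -m option takes its place. Note that the handler added just above also tests for "--memory", although the new getopt call does not register a memory= long option, so only -m actually reaches it. A minimal, self-contained reproduction of the new parsing (the long option is registered here purely for illustration):

    import getopt

    argv=['-g','OpenCL','-p','Blocks','-i','1000000','-m','65536']
    opts,args=getopt.getopt(argv,"g:p:i:m:",["gpustyle=","parastyle=","iterations=","memory="])
    settings=dict(opts)
    Memory=int(settings.get('-m',1024))   # 1024 is the default set in this revision
    print(Memory)                         # 65536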
 902 |  783 | print "Device Identification : %s" % Device
 903 |  784 | print "GpuStyle used : %s" % GpuStyle
 904 |  785 | print "Parallel Style used : %s" % ParaStyle
 905 |      | print "Density Spluttering : %s" % Density
 906 |  786 | print "Iterations : %s" % Iterations
 907 |  787 | print "Number of threads on start : %s" % JobStart
 908 |  788 | print "Number of threads on end : %s" % JobEnd
 909 |  789 | print "Number of redo : %s" % Redo
     |  790 | print "Memory : %s" % Memory
 910 |  791 | print "Metrology done out of CPU/GPU : %r" % OutMetrology
 911 |  792 | 
 912 |  793 | if GpuStyle=='CUDA':
 ... |  ... |
 954 |  835 |         start=time.time()
 955 |  836 |         if GpuStyle=='CUDA':
 956 |  837 |             try:
 957 |      |                 a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Density)
     |  838 |                 a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,
     |  839 |                                      Memory)
 958 |  840 |             except:
 959 |  841 |                 print "Problem with %i // computations on Cuda" % Jobs
 960 |  842 |         elif GpuStyle=='OpenCL':
 961 |  843 |             try:
 962 |  844 |                 a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle,
 963 |      |                                        Alu,Device,Density)
     |  845 |                                        Alu,Device,Memory)
 964 |  846 |             except:
 965 |  847 |                 print "Problem with %i // computations on OpenCL" % Jobs
 966 |  848 |         duration=numpy.append(duration,time.time()-start)
 ... |  ... |
 974 |  856 |         if GpuStyle=='CUDA':
 975 |  857 |             try:
 976 |  858 |                 avg,med,std=MetropolisCuda(circle,Iterations,Redo,
 977 |      |                                            Jobs,ParaStyle,Density)
     |  859 |                                            Jobs,ParaStyle,Memory)
 978 |  860 |             except:
 979 |  861 |                 print "Problem with %i // computations on Cuda" % Jobs
 980 |  862 |         elif GpuStyle=='OpenCL':
 981 |  863 |             try:
 982 |      |                 avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,
 983 |      |                                              Jobs,ParaStyle,Alu,Device,Density)
     |  864 |                 avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,
     |  865 |                                              ParaStyle,Alu,Device,Memory)
 984 |  866 |             except:
 985 |  867 |                 print "Problem with %i // computations on OpenCL" % Jobs
 986 |  868 | 
|
Formats disponibles : Unified diff