Révision 66 Splutter/GPU/SplutterGPU.py
SplutterGPU.py (revision 66) | ||
---|---|---|
7 | 7 |
# |
8 | 8 |
# Thanks to Andreas Klockner for PyCUDA: |
9 | 9 |
# http://mathema.tician.de/software/pycuda |
10 |
# http://mathema.tician.de/software/pyopencl |
|
10 | 11 |
# |
11 | 12 |
|
12 | 13 |
# 2013-01-01 : problems with launch timeout |
13 | 14 |
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time |
14 | 15 |
# Option "Interactive" "0" in /etc/X11/xorg.conf |
15 | 16 |
|
17 |
# Marsaglia elements about RNG |
|
18 |
|
|
16 | 19 |
# Common tools |
17 | 20 |
import numpy |
18 | 21 |
from numpy.random import randint as nprnd |
... | ... | |
64 | 67 |
def Mylq2(N, T1,s,c1,c2,p): |
65 | 68 |
return (T1*(s+p/N)+c1*N+c2*N*N) |
66 | 69 |
|
70 |
prout=""" |
|
71 |
|
|
72 |
""" |
|
73 |
|
|
74 |
|
|
67 | 75 |
KERNEL_CODE_CUDA=""" |
68 | 76 |
|
69 | 77 |
// Marsaglia RNG very simple implementation |
... | ... | |
82 | 90 |
#define KISSfp KISS * 2.328306435454494e-10f |
83 | 91 |
|
84 | 92 |
#define MAX (ulong)4294967296 |
93 |
#define UMAX (uint)2147483648 |
|
85 | 94 |
|
95 |
__global__ void SplutterGlobal(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
96 |
{ |
|
97 |
const ulong id=(ulong)(blockIdx.x); |
|
98 |
|
|
99 |
uint z=seed_z-(uint)id; |
|
100 |
uint w=seed_w+(uint)id; |
|
101 |
|
|
102 |
uint jsr=seed_z; |
|
103 |
uint jcong=seed_w; |
|
104 |
|
|
105 |
for ( ulong i=0;i<iterations;i++) { |
|
106 |
|
|
107 |
// All version |
|
108 |
uint position=(uint)( ((ulong)MWC*(ulong)space)/MAX ); |
|
109 |
|
|
110 |
// UMAX is set to avoid round over overflow |
|
111 |
atomicInc(&s[position],UMAX); |
|
112 |
} |
|
113 |
|
|
114 |
__syncthreads(); |
|
115 |
} |
|
116 |
|
|
86 | 117 |
__global__ void SplutterGlobalDense(uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
87 | 118 |
{ |
88 | 119 |
const ulong id=(ulong)(threadIdx.x+blockIdx.x*blockDim.x); |
... | ... | |
229 | 260 |
""" |
230 | 261 |
|
231 | 262 |
KERNEL_CODE_OPENCL=""" |
263 |
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable |
|
264 |
|
|
232 | 265 |
// Marsaglia RNG very simple implementation |
233 | 266 |
#define znew ((z=36969*(z&65535)+(z>>16))<<16) |
234 | 267 |
#define wnew ((w=18000*(w&65535)+(w>>16))&65535) |
... | ... | |
253 | 286 |
return (value >> shift) | (value << (sizeof(value) * CHAR_BIT - shift)); |
254 | 287 |
} |
255 | 288 |
|
289 |
__kernel void SplutterGlobal(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
|
290 |
{ |
|
291 |
__private const ulong id=(ulong)get_global_id(0); |
|
292 |
__private const ulong size=(ulong)get_global_size(0); |
|
293 |
__private const ulong block=(ulong)space/(ulong)size; |
|
294 |
|
|
295 |
__private uint z=seed_z-(uint)id; |
|
296 |
__private uint w=seed_w+(uint)id; |
|
297 |
|
|
298 |
__private uint jsr=seed_z; |
|
299 |
__private uint jcong=seed_w; |
|
300 |
|
|
301 |
for (__private ulong i=0;i<iterations;i++) { |
|
302 |
|
|
303 |
// Dense version |
|
304 |
__private size_t position=(size_t)( ((ulong)MWC*(ulong)space)/MAX ); |
|
305 |
|
|
306 |
atomic_inc(&s[position]); |
|
307 |
} |
|
308 |
|
|
309 |
barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); |
|
310 |
|
|
311 |
} |
|
312 |
|
|
256 | 313 |
__kernel void SplutterGlobalDense(__global uint *s,const uint space,const ulong iterations,const uint seed_w,const uint seed_z) |
257 | 314 |
{ |
258 | 315 |
__private const ulong id=(ulong)get_global_id(0); |
... | ... | |
395 | 452 |
|
396 | 453 |
""" |
397 | 454 |
|
398 |
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Dense):
|
|
455 |
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,Density):
|
|
399 | 456 |
|
400 | 457 |
# Avec PyCUDA autoinit, rien a faire ! |
401 |
|
|
458 |
|
|
402 | 459 |
circleCU = cuda.InOut(circle) |
403 | 460 |
|
404 |
print "prout" |
|
405 |
|
|
406 | 461 |
mod = SourceModule(KERNEL_CODE_CUDA) |
407 | 462 |
|
408 |
print "prout 2" |
|
409 |
|
|
410 |
if Dense: |
|
463 |
if Density=='Dense': |
|
411 | 464 |
MetropolisBlocksCU=mod.get_function("SplutterGlobalDense") |
412 | 465 |
MetropolisThreadsCU=mod.get_function("SplutterLocalDense") |
413 | 466 |
MetropolisHybridCU=mod.get_function("SplutterHybridDense") |
414 |
else:
|
|
467 |
elif Density=='Sparse':
|
|
415 | 468 |
MetropolisBlocksCU=mod.get_function("SplutterGlobalSparse") |
416 | 469 |
MetropolisThreadsCU=mod.get_function("SplutterLocalSparse") |
417 | 470 |
MetropolisHybridCU=mod.get_function("SplutterHybridSparse") |
418 |
|
|
419 |
print "prout 3"
|
|
420 |
|
|
471 |
else: |
|
472 |
MetropolisBlocksCU=mod.get_function("SplutterGlobal")
|
|
473 |
|
|
421 | 474 |
start = pycuda.driver.Event() |
422 | 475 |
stop = pycuda.driver.Event() |
423 | 476 |
|
... | ... | |
496 | 549 |
return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration)) |
497 | 550 |
|
498 | 551 |
|
499 |
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,Dense): |
|
552 |
def MetropolisOpenCL(circle,iterations,steps,jobs, |
|
553 |
ParaStyle,Alu,Device,Density): |
|
500 | 554 |
|
501 | 555 |
# Initialisation des variables en les CASTant correctement |
502 | 556 |
|
... | ... | |
605 | 659 |
# numpy.uint64(iterationsCL), |
606 | 660 |
# numpy.uint32(nprnd(2**30/jobs)), |
607 | 661 |
# numpy.uint32(nprnd(2**30/jobs))) |
608 |
if Dense:
|
|
662 |
if Density=='Dense':
|
|
609 | 663 |
CLLaunch=MetropolisCL.SplutterGlobalDense(queue,(jobs,),None, |
610 | 664 |
SplutterCL, |
611 | 665 |
numpy.uint32(len(Splutter)), |
612 | 666 |
numpy.uint64(iterationsCL), |
613 | 667 |
numpy.uint32(521288629), |
614 | 668 |
numpy.uint32(362436069)) |
615 |
else:
|
|
669 |
elif Density=='Sparse':
|
|
616 | 670 |
CLLaunch=MetropolisCL.SplutterGlobalSparse(queue,(jobs,),None, |
617 | 671 |
SplutterCL, |
618 | 672 |
numpy.uint32(len(Splutter)), |
... | ... | |
620 | 674 |
numpy.uint32(521288629), |
621 | 675 |
numpy.uint32(362436069)) |
622 | 676 |
|
677 |
else: |
|
678 |
CLLaunch=MetropolisCL.SplutterGlobal(queue,(jobs,),None, |
|
679 |
SplutterCL, |
|
680 |
numpy.uint32(len(Splutter)), |
|
681 |
numpy.uint64(iterationsCL), |
|
682 |
numpy.uint32(521288629), |
|
683 |
numpy.uint32(362436069)) |
|
684 |
|
|
623 | 685 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
624 | 686 |
(Alu,jobs,1,ParaStyle) |
625 | 687 |
elif ParaStyle=='Hybrid': |
626 | 688 |
threads=BestThreadsNumber(jobs) |
627 | 689 |
# en OpenCL, necessaire de mettre un Global_id identique au local_id |
628 |
if Dense:
|
|
690 |
if Density=='Dense':
|
|
629 | 691 |
CLLaunch=MetropolisCL.SplutterHybridDense(queue,(jobs,),(threads,), |
630 | 692 |
SplutterCL, |
631 | 693 |
numpy.uint32(len(Splutter)), |
632 | 694 |
numpy.uint64(iterationsCL), |
633 | 695 |
numpy.uint32(nprnd(2**30/jobs)), |
634 | 696 |
numpy.uint32(nprnd(2**30/jobs))) |
635 |
else:
|
|
697 |
elif Density=='Sparse':
|
|
636 | 698 |
CLLaunch=MetropolisCL.SplutterHybridSparse(queue,(jobs,),(threads,), |
637 | 699 |
SplutterCL, |
638 | 700 |
numpy.uint32(len(Splutter)), |
... | ... | |
643 | 705 |
print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \ |
644 | 706 |
(Alu,jobs/threads,threads,ParaStyle) |
645 | 707 |
else: |
646 |
# en OpenCL, necessaire de mettre un Global_id identique au local_id
|
|
647 |
if Dense:
|
|
708 |
# en OpenCL, necessaire de mettre un global_id identique au local_id
|
|
709 |
if Density=='Dense':
|
|
648 | 710 |
CLLaunch=MetropolisCL.SplutterLocalDense(queue,(jobs,),(jobs,), |
649 | 711 |
SplutterCL, |
650 | 712 |
numpy.uint32(len(Splutter)), |
651 | 713 |
numpy.uint64(iterationsCL), |
652 | 714 |
numpy.uint32(nprnd(2**30/jobs)), |
653 | 715 |
numpy.uint32(nprnd(2**30/jobs))) |
654 |
else:
|
|
716 |
elif Density=='Sparse':
|
|
655 | 717 |
CLLaunch=MetropolisCL.SplutterLocalSparse(queue,(jobs,),(jobs,), |
656 | 718 |
SplutterCL, |
657 | 719 |
numpy.uint32(len(Splutter)), |
... | ... | |
659 | 721 |
numpy.uint32(nprnd(2**30/jobs)), |
660 | 722 |
numpy.uint32(nprnd(2**30/jobs))) |
661 | 723 |
|
724 |
|
|
662 | 725 |
print "%s with %i %s done" % (Alu,jobs,ParaStyle) |
663 | 726 |
|
664 | 727 |
CLLaunch.wait() |
... | ... | |
765 | 828 |
# Parallel distribution can be on Threads, Blocks or Hybrid |
766 | 829 |
ParaStyle='Blocks' |
767 | 830 |
# Iterations is integer |
768 |
Iterations=100000000
|
|
831 |
Iterations=10000000 |
|
769 | 832 |
# JobStart is the first number of Jobs to explore |
770 | 833 |
JobStart=1 |
771 | 834 |
# JobEnd is last number of Jobs to explore |
... | ... | |
782 | 845 |
# Fit is True to fit the curves to Amdahl's law |
783 | 846 |
Fit=False |
784 | 847 |
# Spluttering density is All by default (Dense, Sparse or All) |
785 |
Dense=True
|
|
848 |
Density='All'
|
|
786 | 849 |
|
787 | 850 |
try: |
788 |
opts, args = getopt.getopt(sys.argv[1:],"hocfvwa:g:p:i:s:e:t:r:d:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device="])
|
|
851 |
opts, args = getopt.getopt(sys.argv[1:],"hocfa:g:p:i:s:e:t:r:d:y:",["alu=","gpustyle=","parastyle=","iterations=","jobstart=","jobend=","jobstep=","redo=","device=","density="])
|
|
789 | 852 |
except getopt.GetoptError: |
790 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
|
|
853 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats> ' % sys.argv[0]
|
|
791 | 854 |
sys.exit(2) |
792 | 855 |
|
793 | 856 |
for opt, arg in opts: |
794 | 857 |
if opt == '-h': |
795 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -v (Dense Spluttering) -w (Sparse Spluttering) -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
|
|
858 |
print '%s -o (Out of Core Metrology) -c (Print Curves) -f (Fit to Amdahl Law) -y <Dense/Sparse/All> -a <CPU/GPU/ACCELERATOR> -d <DeviceId> -g <CUDA/OpenCL> -p <Threads/Hybrid/Blocks> -i <Iterations> -s <JobStart> -e <JobEnd> -t <JobStep> -r <RedoToImproveStats>' % sys.argv[0]
|
|
796 | 859 |
|
797 | 860 |
print "\nInformations about devices detected under OpenCL:" |
798 | 861 |
# For PyOpenCL import |
... | ... | |
816 | 879 |
Metrology='OutMetro' |
817 | 880 |
elif opt == '-c': |
818 | 881 |
Curves=True |
819 |
elif opt == '-v': |
|
820 |
Dense=True |
|
821 |
elif opt == '-w': |
|
822 |
Dense=False |
|
882 |
elif opt in ("-y", "--density"): |
|
883 |
Density = arg |
|
823 | 884 |
elif opt == '-f': |
824 | 885 |
Fit=True |
825 | 886 |
elif opt in ("-a", "--alu"): |
... | ... | |
841 | 902 |
elif opt in ("-r", "--redo"): |
842 | 903 |
Redo = int(arg) |
843 | 904 |
|
905 |
print "Toto %s" % Alu |
|
906 |
|
|
844 | 907 |
if Alu=='CPU' and GpuStyle=='CUDA': |
845 | 908 |
print "Alu can't be CPU for CUDA, set Alu to GPU" |
846 | 909 |
Alu='GPU' |
847 | 910 |
|
848 | 911 |
if ParaStyle not in ('Blocks','Threads','Hybrid'): |
849 | 912 |
print "%s not exists, ParaStyle set as Threads !" % ParaStyle |
850 |
ParaStyle='Threads'
|
|
913 |
ParaStyle='Blocks'
|
|
851 | 914 |
|
852 | 915 |
print "Compute unit : %s" % Alu |
853 | 916 |
print "Device Identification : %s" % Device |
854 | 917 |
print "GpuStyle used : %s" % GpuStyle |
855 | 918 |
print "Parallel Style used : %s" % ParaStyle |
856 |
print "Dense (or Sparse) Spluttering : %r" % Dense
|
|
919 |
print "Density Spluttering : %s" % Density
|
|
857 | 920 |
print "Iterations : %s" % Iterations |
858 | 921 |
print "Number of threads on start : %s" % JobStart |
859 | 922 |
print "Number of threads on end : %s" % JobEnd |
... | ... | |
905 | 968 |
start=time.time() |
906 | 969 |
if GpuStyle=='CUDA': |
907 | 970 |
try: |
908 |
print "toto" |
|
909 |
a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Dense) |
|
971 |
a,m,s=MetropolisCuda(circle,Iterations,1,Jobs,ParaStyle,Density) |
|
910 | 972 |
except: |
911 | 973 |
print "Problem with %i // computations on Cuda" % Jobs |
912 | 974 |
elif GpuStyle=='OpenCL': |
913 | 975 |
try: |
914 | 976 |
a,m,s=MetropolisOpenCL(circle,Iterations,1,Jobs,ParaStyle, |
915 |
Alu,Device,Dense)
|
|
977 |
Alu,Device,Density)
|
|
916 | 978 |
except: |
917 | 979 |
print "Problem with %i // computations on OpenCL" % Jobs |
918 | 980 |
duration=numpy.append(duration,time.time()-start) |
... | ... | |
925 | 987 |
else: |
926 | 988 |
if GpuStyle=='CUDA': |
927 | 989 |
try: |
928 |
avg,med,std=MetropolisCuda(circle,Iterations,Redo,Jobs,ParaStyle,Dense) |
|
990 |
avg,med,std=MetropolisCuda(circle,Iterations,Redo, |
|
991 |
Jobs,ParaStyle,Density) |
|
929 | 992 |
except: |
930 | 993 |
print "Problem with %i // computations on Cuda" % Jobs |
931 | 994 |
elif GpuStyle=='OpenCL': |
932 | 995 |
try: |
933 |
avg,med,std=MetropolisOpenCL(circle,Iterations,Redo,Jobs,ParaStyle,Alu,Device,Dense) |
|
996 |
avg,med,std=MetropolisOpenCL(circle,Iterations,Redo, |
|
997 |
Jobs,ParaStyle,Alu,Device,Density) |
|
934 | 998 |
except: |
935 | 999 |
print "Problem with %i // computations on OpenCL" % Jobs |
936 | 1000 |
|
Formats disponibles : Unified diff