#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import getopt
import itertools
import math
import sys
import time
from socket import gethostname

import numpy
from numpy.random import randint as nprnd
import mpi4py
from mpi4py import MPI

from PiXPU import *
33 | 107 | equemene | |
if __name__=='__main__':

    # Pi Monte Carlo estimation distributed over MPI ranks: rank 0 parses
    # the command line, sends one workload per selected device to the slave
    # ranks, computes its own share on the first device, then gathers and
    # merges the partial results. Ranks > 0 loop on work orders until they
    # receive the 'BREAK' signal.

    # MPI Init
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Total number of ranks (rank 0, the master, included)
    RankSize=comm.Get_size()

    if rank == 0:

        # Set defaults values

        # Id of Device : 1 is for first found
        Device=1
        # GPU style can be CUDA (Nvidia implementation) or OpenCL
        GpuStyle='OpenCL'
        # Iterations is the total number of Monte Carlo shots
        Iterations=10000000
        # BlocksBegin is first number of Blocks to explore
        BlocksBegin=1
        # BlocksEnd is last number of Blocks to explore
        BlocksEnd=16
        # BlocksStep is the step of Blocks to explore
        BlocksStep=1
        # ThreadsBegin is first number of Threads to explore
        ThreadsBegin=1
        # ThreadsEnd is last number of Threads to explore
        ThreadsEnd=1
        # ThreadsStep is the step of Threads to explore
        ThreadsStep=1
        # Redo is the times to redo the test to improve metrology
        Redo=1
        # OutMetrology is method for duration estimation : False is GPU inside
        OutMetrology=False
        Metrology='InMetro'
        # Curves is True to print the curves
        Curves=False
        # Fit is True to fit and print the performance curves
        Fit=False
        # Marsaglia RNG
        RNG='MWC'
        # Base seed pair; a distinct deterministic variant is derived for
        # each device below.
        Seeds=110271,101008
        # Value type : INT32, INT64, FP32, FP64
        ValueType='FP32'
        # Inside test based on If
        IfThen=False

        HowToUse='%s -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

        try:
            # BUGFIX: getopt matches long options case-sensitively; the
            # previous camelCase declarations ("blocksBegin=", "threadssTep=",
            # ...) could never reach the lowercase handlers below, so the
            # long forms were accepted then silently ignored.
            opts, args = getopt.getopt(sys.argv[1:],"hckg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)

        # List of device ids selected with -d
        Devices=[]
        # Detected type of each selected device (e.g. 'GPU' or 'xPU')
        Alu={}

        for opt, arg in opts:
            if opt == '-h':
                print(HowToUse % sys.argv[0])

                print("\nInformations about devices detected under OpenCL:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
                    Id=0
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
                            #deviceType=cl.device_type.to_string(device.type)
                            deviceType="xPU"
                            print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                            Id=Id+1
                    # BUGFIX: bare 'print' is a no-op expression statement in
                    # Python 3; call it to emit the intended blank line.
                    print()
                except Exception:
                    print("Your platform does not seem to support OpenCL")

                print("\nInformations about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
                    cuda.init()
                    for Id in range(cuda.Device.count()):
                        device=cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id,device.name()))
                    print()
                except Exception:
                    print("Your platform does not seem to support CUDA")

                sys.exit()

            elif opt == '-c':
                Curves=True
            elif opt == '-k':
                IfThen=True
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
                BlocksEnd = BlocksBegin
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)

        # ROBUSTNESS: without at least one -d option, the workload split
        # below would divide by zero; default to the first device (id 0).
        if not Devices:
            Devices.append(0)

        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)

        if GpuStyle=='CUDA':
            try:
                # For PyCUDA import
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                    if Id in Devices:
                        Alu[Id]='GPU'
            except ImportError:
                print("Platform does not seem to support CUDA")

        if GpuStyle=='OpenCL':
            try:
                # For PyOpenCL import
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                        if Id in Devices:
                            # Set the Alu as detected Device Type
                            Alu[Id]=deviceType
                        Id=Id+1
            except ImportError:
                print("Platform does not seem to support OpenCL")

        print(Devices,Alu)

        # Inclusive exploration grids for (Blocks, Threads)
        BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
        ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

        # Per-setting accumulators: duration (D) and rate (R) statistics
        ExploredJobs=numpy.array([]).astype(numpy.uint32)
        ExploredBlocks=numpy.array([]).astype(numpy.uint32)
        ExploredThreads=numpy.array([]).astype(numpy.uint32)
        avgD=numpy.array([]).astype(numpy.float32)
        medD=numpy.array([]).astype(numpy.float32)
        stdD=numpy.array([]).astype(numpy.float32)
        minD=numpy.array([]).astype(numpy.float32)
        maxD=numpy.array([]).astype(numpy.float32)
        avgR=numpy.array([]).astype(numpy.float32)
        medR=numpy.array([]).astype(numpy.float32)
        stdR=numpy.array([]).astype(numpy.float32)
        minR=numpy.array([]).astype(numpy.float32)
        maxR=numpy.array([]).astype(numpy.float32)

        # Share of iterations per device, rounded up so the sum covers the
        # requested total.
        # BUGFIX: use integer floor division; the previous true division
        # went through float64 and loses precision for very large counts.
        IterationsMPI=numpy.uint64(int(Iterations)//len(Devices))
        if int(Iterations)%len(Devices)!=0:
            IterationsMPI+=1

        for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

            ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
            ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
            ExploredThreads=numpy.append(ExploredThreads,Threads)

            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                time_start=time.time()

                # Next slave rank to feed
                r=1
                # Workload description shared by all devices
                InputCL={}
                InputCL['Iterations']=IterationsMPI
                InputCL['Steps']=1
                InputCL['Blocks']=Blocks
                InputCL['Threads']=Threads
                InputCL['RNG']=RNG
                InputCL['ValueType']=ValueType
                InputCL['GpuStyle']=GpuStyle
                InputCL['IfThen']=IfThen

                # Distribution of Devices over slave ranks; each one gets a
                # distinct, deterministic seed pair derived from the base.
                for Device in Devices[1:]:
                    print("Send to device %i on rank %i" % (Device,r))
                    InputCL['Device']=Device
                    DeltaD=Device-min(Devices)+r+1
                    DeltaS=(DeltaD-1+r)*524287
                    InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                    comm.send('CONTINUE',dest=r,tag=11)
                    comm.send(InputCL,dest=r,tag=11)
                    r+=1

                # Compute on rank 0 with the first selected device
                print("Compute on rank 0")
                DeltaD=Device-min(Devices)+1
                DeltaS=(DeltaD-1)*524287
                InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                InputCL['Device']=Devices[0]

                if GpuStyle=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))

                Inside=OutputCL['Inside']
                NewIterations=OutputCL['NewIterations']

                # Merge partial results coming back from the slave ranks
                for slave in range(1,len(Devices)):
                    print("Get OutputCL from %i" % slave)
                    OutputCL=comm.recv(source=slave,tag=11)
                    print(OutputCL)
                    NewIterations+=OutputCL['NewIterations']
                    Inside+=OutputCL['Inside']

                print("Pi estimation %.8f" % (4./NewIterations*Inside))

                Duration=numpy.append(Duration,time.time()-time_start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])

            avgD=numpy.append(avgD,numpy.average(Duration))
            medD=numpy.append(medD,numpy.median(Duration))
            stdD=numpy.append(stdD,numpy.std(Duration))
            minD=numpy.append(minD,numpy.min(Duration))
            maxD=numpy.append(maxD,numpy.max(Duration))
            avgR=numpy.append(avgR,numpy.average(Rate))
            medR=numpy.append(medR,numpy.median(Rate))
            stdR=numpy.append(stdR,numpy.std(Rate))
            minR=numpy.append(minR,numpy.min(Rate))
            maxR=numpy.append(maxR,numpy.max(Rate))

            print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

        # Persist both a binary (.npz) and a text dump of the results
        numpy.savez("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
        numpy.savetxt("PiMPI_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

        if Fit:
            # BUGFIX: 'median' was an undefined name (NameError when Fit is
            # set); the per-setting median durations collected above are medD.
            FitAndPrint(ExploredJobs,medD,Curves)

        # Send MPI exit tag to every slave rank
        for slave in range(1,RankSize):
            comm.send('BREAK',dest=slave,tag=11)

    else:
        # Slave ranks: serve work orders from the master until 'BREAK'.
        while True:
            Signal=comm.recv(source=0,tag=11)
            if Signal=='CONTINUE':
                # Receive information from Master
                InputCL=comm.recv(source=0,tag=11)
                print("Parameters retrieved for rank %s of %s on %s from master:" % (rank,RankSize,gethostname()))
                # BUGFIX: the format string had no %s, so "str % dict"
                # returned the string unchanged and the payload was never
                # printed.
                print("Input CL: %s" % InputCL)

                # Execute on slave
                if InputCL['GpuStyle']=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (InputCL['Blocks'],InputCL['Threads']))
                elif InputCL['GpuStyle']=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (InputCL['Blocks'],InputCL['Threads']))

                print("Output CL: %s" % OutputCL)
                # Send information to Master
                comm.send(OutputCL,dest=0,tag=11)
                print("Data sent to master")
            else:
                print('Exit signal from Master')
                break