root / Pi / XPU / PiXpuMPI.py @ 240
Historique | Voir | Annoter | Télécharger (14,16 ko)
1 |
#!/usr/bin/env python3
|
---|---|
2 |
|
3 |
#
|
4 |
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
|
5 |
#
|
6 |
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
7 |
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
|
8 |
#
|
9 |
# Thanks to Andreas Klockner for PyCUDA:
|
10 |
# http://mathema.tician.de/software/pycuda
|
11 |
# Thanks to Andreas Klockner for PyOpenCL:
|
12 |
# http://mathema.tician.de/software/pyopencl
|
13 |
#
|
14 |
|
15 |
# 2013-01-01 : problems with launch timeout
|
16 |
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
|
17 |
# Option "Interactive" "0" in /etc/X11/xorg.conf
|
18 |
|
19 |
# Common tools
|
20 |
import numpy |
21 |
from numpy.random import randint as nprnd |
22 |
import sys |
23 |
import getopt |
24 |
import time |
25 |
import math |
26 |
import itertools |
27 |
from socket import gethostname |
28 |
|
29 |
import mpi4py |
30 |
from mpi4py import MPI |
31 |
|
32 |
from PiXPU import * |
33 |
|
34 |
if __name__=='__main__':

    # MPI initialisation: rank 0 drives the parameter exploration and
    # aggregates results, every other rank is a compute slave waiting
    # for work orders tagged 11.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Total number of MPI ranks (rank 0 included)
    RankSize=comm.Get_size()

    if rank == 0:

        # --- Default values, overridable from the command line ---

        # Id of Device : 1 is for first find !
        Device=1
        # GPU style can be Cuda (Nvidia implementation) or OpenCL
        GpuStyle='OpenCL'
        # Iterations is integer
        Iterations=10000000
        # BlocksBegin is first number of Blocks to explore
        BlocksBegin=1
        # BlocksEnd is last number of Blocks to explore
        BlocksEnd=16
        # BlocksStep is the step of Blocks to explore
        BlocksStep=1
        # ThreadsBegin is first number of Threads to explore
        ThreadsBegin=1
        # ThreadsEnd is last number of Threads to explore
        ThreadsEnd=1
        # ThreadsStep is the step of Threads to explore
        ThreadsStep=1
        # Redo is the times to redo the test to improve metrology
        Redo=1
        # OutMetrology is method for duration estimation : False is GPU inside
        OutMetrology=False
        Metrology='InMetro'
        # Curves is True to print the curves
        Curves=False
        # Fit is True to fit/print the curves
        Fit=False
        # Marsaglia RNG flavour
        RNG='MWC'
        # Base seeds for the RNG streams (perturbed per rank below)
        Seeds=110271,101008
        # Value type : INT32, INT64, FP32, FP64
        ValueType='FP32'
        # Inside based on If
        IfThen=False

        HowToUse='%s -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadssTep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

        try:
            opts, args = getopt.getopt(sys.argv[1:],"hckg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksBegin=","blocksEnd=","blocksStep=","threadsFirst=","threadsLast=","threadssTep=","redo=","device=","marsaglia=","valuetype="])
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)

        # Devices selected with -d; Alu maps device id -> detected ALU type
        Devices=[]
        Alu={}

        for opt, arg in opts:
            if opt == '-h':
                print(HowToUse % sys.argv[0])

                print("\nInformations about devices detected under OpenCL:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
                    Id=0
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
                            #deviceType=cl.device_type.to_string(device.type)
                            deviceType="xPU"
                            print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                            Id=Id+1
                    # BUGFIX: bare "print" was a Python 2 leftover and a
                    # no-op expression in Python 3; call it for the blank line.
                    print()
                except Exception:
                    print("Your platform does not seem to support OpenCL")

                print("\nInformations about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
                    cuda.init()
                    for Id in range(cuda.Device.count()):
                        device=cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id,device.name()))
                    print()
                except Exception:
                    print("Your platform does not seem to support CUDA")

                sys.exit()

            elif opt == '-c':
                Curves=True
            elif opt == '-k':
                IfThen=True
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
                BlocksEnd = BlocksBegin
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)

        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)

        if GpuStyle=='CUDA':
            try:
                # For PyCUDA import
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                    if Id in Devices:
                        Alu[Id]='GPU'
            except ImportError:
                print("Platform does not seem to support CUDA")

        if GpuStyle=='OpenCL':
            try:
                # For PyOpenCL import
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                        if Id in Devices:
                            # Set the Alu as detected Device Type
                            Alu[Id]=deviceType
                        Id=Id+1
            except ImportError:
                print("Platform does not seem to support OpenCL")

        print(Devices,Alu)

        BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
        ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

        # Accumulators over the explored (Blocks,Threads) grid
        ExploredJobs=numpy.array([]).astype(numpy.uint32)
        ExploredBlocks=numpy.array([]).astype(numpy.uint32)
        ExploredThreads=numpy.array([]).astype(numpy.uint32)
        avgD=numpy.array([]).astype(numpy.float32)
        medD=numpy.array([]).astype(numpy.float32)
        stdD=numpy.array([]).astype(numpy.float32)
        minD=numpy.array([]).astype(numpy.float32)
        maxD=numpy.array([]).astype(numpy.float32)
        avgR=numpy.array([]).astype(numpy.float32)
        medR=numpy.array([]).astype(numpy.float32)
        stdR=numpy.array([]).astype(numpy.float32)
        minR=numpy.array([]).astype(numpy.float32)
        maxR=numpy.array([]).astype(numpy.float32)

        # Share of iterations handed to each device.
        # BUGFIX: use floor division; "/" promotes numpy.uint64 to float64
        # and loses precision for very large iteration counts.
        IterationsMPI=numpy.uint64(Iterations//len(Devices))
        if Iterations%len(Devices)!=0:
            IterationsMPI+=1

        for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

            ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
            ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
            ExploredThreads=numpy.append(ExploredThreads,Threads)

            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                time_start=time.time()

                r=1
                # Distribution of Devices over nodes
                InputCL={}
                InputCL['Iterations']=IterationsMPI
                InputCL['Steps']=1
                InputCL['Blocks']=Blocks
                InputCL['Threads']=Threads
                InputCL['RNG']=RNG
                InputCL['ValueType']=ValueType
                InputCL['GpuStyle']=GpuStyle
                InputCL['IfThen']=IfThen

                # Hand one device to each slave rank (rank r gets Devices[r])
                for Device in Devices[1:]:
                    print("Send to device %i on rank %i" % (Device,r))
                    InputCL['Device']=Device
                    # Derive per-rank offsets so every RNG stream differs
                    DeltaD=Device-min(Devices)+r+1
                    DeltaS=(DeltaD-1+r)*524287
                    InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                    comm.send('CONTINUE',dest=r,tag=11)
                    comm.send(InputCL,dest=r,tag=11)
                    r+=1

                # Compute on rank 0 with the first selected device
                print("Compute on rank 0")
                # BUGFIX: the seed derivation used the stale loop variable
                # "Device"; rank 0 actually computes on Devices[0].
                DeltaD=Devices[0]-min(Devices)+1
                DeltaS=(DeltaD-1)*524287
                InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                InputCL['Device']=Devices[0]

                if GpuStyle=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                        # BUGFIX: re-raise instead of falling through with an
                        # undefined or stale OutputCL (silent wrong results).
                        raise
                elif GpuStyle=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))
                        raise

                Inside=OutputCL['Inside']
                NewIterations=OutputCL['NewIterations']

                # Gather and accumulate the partial results of the slaves
                for slave in range(1,len(Devices)):
                    print("Get OutputCL from %i" % slave)
                    OutputCL=comm.recv(source=slave,tag=11)
                    print(OutputCL)
                    NewIterations+=OutputCL['NewIterations']
                    Inside+=OutputCL['Inside']

                print("Pi estimation %.8f" % (4./NewIterations*Inside))

                Duration=numpy.append(Duration,time.time()-time_start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])

            avgD=numpy.append(avgD,numpy.average(Duration))
            medD=numpy.append(medD,numpy.median(Duration))
            stdD=numpy.append(stdD,numpy.std(Duration))
            minD=numpy.append(minD,numpy.min(Duration))
            maxD=numpy.append(maxD,numpy.max(Duration))
            avgR=numpy.append(avgR,numpy.average(Rate))
            medR=numpy.append(medR,numpy.median(Rate))
            stdR=numpy.append(stdR,numpy.std(Rate))
            minR=numpy.append(minR,numpy.min(Rate))
            maxR=numpy.append(maxR,numpy.max(Rate))

            print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

        # Persist the whole exploration, binary (.npz) and text flavours
        numpy.savez("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
        ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
        numpy.savetxt("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

        if Fit:
            # BUGFIX: "median" was undefined (NameError); the fitted series
            # is the per-configuration median duration medD.
            FitAndPrint(ExploredJobs,medD,Curves)

        # Send MPI exit tag so every slave leaves its work loop
        for slave in range(1,RankSize):
            comm.send('BREAK',dest=slave,tag=11)

    else:
        # Slave ranks: serve work orders from rank 0 until 'BREAK' arrives
        while True:
            Signal=comm.recv(source=0,tag=11)
            if Signal=='CONTINUE':
                # Receive information from Master
                InputCL=comm.recv(source=0,tag=11)
                print("Parameters retrieved for rank %s of %s on %s from master:" % (rank,RankSize,gethostname()))
                # BUGFIX: "'Input CL:' % InputCL" had no %s placeholder and
                # silently dropped the payload (dict used as % mapping).
                print("Input CL: %s" % InputCL)

                # Execute on slave
                if InputCL['GpuStyle']=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (InputCL['Blocks'],InputCL['Threads']))
                        raise
                elif InputCL['GpuStyle']=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (InputCL['Blocks'],InputCL['Threads']))
                        raise

                print("Output CL: %s" % OutputCL)
                # Send information to Master
                comm.send(OutputCL,dest=0,tag=11)
                print("Data sent to master")
            else:
                print('Exit signal from Master')
                break