root / Pi / XPU / PiXpuMPI.py @ 246

#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf
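
# This driver distributes the Pi-by-MonteCarlo benchmark over MPI ranks: rank 0
# parses the options, sends one work unit per additional device to the worker
# ranks, computes its own share on the first device, then gathers and merges the
# partial results. The compute kernels (MetropolisCuda / MetropolisOpenCL) come
# from PiXPU.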

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
import itertools
from socket import gethostname

import mpi4py
from mpi4py import MPI

from PiXPU import *

if __name__=='__main__':

    # MPI Init
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Define number of Nodes on which computing is performed (excluding 0)
    RankSize=comm.Get_size()

    if rank == 0:

        # Set default values

        # Id of Device : 1 is for first find !
        Device=1
        # GPU style can be CUDA (Nvidia implementation) or OpenCL
        GpuStyle='OpenCL'
        # Iterations is integer
        Iterations=10000000
        # BlocksBegin is the first number of Blocks to explore
        BlocksBegin=1
        # BlocksEnd is the last number of Blocks to explore
        BlocksEnd=16
        # BlocksStep is the step of Blocks to explore
        BlocksStep=1
        # ThreadsBegin is the first number of Threads to explore
        ThreadsBegin=1
        # ThreadsEnd is the last number of Threads to explore
        ThreadsEnd=1
        # ThreadsStep is the step of Threads to explore
        ThreadsStep=1
        # Redo is the number of times to redo the test to improve metrology
        Redo=1
        # OutMetrology is the method for duration estimation : False means timing is done inside the GPU
        OutMetrology=False
        Metrology='InMetro'
        # Curves is True to print the curves
        Curves=False
        # Fit is True to fit and print the curves
        Fit=False
        # Marsaglia RNG
        RNG='MWC'
        # Seeds
        Seeds=110271,101008
        # Value type : INT32, INT64, FP32, FP64
        ValueType='FP32'
        # Inside based on If
        IfThen=False

        HowToUse='%s -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadsStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
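
        # Example invocation (hypothetical device ids and iteration count), launched
        # under an MPI launcher so that rank 0 drives device 0 and rank 1 drives device 1:
        #   mpirun -np 2 python3 PiXpuMPI.py -g OpenCL -d 0 -d 1 -i 100000000 -r 5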

        try:
            opts, args = getopt.getopt(sys.argv[1:],"hckg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)

        # List of Devices
        Devices=[]
        Alu={}

        for opt, arg in opts:
            if opt == '-h':
                print(HowToUse % sys.argv[0])

                print("\nInformation about devices detected under OpenCL:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
                    Id=0
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
                            #deviceType=cl.device_type.to_string(device.type)
                            deviceType="xPU"
                            print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                            Id=Id+1

                    print()
                except:
                    print("Your platform does not seem to support OpenCL")

                print("\nInformation about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
                    cuda.init()
                    for Id in range(cuda.Device.count()):
                        device=cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id,device.name()))
                    print()
                except:
                    print("Your platform does not seem to support CUDA")

                sys.exit()

            elif opt == '-c':
                Curves=True
            elif opt == '-k':
                IfThen=True
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
                BlocksEnd = BlocksBegin
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)

        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)

        if GpuStyle=='CUDA':
            try:
                # For PyCUDA import
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                    if Id in Devices:
                        Alu[Id]='GPU'
            except ImportError:
                print("Platform does not seem to support CUDA")

        if GpuStyle=='OpenCL':
            try:
                # For PyOpenCL import
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                        if Id in Devices:
                            # Set the Alu as detected Device Type
                            Alu[Id]=deviceType
                        Id=Id+1
            except ImportError:
                print("Platform does not seem to support OpenCL")

        print(Devices,Alu)

        BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
        ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
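
        # Every (Blocks,Threads) pair in the Cartesian product of these two ranges
        # is benchmarked below (itertools.product), Redo times each.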

        ExploredJobs=numpy.array([]).astype(numpy.uint32)
        ExploredBlocks=numpy.array([]).astype(numpy.uint32)
        ExploredThreads=numpy.array([]).astype(numpy.uint32)
        avgD=numpy.array([]).astype(numpy.float32)
        medD=numpy.array([]).astype(numpy.float32)
        stdD=numpy.array([]).astype(numpy.float32)
        minD=numpy.array([]).astype(numpy.float32)
        maxD=numpy.array([]).astype(numpy.float32)
        avgR=numpy.array([]).astype(numpy.float32)
        medR=numpy.array([]).astype(numpy.float32)
        stdR=numpy.array([]).astype(numpy.float32)
        minR=numpy.array([]).astype(numpy.float32)
        maxR=numpy.array([]).astype(numpy.float32)

        IterationsMPI=numpy.uint64(Iterations/len(Devices))
        if Iterations%len(Devices)!=0:
            IterationsMPI+=1
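        # The per-device iteration count is rounded up, so the total number of
        # iterations actually performed may differ slightly from the requested Iterations.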

        for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

            ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
            ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
            ExploredThreads=numpy.append(ExploredThreads,Threads)

            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                time_start=time.time()

                r=1
                # Distribution of Devices over nodes
                InputCL={}
                InputCL['Iterations']=IterationsMPI
                InputCL['Steps']=1
                InputCL['Blocks']=Blocks
                InputCL['Threads']=Threads
                InputCL['RNG']=RNG
                InputCL['ValueType']=ValueType
                InputCL['GpuStyle']=GpuStyle
                InputCL['IfThen']=IfThen

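                # Dispatch: for every device after the first, rank 0 sends a
                # 'CONTINUE' marker followed by the InputCL dictionary (tag 11)
                # to worker rank r; workers answer with their OutputCL dictionary.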
                for Device in Devices[1:]:
                    print("Send to device %i on rank %i" % (Device,r))
                    InputCL['Device']=Device
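                    # Derive a distinct seed pair for each device/rank (offset by
                    # 524287 = 2**19-1) so the RNG streams do not start from identical seeds.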
                    DeltaD=Device-min(Devices)+r+1
                    DeltaS=(DeltaD-1+r)*524287
                    InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                    comm.send('CONTINUE',dest=r,tag=11)
                    comm.send(InputCL,dest=r,tag=11)
                    r+=1

                # Compute on rank 0
                print("Compute on rank 0")
                DeltaD=Device-min(Devices)+1
                DeltaS=(DeltaD-1)*524287
                InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                InputCL['Device']=Devices[0]

                if GpuStyle=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" %  (Blocks,Threads))

                Inside=OutputCL['Inside']
                NewIterations=OutputCL['NewIterations']

                for slave in range(1,len(Devices)):
                    print("Get OutputCL from %i" % slave)
                    OutputCL=comm.recv(source=slave,tag=11)
                    print(OutputCL)
                    NewIterations+=OutputCL['NewIterations']
                    Inside+=OutputCL['Inside']

                print("Pi estimation %.8f" % (4./NewIterations*Inside))

                Duration=numpy.append(Duration,time.time()-time_start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])

            avgD=numpy.append(avgD,numpy.average(Duration))
            medD=numpy.append(medD,numpy.median(Duration))
            stdD=numpy.append(stdD,numpy.std(Duration))
            minD=numpy.append(minD,numpy.min(Duration))
            maxD=numpy.append(maxD,numpy.max(Duration))
            avgR=numpy.append(avgR,numpy.average(Rate))
            medR=numpy.append(medR,numpy.median(Rate))
            stdR=numpy.append(stdR,numpy.std(Rate))
            minR=numpy.append(minR,numpy.min(Rate))
            maxR=numpy.append(maxR,numpy.max(Rate))

            print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

            numpy.savez("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
            ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
            numpy.savetxt("PiMPI_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

            if Fit:
                # Fit against the median durations collected above
                FitAndPrint(ExploredJobs,medD,Curves)
        # Send MPI exit tag
        for slave in range(1,RankSize):
            comm.send('BREAK',dest=slave,tag=11)

    else:
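        # Worker ranks: block on orders from rank 0; a 'CONTINUE' marker is followed
        # by an InputCL work unit to execute, any other signal ('BREAK') ends the loop.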
        while True:
            Signal=comm.recv(source=0,tag=11)
            if Signal=='CONTINUE':
                # Receive information from Master
                InputCL=comm.recv(source=0,tag=11)
                print("Parameters retrieved for rank %s of %s on %s from master:" % (rank,RankSize,gethostname()))
                print("Input CL: %s" % InputCL)
                # Execute on slave

                if InputCL['GpuStyle']=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (InputCL['Blocks'],InputCL['Threads']))
                elif InputCL['GpuStyle']=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" %  (InputCL['Blocks'],InputCL['Threads']))

                print("Output CL: %s" % OutputCL)
                # Send information to Master
                comm.send(OutputCL,dest=0,tag=11)
                print("Data sent to master")
            else:
                print('Exit signal from Master')
                break