root / Pi / XPU / PiXpuMPI.py @ 287
#!/usr/bin/env python3
#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf
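
# Example invocation (illustrative only: device ids, rank count and sizes
# depend on the local machines; one MPI rank is expected per requested device):
#   mpirun -np 2 python3 PiXpuMPI.py -g OpenCL -d 0 -d 1 -i 100000000 -r 4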

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
import itertools
from socket import gethostname

import mpi4py
from mpi4py import MPI

from PiXPU import *

if __name__=='__main__':

    # MPI Init
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Total number of MPI ranks (rank 0 is the master, ranks 1.. are workers)
    RankSize=comm.Get_size()
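
    # Work scheme: rank 0 parses the options, splits the iterations over the
    # requested devices, keeps the first device for itself and sends one
    # InputCL dictionary per remaining device to ranks 1..len(Devices)-1
    # ('CONTINUE' then the payload); each worker answers with its OutputCL
    # and finally receives 'BREAK' to terminate.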

    if rank == 0:

        # Set default values

        # Id of Device : 1 is for the first one found!
        Device=1
        # GPU style can be CUDA (Nvidia implementation) or OpenCL
        GpuStyle='OpenCL'
        # Iterations is an integer
        Iterations=10000000
        # BlocksBegin is the first number of Blocks to explore
        BlocksBegin=1
        # BlocksEnd is the last number of Blocks to explore
        BlocksEnd=16
        # BlocksStep is the step of Blocks to explore
        BlocksStep=1
        # ThreadsBegin is the first number of Threads to explore
        ThreadsBegin=1
        # ThreadsEnd is the last number of Threads to explore
        ThreadsEnd=1
        # ThreadsStep is the step of Threads to explore
        ThreadsStep=1
        # Redo is the number of times the test is redone to improve metrology
        Redo=1
        # OutMetrology is the method for duration estimation : False is GPU inside
        OutMetrology=False
        Metrology='InMetro'
        # Curves is True to print the curves
        Curves=False
        # Fit is True to fit the performance curves
        Fit=False
        # Marsaglia RNG
        RNG='MWC'
        # Seeds
        Seeds=110271,101008
        # Value type : INT32, INT64, FP32, FP64
        ValueType='FP32'
        # Inside test based on If/Then
        IfThen=False

        HowToUse='%s -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadsStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

        try:
            opts, args = getopt.getopt(sys.argv[1:],"hckg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)

        # List of Devices
        Devices=[]
        Alu={}

        for opt, arg in opts:
            if opt == '-h':
                print(HowToUse % sys.argv[0])

                print("\nInformation about devices detected under OpenCL:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
                    Id=0
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
                            #deviceType=cl.device_type.to_string(device.type)
                            deviceType="xPU"
                            print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                            Id=Id+1

                    print()
                except:
                    print("Your platform does not seem to support OpenCL")

                print("\nInformation about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
                    cuda.init()
                    for Id in range(cuda.Device.count()):
                        device=cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id,device.name()))
                    print()
                except:
                    print("Your platform does not seem to support CUDA")

                sys.exit()

            elif opt == '-c':
                Curves=True
            elif opt == '-k':
                IfThen=True
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
                BlocksEnd = BlocksBegin
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)

        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)

        if GpuStyle=='CUDA':
            try:
                # For PyCUDA import
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                    if Id in Devices:
                        Alu[Id]='GPU'
            except ImportError:
                print("Platform does not seem to support CUDA")

        if GpuStyle=='OpenCL':
            try:
                # For PyOpenCL import
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                        if Id in Devices:
                            # Set the Alu as detected Device Type
                            Alu[Id]=deviceType
                        Id=Id+1
            except ImportError:
                print("Platform does not seem to support OpenCL")

        print(Devices,Alu)

        BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
        ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
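        # Adding the Step to the End value above makes the upper bound
        # inclusive: e.g. Begin=1, End=16, Step=1 explores block counts 1..16.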

        ExploredJobs=numpy.array([]).astype(numpy.uint32)
        ExploredBlocks=numpy.array([]).astype(numpy.uint32)
        ExploredThreads=numpy.array([]).astype(numpy.uint32)
        avgD=numpy.array([]).astype(numpy.float32)
        medD=numpy.array([]).astype(numpy.float32)
        stdD=numpy.array([]).astype(numpy.float32)
        minD=numpy.array([]).astype(numpy.float32)
        maxD=numpy.array([]).astype(numpy.float32)
        avgR=numpy.array([]).astype(numpy.float32)
        medR=numpy.array([]).astype(numpy.float32)
        stdR=numpy.array([]).astype(numpy.float32)
        minR=numpy.array([]).astype(numpy.float32)
        maxR=numpy.array([]).astype(numpy.float32)

        IterationsMPI=numpy.uint64(Iterations//len(Devices))
        if Iterations%len(Devices)!=0:
            IterationsMPI+=1
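        # Each device receives an equal share of the requested iterations,
        # rounded up when the total does not divide evenly: for instance
        # 10000000 iterations over 3 devices give 3333334 iterations each.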

        for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

            ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
            ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
            ExploredThreads=numpy.append(ExploredThreads,Threads)

            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)

            for i in range(Redo):
                time_start=time.time()

                r=1
                # Distribution of Devices over nodes
                InputCL={}
                InputCL['Iterations']=IterationsMPI
                InputCL['Steps']=1
                InputCL['Blocks']=Blocks
                InputCL['Threads']=Threads
                InputCL['RNG']=RNG
                InputCL['ValueType']=ValueType
                InputCL['GpuStyle']=GpuStyle
                InputCL['IfThen']=IfThen

                for Device in Devices[1:]:
                    print("Send to device %i on rank %i" % (Device,r))
                    InputCL['Device']=Device
                    DeltaD=Device-min(Devices)+r+1
                    DeltaS=(DeltaD-1+r)*524287
                    InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
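                    # The two lines above derive a distinct pair of seeds for
                    # each device/rank (524287 is 2^19-1), so every device
                    # starts its Marsaglia RNG from a different state.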
                    comm.send('CONTINUE',dest=r,tag=11)
                    comm.send(InputCL,dest=r,tag=11)
                    r+=1

                # Compute on rank 0
                print("Compute on rank 0")
                DeltaD=Devices[0]-min(Devices)+1
                DeltaS=(DeltaD-1)*524287
                InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                InputCL['Device']=Devices[0]
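                # Rank 0 keeps the first requested device for itself and uses
                # its own seed offset for the local computation.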

                if GpuStyle=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))

                Inside=OutputCL['Inside']
                NewIterations=OutputCL['NewIterations']
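                # Inside is the Monte Carlo hit count (samples falling inside
                # the circle) and NewIterations the number of samples the local
                # device actually drew; they seed the totals before the
                # workers' results are added below.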

                for slave in range(1,len(Devices)):
                    print("Get OutputCL from %i" % slave)
                    OutputCL=comm.recv(source=slave,tag=11)
                    print(OutputCL)
                    NewIterations+=OutputCL['NewIterations']
                    Inside+=OutputCL['Inside']
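
                # After this reduction Inside and NewIterations aggregate all
                # devices, and Pi is estimated below as 4*Inside/NewIterations.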

                Duration=numpy.append(Duration,time.time()-time_start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])

                print("Itops %i\nLogItops %.2f " % (int(Rate[-1]),numpy.log(Rate[-1])/numpy.log(10)))
                print("Pi estimation %.8f" % (4./NewIterations*Inside))

            avgD=numpy.append(avgD,numpy.average(Duration))
            medD=numpy.append(medD,numpy.median(Duration))
            stdD=numpy.append(stdD,numpy.std(Duration))
            minD=numpy.append(minD,numpy.min(Duration))
            maxD=numpy.append(maxD,numpy.max(Duration))
            avgR=numpy.append(avgR,numpy.average(Rate))
            medR=numpy.append(medR,numpy.median(Rate))
            stdR=numpy.append(stdR,numpy.std(Rate))
            minR=numpy.append(minR,numpy.min(Rate))
            maxR=numpy.append(maxR,numpy.max(Rate))

            print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

            numpy.savez("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
            ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
            numpy.savetxt("PiMPI_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
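            # Each row of the saved files describes one (Blocks,Threads) couple:
            # Blocks Threads avgD medD stdD minD maxD avgR medR stdR minR maxR
            # with durations in seconds and rates in iterations per second.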

            if Fit:
                FitAndPrint(ExploredJobs,medR,Curves)
        # Send MPI exit tag
        for slave in range(1,RankSize):
            comm.send('BREAK',dest=slave,tag=11)
    else:
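        # Worker protocol: wait for a tag-11 message from the master; on
        # 'CONTINUE' receive an InputCL work description, run it and send the
        # OutputCL back; any other signal ('BREAK') ends the loop.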
        while True:
            Signal=comm.recv(source=0,tag=11)
            if Signal=='CONTINUE':
                # Receive information from Master
                InputCL=comm.recv(source=0,tag=11)
                print("Parameters retrieved for rank %s of %s on %s from master:" % (rank,RankSize,gethostname()))
                print("Input CL: %s" % InputCL)
                # Execute on slave

                if InputCL['GpuStyle']=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (InputCL['Blocks'],InputCL['Threads']))
                elif InputCL['GpuStyle']=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" % (InputCL['Blocks'],InputCL['Threads']))

                print("Output CL: %s" % OutputCL)
                # Send information to Master
                comm.send(OutputCL,dest=0,tag=11)
                print("Data sent to master")
            else:
                print('Exit signal from Master')
                break