root / Pi / XPU / PiXpuMPI.py @ 194
#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
import itertools
from socket import gethostname

import mpi4py
from mpi4py import MPI

from PiXPU import *
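# The star import above is expected to provide the MetropolisCuda,
# MetropolisOpenCL and FitAndPrint helpers called below (an assumption:
# they are not defined in this file).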

if __name__=='__main__':

    # MPI Init
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Total number of MPI ranks; rank 0 acts as master and also computes
    RankSize=comm.Get_size()
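
    # Master/worker protocol implemented below: rank 0 parses the options,
    # then for each (Blocks,Threads) couple sends 'CONTINUE' followed by an
    # InputCL dictionary to each worker rank (tag 11), computes its own
    # share, sums the received OutputCL results, and finally sends 'BREAK'
    # to every worker so it leaves its receive loop.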

    if rank == 0:

        # Set default values

        # Id of Device : 1 is for first found!
        Device=1
        # GPU style can be Cuda (Nvidia implementation) or OpenCL
        GpuStyle='OpenCL'
        # Iterations is integer
        Iterations=10000000
        # BlocksBegin is the first number of Blocks to explore
        BlocksBegin=1
        # BlocksEnd is the last number of Blocks to explore
        BlocksEnd=16
        # BlocksStep is the step of Blocks to explore
        BlocksStep=1
        # ThreadsBegin is the first number of Threads to explore
        ThreadsBegin=1
        # ThreadsEnd is the last number of Threads to explore
        ThreadsEnd=1
        # ThreadsStep is the step of Threads to explore
        ThreadsStep=1
        # Redo is the number of times the test is redone to improve metrology
        Redo=1
        # OutMetrology is the method for duration estimation : False is GPU inside
        OutMetrology=False
        Metrology='InMetro'
        # Curves is True to print the curves
        Curves=False
        # Fit is True to fit the measurements
        Fit=False
        # Marsaglia RNG
        RNG='MWC'
        # Value type : INT32, INT64, FP32, FP64
        ValueType='FP32'
        # Inside test based on If
        IfThen=False

        HowToUse='%s -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadsStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
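
        # Example invocation (hypothetical device ids and rank count),
        # one MPI rank per device passed with -d:
        #   mpirun -np 2 python3 PiXpuMPI.py -g OpenCL -d 0 -d 1 -i 100000000 -r 5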

        try:
            opts, args = getopt.getopt(sys.argv[1:],"hckg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)

        # List of Devices
        Devices=[]
        Alu={}

        for opt, arg in opts:
            if opt == '-h':
                print(HowToUse % sys.argv[0])

                print("\nInformation about devices detected under OpenCL:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
                    Id=0
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
                            #deviceType=cl.device_type.to_string(device.type)
                            deviceType="xPU"
                            print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                            Id=Id+1

                    print()
                except Exception:
                    print("Your platform does not seem to support OpenCL")

                print("\nInformation about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
                    cuda.init()
                    for Id in range(cuda.Device.count()):
                        device=cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id,device.name()))
                    print()
                except Exception:
                    print("Your platform does not seem to support CUDA")

                sys.exit()

            elif opt == '-c':
                Curves=True
            elif opt == '-k':
                IfThen=True
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
                BlocksEnd = BlocksBegin
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)

        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)

        if GpuStyle=='CUDA':
            try:
                # For PyCUDA import
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                    if Id in Devices:
                        Alu[Id]='GPU'
            except ImportError:
                print("Platform does not seem to support CUDA")

        if GpuStyle=='OpenCL':
            try:
                # For PyOpenCL import
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="*PU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                        if Id in Devices:
                            # Set the Alu as detected Device Type
                            Alu[Id]=deviceType
                        Id=Id+1
            except ImportError:
                print("Platform does not seem to support OpenCL")

        print(Devices,Alu)

        BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
        ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
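        # The End+Step upper bound makes each range inclusive of its End
        # value: e.g. Begin=1, End=16, Step=1 explores Blocks 1 to 16.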

        ExploredJobs=numpy.array([]).astype(numpy.uint32)
        ExploredBlocks=numpy.array([]).astype(numpy.uint32)
        ExploredThreads=numpy.array([]).astype(numpy.uint32)
        avgD=numpy.array([]).astype(numpy.float32)
        medD=numpy.array([]).astype(numpy.float32)
        stdD=numpy.array([]).astype(numpy.float32)
        minD=numpy.array([]).astype(numpy.float32)
        maxD=numpy.array([]).astype(numpy.float32)
        avgR=numpy.array([]).astype(numpy.float32)
        medR=numpy.array([]).astype(numpy.float32)
        stdR=numpy.array([]).astype(numpy.float32)
        minR=numpy.array([]).astype(numpy.float32)
        maxR=numpy.array([]).astype(numpy.float32)

        # Integer (floor) division, to avoid the precision loss of / under Python 3
        IterationsMPI=numpy.uint64(Iterations//len(Devices))
        if Iterations%len(Devices)!=0:
            IterationsMPI+=1
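        # Hand-made ceiling division: e.g. Iterations=10 over 3 devices
        # gives IterationsMPI=4 per device; the grand total actually
        # computed is reported back through NewIterations.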

        for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

            ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
            ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
            ExploredThreads=numpy.append(ExploredThreads,Threads)

            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                time_start=time.time()

                r=1
                # Distribution of Devices over nodes
                InputCL={}
                InputCL['Iterations']=IterationsMPI
                InputCL['Steps']=1
                InputCL['Blocks']=Blocks
                InputCL['Threads']=Threads
                InputCL['RNG']=RNG
                InputCL['ValueType']=ValueType
                InputCL['GpuStyle']=GpuStyle
                InputCL['IfThen']=IfThen

                for Device in Devices[1:]:
                    print("Send to device %i on rank %i" % (Device,r))
                    InputCL['Device']=Device
                    comm.send('CONTINUE',dest=r,tag=11)
                    comm.send(InputCL,dest=r,tag=11)
                    r+=1
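                # Device Devices[k] is thus handled by MPI rank k, Devices[0]
                # being kept for rank 0 below: launching at least as many
                # ranks as devices listed with -d is therefore expected.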

                # Compute on rank 0
                print("Compute on rank 0")
                InputCL['Device']=Devices[0]

                if GpuStyle=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))

                Inside=OutputCL['Inside']
                NewIterations=OutputCL['NewIterations']

                for slave in range(1,len(Devices)):
                    print("Get OutputCL from %i" % slave)
                    OutputCL=comm.recv(source=slave,tag=11)
                    print(OutputCL)
                    NewIterations+=OutputCL['NewIterations']
                    Inside+=OutputCL['Inside']

                print("Pi estimation %.8f" % (4./NewIterations*Inside))

                Duration=numpy.append(Duration,time.time()-time_start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])

            avgD=numpy.append(avgD,numpy.average(Duration))
            medD=numpy.append(medD,numpy.median(Duration))
            stdD=numpy.append(stdD,numpy.std(Duration))
            minD=numpy.append(minD,numpy.min(Duration))
            maxD=numpy.append(maxD,numpy.max(Duration))
            avgR=numpy.append(avgR,numpy.average(Rate))
            medR=numpy.append(medR,numpy.median(Rate))
            stdR=numpy.append(stdR,numpy.std(Rate))
            minR=numpy.append(minR,numpy.min(Rate))
            maxR=numpy.append(maxR,numpy.max(Rate))

            print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

            numpy.savez("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
            ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
            numpy.savetxt("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

            if Fit:
                # medD (the median durations) is assumed to be the series
                # to fit: 'median' alone was undefined in this scope.
                FitAndPrint(ExploredJobs,medD,Curves)
        # Send MPI exit tag
        for slave in range(1,RankSize):
            comm.send('BREAK',dest=slave,tag=11)
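            # Every worker rank receives 'BREAK', including any rank beyond
            # len(Devices)-1 that never got work and has been waiting in
            # its receive loop.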

    else:
        while True:
            Signal=comm.recv(source=0,tag=11)
            if Signal=='CONTINUE':
                # Receive information from Master
                InputCL=comm.recv(source=0,tag=11)
                print("Parameters retrieved for rank %s of %s on %s from master:" % (rank,RankSize,gethostname()))
                print("Input CL: %s" % InputCL)
                # Execute on slave

                if InputCL['GpuStyle']=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on Cuda" % (InputCL['Blocks'],InputCL['Threads']))
                elif InputCL['GpuStyle']=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (InputCL['Blocks'],InputCL['Threads']))

                print("Output CL: %s" % OutputCL)
                # Send information to Master
                comm.send(OutputCL,dest=0,tag=11)
                print("Data sent to master")
            else:
                print('Exit signal from Master')
                break