PiXpuMPI.py @ revision 309

#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
import itertools
from socket import gethostname

import mpi4py
from mpi4py import MPI

from PiXPU import *
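
# For orientation, a minimal pure-Python sketch of the Monte Carlo estimator
# that the PiXPU kernels (MetropolisCuda/MetropolisOpenCL) parallelize over
# Blocks x Threads. The helper name PiReference is hypothetical and not part
# of the PiXPU API.
def PiReference(iterations,seed=110271):
    import random
    random.seed(seed)
    inside=0
    for _ in range(iterations):
        # Draw a point uniformly in the unit square
        x=random.random()
        y=random.random()
        # Count hits inside the quarter disc x^2+y^2<=1
        if x*x+y*y<=1.0:
            inside+=1
    # The quarter disc covers pi/4 of the unit square
    return 4.0*inside/iterations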

if __name__=='__main__':

    # MPI Init
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Define the number of MPI ranks over which computing is performed
    # (rank 0 is the master and also computes)
    RankSize=comm.Get_size()
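
    # Typical launch (sketch; example values, assuming one MPI rank per
    # selected device so RankSize matches the number of -d options):
    #   mpirun -np 2 python3 PiXpuMPI.py -g OpenCL -d 0 -d 1 -i 100000000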

    if rank == 0:

        # Set default values

        # Id of Device : 1 is for first found!
        Device=1
        # GPU style can be CUDA (Nvidia implementation) or OpenCL
        GpuStyle='OpenCL'
        # Iterations is an integer
        Iterations=10000000
        # BlocksBegin is the first number of Blocks to explore
        BlocksBegin=1
        # BlocksEnd is the last number of Blocks to explore
        BlocksEnd=16
        # BlocksStep is the step between numbers of Blocks to explore
        BlocksStep=1
        # ThreadsBegin is the first number of Threads to explore
        ThreadsBegin=1
        # ThreadsEnd is the last number of Threads to explore
        ThreadsEnd=1
        # ThreadsStep is the step between numbers of Threads to explore
        ThreadsStep=1
        # Redo is the number of times the test is redone to improve metrology
        Redo=1
        # OutMetrology is the method for duration estimation : False is GPU inside
        OutMetrology=False
        Metrology='InMetro'
        # Curves is True to print the curves
        Curves=False
        # Fit is True to fit the performance curves
        Fit=False
        # Marsaglia RNG
        RNG='MWC'
        # Seeds
        Seeds=110271,101008
        # Value type : INT32, INT64, FP32, FP64
        ValueType='FP32'
        # Inside test based on If
        IfThen=False

        HowToUse='%s -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadsStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'

        try:
            opts, args = getopt.getopt(sys.argv[1:],"hckg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)

        # List of Devices
        Devices=[]
        Alu={}
        for opt, arg in opts:
            if opt == '-h':
                print(HowToUse % sys.argv[0])

                print("\nInformation about devices detected under OpenCL:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
                    Id=0
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
                            #deviceType=cl.device_type.to_string(device.type)
                            deviceType="xPU"
                            print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                            Id=Id+1

                    print()
                except Exception:
                    print("Your platform does not seem to support OpenCL")

                print("\nInformation about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
                    cuda.init()
                    for Id in range(cuda.Device.count()):
                        device=cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id,device.name()))
                    print()
                except Exception:
                    print("Your platform does not seem to support CUDA")

                sys.exit()

            elif opt == '-c':
                Curves=True
            elif opt == '-k':
                IfThen=True
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
                BlocksEnd = BlocksBegin
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)

        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)

        if GpuStyle=='CUDA':
            try:
                # For PyCUDA import
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                    if Id in Devices:
                        Alu[Id]='GPU'
            except ImportError:
                print("Platform does not seem to support CUDA")

        if GpuStyle=='OpenCL':
            try:
                # For PyOpenCL import
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.strip(),deviceType,device.name.strip()))

                        if Id in Devices:
                            # Set the Alu as detected Device Type
                            Alu[Id]=deviceType
                        Id=Id+1
            except ImportError:
                print("Platform does not seem to support OpenCL")

        print(Devices,Alu)

        BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
        ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)

        ExploredJobs=numpy.array([]).astype(numpy.uint32)
        ExploredBlocks=numpy.array([]).astype(numpy.uint32)
        ExploredThreads=numpy.array([]).astype(numpy.uint32)
        avgD=numpy.array([]).astype(numpy.float32)
        medD=numpy.array([]).astype(numpy.float32)
        stdD=numpy.array([]).astype(numpy.float32)
        minD=numpy.array([]).astype(numpy.float32)
        maxD=numpy.array([]).astype(numpy.float32)
        avgR=numpy.array([]).astype(numpy.float32)
        medR=numpy.array([]).astype(numpy.float32)
        stdR=numpy.array([]).astype(numpy.float32)
        minR=numpy.array([]).astype(numpy.float32)
        maxR=numpy.array([]).astype(numpy.float32)

        # Split the iterations over the devices, rounding up so that the
        # total is at least the requested number of iterations
        IterationsMPI=numpy.uint64(Iterations//len(Devices))
        if Iterations%len(Devices)!=0:
            IterationsMPI+=1
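
        # Worked example of the split: 10000000 iterations over 3 devices
        # gives IterationsMPI=3333334, so the ranks together run 10000002
        # iterations, slightly more than requested; the Pi estimate below
        # divides by the NewIterations actually reported back.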

        for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

            ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
            ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
            ExploredThreads=numpy.append(ExploredThreads,Threads)

            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)

            for i in range(Redo):
                time_start=time.time()

                r=1
                # Distribution of Devices over nodes
                InputCL={}
                InputCL['Iterations']=IterationsMPI
                InputCL['Steps']=1
                InputCL['Blocks']=Blocks
                InputCL['Threads']=Threads
                InputCL['RNG']=RNG
                InputCL['ValueType']=ValueType
                InputCL['GpuStyle']=GpuStyle
                InputCL['IfThen']=IfThen

                for Device in Devices[1:]:
                    print("Send to device %i on rank %i" % (Device,r))
                    InputCL['Device']=Device
                    # Derive seeds distinct for each rank from the base Seeds
                    DeltaD=Device-min(Devices)+r+1
                    DeltaS=(DeltaD-1+r)*524287
                    InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                    comm.send('CONTINUE',dest=r,tag=11)
                    comm.send(InputCL,dest=r,tag=11)
                    r+=1

                # Compute on rank 0 with the first device in the list
                print("Compute on rank 0")
                DeltaD=Device-min(Devices)+1
                DeltaS=(DeltaD-1)*524287
                InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                InputCL['Device']=Devices[0]

                if GpuStyle=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on CUDA" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))

                Inside=OutputCL['Inside']
                NewIterations=OutputCL['NewIterations']

                for slave in range(1,len(Devices)):
                    print("Get OutputCL from %i" % slave)
                    OutputCL=comm.recv(source=slave,tag=11)
                    print(OutputCL)
                    NewIterations+=OutputCL['NewIterations']
                    Inside+=OutputCL['Inside']

                Duration=numpy.append(Duration,time.time()-time_start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])

                print("Itops %i\nLogItops %.2f" % (int(Rate[-1]),numpy.log(Rate[-1])/numpy.log(10)))
                print("Pi estimation %.8f" % (4./NewIterations*Inside))

            avgD=numpy.append(avgD,numpy.average(Duration))
            medD=numpy.append(medD,numpy.median(Duration))
            stdD=numpy.append(stdD,numpy.std(Duration))
            minD=numpy.append(minD,numpy.min(Duration))
            maxD=numpy.append(maxD,numpy.max(Duration))
            avgR=numpy.append(avgR,numpy.average(Rate))
            medR=numpy.append(medR,numpy.median(Rate))
            stdR=numpy.append(stdR,numpy.std(Rate))
            minR=numpy.append(minR,numpy.min(Rate))
            maxR=numpy.append(maxR,numpy.max(Rate))

            print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

            numpy.savez("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
            ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
            numpy.savetxt("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')
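
            # The saved arrays can be reloaded for post-processing, e.g.
            # (sketch; the file name stem is abbreviated here):
            #   Results=numpy.load("PiMPI_<...>.npz")['arr_0']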

            if Fit:
                FitAndPrint(ExploredJobs,medD,Curves)

        # Send the MPI exit tag to every slave rank
        for slave in range(1,RankSize):
            comm.send('BREAK',dest=slave,tag=11)
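
    # Slave protocol: each worker waits for a 'CONTINUE' tag followed by its
    # InputCL dictionary, runs the kernel, returns OutputCL to the master,
    # and leaves the loop on any other signal ('BREAK' from the master).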

    else:
        while True:
            Signal=comm.recv(source=0,tag=11)
            if Signal=='CONTINUE':
                # Receive information from Master
                InputCL=comm.recv(source=0,tag=11)
                print("Parameters retrieved for rank %s of %s on %s from master:" % (rank,RankSize,gethostname()))
                print("Input CL: %s" % InputCL)
                # Execute on slave

                if InputCL['GpuStyle']=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on CUDA" % (InputCL['Blocks'],InputCL['Threads']))
                elif InputCL['GpuStyle']=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except Exception:
                        print("Problem with (%i,%i) // computations on OpenCL" % (InputCL['Blocks'],InputCL['Threads']))

                print("Output CL: %s" % OutputCL)
                # Send information to Master
                comm.send(OutputCL,dest=0,tag=11)
                print("Data sent to master")
            else:
                print('Exit signal from Master')
                break