#!/usr/bin/env python3

#
# Pi-by-MonteCarlo using PyCUDA/PyOpenCL
#
# CC BY-NC-SA 2011 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
# Cecill v2 : Emmanuel QUEMENER <emmanuel.quemener@gmail.com>
#
# Thanks to Andreas Klockner for PyCUDA:
# http://mathema.tician.de/software/pycuda
# Thanks to Andreas Klockner for PyOpenCL:
# http://mathema.tician.de/software/pyopencl
#

# 2013-01-01 : problems with launch timeout
# http://stackoverflow.com/questions/497685/how-do-you-get-around-the-maximum-cuda-run-time
# Option "Interactive" "0" in /etc/X11/xorg.conf

# Common tools
import numpy
from numpy.random import randint as nprnd
import sys
import getopt
import time
import math
import itertools
from socket import gethostname

import mpi4py
from mpi4py import MPI

from PiXPU import *
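
# PiXPU provides the MetropolisCuda/MetropolisOpenCL launchers and the
# FitAndPrint helper used below.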

if __name__=='__main__':

    # MPI Init
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()

    # Total number of MPI ranks (rank 0 acts as master)
    RankSize=comm.Get_size()
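
    # Process layout: rank 0 parses the options, distributes the work and
    # aggregates the results; every other rank loops on recv/compute/send
    # until it receives the 'BREAK' signal (else branch at the bottom).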

    if rank == 0:

        # Set default values

        # Id of Device : 1 is for first found!
        Device=1
        # GPU style can be Cuda (Nvidia implementation) or OpenCL
        GpuStyle='OpenCL'
        # Iterations is integer
        Iterations=10000000
        # BlocksBegin is the first number of Blocks to explore
        BlocksBegin=1
        # BlocksEnd is the last number of Blocks to explore
        BlocksEnd=16
        # BlocksStep is the step of Blocks to explore
        BlocksStep=1
        # ThreadsBegin is the first number of Threads to explore
        ThreadsBegin=1
        # ThreadsEnd is the last number of Threads to explore
        ThreadsEnd=1
        # ThreadsStep is the step of Threads to explore
        ThreadsStep=1
        # Redo is the number of times to redo the test to improve metrology
        Redo=1
        # OutMetrology is the method for duration estimation : False means timing inside the GPU
        OutMetrology=False
        Metrology='InMetro'
        # Curves is True to print the curves
        Curves=False
        # Fit is True to fit the performance curves
        Fit=False
        # Marsaglia RNG
        RNG='MWC'
        # Seeds
        Seeds=110271,101008
        # Value type : INT32, INT64, FP32, FP64
        ValueType='FP32'
        # Inside based on If
        IfThen=False

        HowToUse='%s -c (Print Curves) -k (Case On IfThen) -d <DeviceId> -g <CUDA/OpenCL> -i <Iterations> -b <BlocksBegin> -e <BlocksEnd> -s <BlocksStep> -f <ThreadsFirst> -l <ThreadsLast> -t <ThreadsStep> -r <RedoToImproveStats> -m <SHR3/CONG/MWC/KISS> -v <INT32/INT64/FP32/FP64>'
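
        # Illustrative invocation (hypothetical device ids; adjust to your
        # setup) : launch one MPI rank per selected device, e.g. two devices:
        #   mpirun -np 2 python3 PiXpuMPI.py -g OpenCL -d 0 -d 1 -i 100000000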

        try:
            opts, args = getopt.getopt(sys.argv[1:],"hckg:i:b:e:s:f:l:t:r:d:m:v:",["gpustyle=","iterations=","blocksbegin=","blocksend=","blocksstep=","threadsfirst=","threadslast=","threadsstep=","redo=","device=","marsaglia=","valuetype="])
        except getopt.GetoptError:
            print(HowToUse % sys.argv[0])
            sys.exit(2)

        # List of Devices
        Devices=[]
        Alu={}

        for opt, arg in opts:
            if opt == '-h':
                print(HowToUse % sys.argv[0])

                print("\nInformation about devices detected under OpenCL:")
                # For PyOpenCL import
                try:
                    import pyopencl as cl
                    Id=0
                    for platform in cl.get_platforms():
                        for device in platform.get_devices():
                            #deviceType=cl.device_type.to_string(device.type)
                            deviceType="xPU"
                            print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                            Id=Id+1

                    print()
                except:
                    print("Your platform does not seem to support OpenCL")

                print("\nInformation about devices detected under CUDA API:")
                # For PyCUDA import
                try:
                    import pycuda.driver as cuda
                    cuda.init()
                    for Id in range(cuda.Device.count()):
                        device=cuda.Device(Id)
                        print("Device #%i of type GPU : %s" % (Id,device.name()))
                    print()
                except:
                    print("Your platform does not seem to support CUDA")

                sys.exit()

            elif opt == '-c':
                Curves=True
            elif opt == '-k':
                IfThen=True
            elif opt in ("-d", "--device"):
                Devices.append(int(arg))
            elif opt in ("-g", "--gpustyle"):
                GpuStyle = arg
            elif opt in ("-m", "--marsaglia"):
                RNG = arg
            elif opt in ("-v", "--valuetype"):
                ValueType = arg
            elif opt in ("-i", "--iterations"):
                Iterations = numpy.uint64(arg)
            elif opt in ("-b", "--blocksbegin"):
                BlocksBegin = int(arg)
                BlocksEnd = BlocksBegin
            elif opt in ("-e", "--blocksend"):
                BlocksEnd = int(arg)
            elif opt in ("-s", "--blocksstep"):
                BlocksStep = int(arg)
            elif opt in ("-f", "--threadsfirst"):
                ThreadsBegin = int(arg)
                ThreadsEnd = ThreadsBegin
            elif opt in ("-l", "--threadslast"):
                ThreadsEnd = int(arg)
            elif opt in ("-t", "--threadsstep"):
                ThreadsStep = int(arg)
            elif opt in ("-r", "--redo"):
                Redo = int(arg)

        print("Devices Identification : %s" % Devices)
        print("GpuStyle used : %s" % GpuStyle)
        print("Iterations : %s" % Iterations)
        print("Number of Blocks on begin : %s" % BlocksBegin)
        print("Number of Blocks on end : %s" % BlocksEnd)
        print("Step on Blocks : %s" % BlocksStep)
        print("Number of Threads on begin : %s" % ThreadsBegin)
        print("Number of Threads on end : %s" % ThreadsEnd)
        print("Step on Threads : %s" % ThreadsStep)
        print("Number of redo : %s" % Redo)
        print("Metrology done out of XPU : %r" % OutMetrology)
        print("Type of Marsaglia RNG used : %s" % RNG)
        print("Type of variable : %s" % ValueType)

        if GpuStyle=='CUDA':
            try:
                # For PyCUDA import
                import pycuda.driver as cuda

                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                    if Id in Devices:
                        Alu[Id]='GPU'
            except ImportError:
                print("Platform does not seem to support CUDA")

        if GpuStyle=='OpenCL':
            try:
                # For PyOpenCL import
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                        if Id in Devices:
                            # Set the Alu as detected Device Type
                            Alu[Id]=deviceType
                        Id=Id+1
            except ImportError:
                print("Platform does not seem to support OpenCL")

        print(Devices,Alu)

        BlocksList=range(BlocksBegin,BlocksEnd+BlocksStep,BlocksStep)
        ThreadsList=range(ThreadsBegin,ThreadsEnd+ThreadsStep,ThreadsStep)
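
        # The +Step on each upper bound makes the ranges inclusive:
        # BlocksEnd and ThreadsEnd themselves are explored.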

        ExploredJobs=numpy.array([]).astype(numpy.uint32)
        ExploredBlocks=numpy.array([]).astype(numpy.uint32)
        ExploredThreads=numpy.array([]).astype(numpy.uint32)
        avgD=numpy.array([]).astype(numpy.float32)
        medD=numpy.array([]).astype(numpy.float32)
        stdD=numpy.array([]).astype(numpy.float32)
        minD=numpy.array([]).astype(numpy.float32)
        maxD=numpy.array([]).astype(numpy.float32)
        avgR=numpy.array([]).astype(numpy.float32)
        medR=numpy.array([]).astype(numpy.float32)
        stdR=numpy.array([]).astype(numpy.float32)
        minR=numpy.array([]).astype(numpy.float32)
        maxR=numpy.array([]).astype(numpy.float32)

        IterationsMPI=numpy.uint64(Iterations//len(Devices))
        if Iterations%len(Devices)!=0:
            IterationsMPI+=1
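
        # Each device performs ceil(Iterations/len(Devices)) samples, so the
        # grand total may slightly exceed the requested Iterations; the Pi
        # estimate is therefore normalised by NewIterations, not Iterations.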

        for Blocks,Threads in itertools.product(BlocksList,ThreadsList):

            ExploredJobs=numpy.append(ExploredJobs,Blocks*Threads)
            ExploredBlocks=numpy.append(ExploredBlocks,Blocks)
            ExploredThreads=numpy.append(ExploredThreads,Threads)

            DurationItem=numpy.array([]).astype(numpy.float32)
            Duration=numpy.array([]).astype(numpy.float32)
            Rate=numpy.array([]).astype(numpy.float32)
            for i in range(Redo):
                time_start=time.time()

                r=1
                # Distribution of Devices over nodes
                InputCL={}
                InputCL['Iterations']=IterationsMPI
                InputCL['Steps']=1
                InputCL['Blocks']=Blocks
                InputCL['Threads']=Threads
                InputCL['RNG']=RNG
                InputCL['ValueType']=ValueType
                InputCL['GpuStyle']=GpuStyle
                InputCL['IfThen']=IfThen
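
                # Derive a distinct seed pair per rank by offsetting the base
                # Seeds with multiples of 524287 (the Mersenne prime 2^19-1),
                # so each device runs a differently-seeded Marsaglia stream.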

                for Device in Devices[1:]:
                    print("Send to device %i on rank %i" % (Device,r))
                    InputCL['Device']=Device
                    DeltaD=Device-min(Devices)+r+1
                    DeltaS=(DeltaD-1+r)*524287
                    InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                    comm.send('CONTINUE',dest=r,tag=11)
                    comm.send(InputCL,dest=r,tag=11)
                    r+=1

                # Compute on rank 0 with its own device
                print("Compute on rank 0")
                DeltaD=Devices[0]-min(Devices)+1
                DeltaS=(DeltaD-1)*524287
                InputCL['Seeds']=numpy.uint32(Seeds[0]*DeltaD+DeltaS),numpy.uint32(Seeds[1]*DeltaD+DeltaS)
                InputCL['Device']=Devices[0]

                if GpuStyle=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (Blocks,Threads))
                elif GpuStyle=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" % (Blocks,Threads))

                Inside=OutputCL['Inside']
                NewIterations=OutputCL['NewIterations']

                for slave in range(1,len(Devices)):
                    print("Get OutputCL from %i" % slave)
                    OutputCL=comm.recv(source=slave,tag=11)
                    print(OutputCL)
                    NewIterations+=OutputCL['NewIterations']
                    Inside+=OutputCL['Inside']
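
                # Monte Carlo estimator: Inside counts the draws that fell
                # inside the quarter circle, so Pi ~ 4*Inside/NewIterations.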
                print("Pi estimation %.8f" % (4./NewIterations*Inside))

                Duration=numpy.append(Duration,time.time()-time_start)
                Rate=numpy.append(Rate,NewIterations/Duration[-1])

            avgD=numpy.append(avgD,numpy.average(Duration))
            medD=numpy.append(medD,numpy.median(Duration))
            stdD=numpy.append(stdD,numpy.std(Duration))
            minD=numpy.append(minD,numpy.min(Duration))
            maxD=numpy.append(maxD,numpy.max(Duration))
            avgR=numpy.append(avgR,numpy.average(Rate))
            medR=numpy.append(medR,numpy.median(Rate))
            stdR=numpy.append(stdR,numpy.std(Rate))
            minR=numpy.append(minR,numpy.min(Rate))
            maxR=numpy.append(maxR,numpy.max(Rate))

            print("%.2f %.2f %.2f %.2f %.2f %i %i %i %i %i" % (avgD[-1],medD[-1],stdD[-1],minD[-1],maxD[-1],avgR[-1],medR[-1],stdR[-1],minR[-1],maxR[-1]))

            numpy.savez("PiMPI_%s_%s_%s_%s_%s_%s_%s_%s_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),(ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR))
            ToSave=[ ExploredBlocks,ExploredThreads,avgD,medD,stdD,minD,maxD,avgR,medR,stdR,minR,maxR ]
            numpy.savetxt("PiMPI_%s_%s_%s_%s_%s_%s_%s_%i_%.8i_Device%i_%s_%s" % (ValueType,RNG,Alu[Devices[0]],GpuStyle,BlocksBegin,BlocksEnd,ThreadsBegin,ThreadsEnd,Iterations,Devices[0],Metrology,gethostname()),numpy.transpose(ToSave),fmt='%i %i %e %e %e %e %e %i %i %i %i %i')

            if Fit:
                FitAndPrint(ExploredJobs,medD,Curves)
        # Send MPI exit tag
        for slave in range(1,RankSize):
            comm.send('BREAK',dest=slave,tag=11)

    else:
        while True:
            Signal=comm.recv(source=0,tag=11)
            if Signal=='CONTINUE':
                # Receive information from Master
                InputCL=comm.recv(source=0,tag=11)
                print("Parameters retrieved for rank %s of %s on %s from master:" % (rank,RankSize,gethostname()))
                print("Input CL: %s" % InputCL)
                # Execute on slave

                if InputCL['GpuStyle']=='CUDA':
                    try:
                        OutputCL=MetropolisCuda(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on Cuda" % (InputCL['Blocks'],InputCL['Threads']))
                elif InputCL['GpuStyle']=='OpenCL':
                    try:
                        OutputCL=MetropolisOpenCL(InputCL)
                    except:
                        print("Problem with (%i,%i) // computations on OpenCL" % (InputCL['Blocks'],InputCL['Threads']))

                print("Output CL: %s" % OutputCL)
                # Send information to Master
                comm.send(OutputCL,dest=0,tag=11)
                print("Data sent to master")
            else:
                print('Exit signal from Master')
                break