#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl
from numpy import pi, cos, sin

# Naive Discrete Fourier Transform
def MyDFT(x, y):
    # X[i] + 1j*Y[i] is bin i of the DFT of the complex signal x + 1j*y
    size = x.shape[0]
    X = np.zeros(size).astype(np.float32)
    Y = np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i] = X[i] + x[j]*cos(2.*pi*i*j/size) + y[j]*sin(2.*pi*i*j/size)
            Y[i] = Y[i] - x[j]*sin(2.*pi*i*j/size) + y[j]*cos(2.*pi*i*j/size)
    return (X, Y)
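
# Optional sanity check (sketch, not part of the benchmark flow): (X, Y)
# should match the real and imaginary parts of np.fft.fft(x + 1j*y) up to
# float32 rounding, e.g.:
#   ref = np.fft.fft(x + 1j*y)
#   assert np.allclose(X, ref.real, atol=1e-3*x.shape[0])
#   assert np.allclose(Y, ref.imag, atol=1e-3*x.shape[0])
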
# Numpy Discrete Fourier Transform
def NumpyDFT(x, y):
    size = x.shape[0]
    X = np.zeros(size).astype(np.float32)
    Y = np.zeros(size).astype(np.float32)
    nj = np.multiply(2.0*np.pi/size, np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i] = np.sum(np.add(np.multiply(np.cos(i*nj), x), np.multiply(np.sin(i*nj), y)))
        Y[i] = np.sum(np.subtract(np.multiply(np.cos(i*nj), y), np.multiply(np.sin(i*nj), x)))
    return (X, Y)
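
# A fully vectorized variant (sketch; assumes the size x size angle matrix
# fits in memory) would replace the loop with an outer product:
#   ij = np.outer(np.arange(size), nj)
#   X = np.cos(ij) @ x + np.sin(ij) @ y
#   Y = np.cos(ij) @ y - np.sin(ij) @ x
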
# Numba Discrete Fourier Transform
import numba

@numba.njit(parallel=True)
def NumbaDFT(x, y):
    size = x.shape[0]
    X = np.zeros(size).astype(np.float32)
    Y = np.zeros(size).astype(np.float32)
    nj = np.multiply(2.0*np.pi/size, np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i] = np.sum(np.add(np.multiply(np.cos(i*nj), x), np.multiply(np.sin(i*nj), y)))
        Y[i] = np.sum(np.subtract(np.multiply(np.cos(i*nj), y), np.multiply(np.sin(i*nj), x)))
    return (X, Y)
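
# Caveat: the first NumbaDFT call includes JIT compilation, so the timing in
# the main section measures compile + run. A warm-up call on a small vector,
# e.g. NumbaDFT(np.ones(8, dtype=np.float32), np.ones(8, dtype=np.float32)),
# would isolate the steady-state rate.
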
# OpenCL complete operation
def OpenCLDFT(a_np, b_np, Device):

    # Select the device whose flat enumeration index matches Device
    Id = 0
    HasXPU = False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print("CPU/GPU selected: ", device.name.lstrip())
                HasXPU = True
            Id += 1
            # print(Id)

    if not HasXPU:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device, Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,
                                properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")
        sys.exit()

    TimeIn = time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed = time.time() - TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn = time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g,
    __global float *A_g, __global float *B_g)
{
  /* Each work-item computes one output frequency bin gid;
     the global size is the length of the input vectors. */
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size; i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed = time.time() - TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn = time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed = time.time() - TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn = time.time()
    # Memory allocation on Device for results
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed = time.time() - TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn = time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed = time.time() - TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Call of kernel previously defined: global work size a_np.shape launches
    # one work-item per output bin; local size None lets the runtime pick
    # the work-group size
    CallCL = knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    # Wait for completion so the timing below is meaningful
    CallCL.wait()
    Elapsed = time.time() - TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed = time.time() - TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release device memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return (A_ocl, B_ocl)

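# Typical call (device index 0 is only an example):
#   A, B = OpenCLDFT(a_np, b_np, 0)
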
# CUDA complete operation
def CUDADFT(a_np, b_np, Device, Threads):
    # import pycuda.autoinit
    try:
        # For PyCUDA import
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule

        drv.init()
        for Id in range(drv.Device.count()):
            if Id == Device:
                XPU = drv.Device(Id)
                print("GPU selected %s" % XPU.name())

    except ImportError:
        print("Platform does not seem to support CUDA")
        sys.exit()

    Context = XPU.make_context()

    TimeIn = time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g, const float *b_g)
{
  /* Each thread computes one output frequency bin gid;
     the grid covers the whole vector, so size is its length. */
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  uint size = gridDim.x*blockDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size; i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed = time.time() - TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed = time.time() - TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed = time.time() - TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size = a_np.size
    if (Size % Threads != 0):
        print("Impossible : %i not multiple of %i, falling back to 1 thread per block..." % (Size, Threads))
        TimeIn = time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(1,1,1), grid=(a_np.size,1))
        Elapsed = time.time() - TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks = int(Size/Threads)
        TimeIn = time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(Threads,1,1), grid=(Blocks,1))
        Elapsed = time.time() - TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return (A_np, B_np)
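
# Note on the launch geometry: with block=(Threads,1,1) and grid=(Blocks,1),
# the kernel sees gridDim.x*blockDim.x == Size threads, one per output bin,
# matching its own assumption `uint size = gridDim.x*blockDim.x`.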
if __name__=='__main__':

    # Defaults: OpenCL benchmark on device #0 with a 1024-point vector
    SIZE = 1024
    Device = 0
    NaiveMethod = False
    NumpyMethod = False
    NumbaMethod = False
    OpenCLMethod = True
    CUDAMethod = False
    Threads = 1

    import getopt

    HowToUse = '%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <threads>'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "nyaochs:d:t:",
                                   ["size=", "device=", "threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices = []
    Alu = {}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id = 0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType = "xPU"
                        print("Device #%i from %s of type %s : %s" % (Id, platform.vendor.lstrip(), deviceType, device.name.lstrip()))
                        Id = Id + 1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device = cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id, device.name()))
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device = int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            Threads = int(arg)
        elif opt == "-n":
            NaiveMethod = True
        elif opt == "-y":
            NumpyMethod = True
        elif opt == "-a":
            NumbaMethod = True
        elif opt == "-o":
            OpenCLMethod = True
        elif opt == "-c":
            CUDAMethod = True
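
    # Example invocation (illustrative): compare naive and OpenCL on
    # device 0 with a 4096-point vector:
    #   python3 MyDFT_9.py -n -o -s 4096 -d 0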

    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod)
    print("DFT Numpy computation %s " % NumpyMethod)
    print("DFT Numba computation %s " % NumbaMethod)
    print("DFT OpenCL computation %s " % OpenCLMethod)
    print("DFT CUDA computation %s " % CUDAMethod)

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device = cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id, device.name()))
                if Id in Devices:
                    Alu[Id] = 'GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")
    
    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id = 0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType = "xPU"
                    print("Device #%i from %s of type %s : %s" % (Id, platform.vendor.strip(), deviceType, device.name.strip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id] = deviceType
                    Id = Id + 1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    # Input: constant complex signal 1+1j of length SIZE; its DFT is
    # SIZE*(1+1j) in bin 0 and zero elsewhere, encoded in C_np/D_np
    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # Naive implementation (rates below are output samples per second)
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn = time.time()
        c_np, d_np = MyDFT(a_np, b_np)
        NativeElapsed = time.time() - TimeIn
        NativeRate = int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ", np.linalg.norm(c_np - C_np),
              np.linalg.norm(d_np - D_np))

    # Numpy implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn = time.time()
        e_np, f_np = NumpyDFT(a_np, b_np)
        NumpyElapsed = time.time() - TimeIn
        NumpyRate = int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ", np.linalg.norm(e_np - C_np),
              np.linalg.norm(f_np - D_np))

    # Numba implementation (first call includes JIT compilation)
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn = time.time()
        g_np, h_np = NumbaDFT(a_np, b_np)
        NumbaElapsed = time.time() - TimeIn
        NumbaRate = int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ", np.linalg.norm(g_np - C_np),
              np.linalg.norm(h_np - D_np))

    # OpenCL implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn = time.time()
        i_np, j_np = OpenCLDFT(a_np, b_np, Device)
        OpenCLElapsed = time.time() - TimeIn
        OpenCLRate = int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ", np.linalg.norm(i_np - C_np),
              np.linalg.norm(j_np - D_np))

    # CUDA implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn = time.time()
        k_np, l_np = CUDADFT(a_np, b_np, Device, Threads)
        CUDAElapsed = time.time() - TimeIn
        CUDARate = int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ", np.linalg.norm(k_np - C_np),
              np.linalg.norm(l_np - D_np))