root / ETSN / MyDFT_10.py @ 288


#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Numpy FFT used as reference implementation
def NumpyFFT(x,y):
    xy=x+1.j*y
    XY=np.fft.fft(xy)
    return(XY.real,XY.imag)

# OpenCL FFT based on gpyfft (clFFT wrapper)
def OpenCLFFT(x,y,Device):
    import pyopencl.array as cla
    from gpyfft.fft import FFT

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")
        sys.exit()

    # Match the float32 inputs: store the signal as complex64 for gpyfft
    XY_gpu = cla.to_device(queue, (x+1.j*y).astype(np.complex64))

    transform = FFT(ctx, queue, XY_gpu)

    event, = transform.enqueue()
    event.wait()

    XY = XY_gpu.get()
    return(XY.real,XY.imag)
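
# Minimal usage sketch for OpenCLFFT (assumes gpyfft is installed and that
# device #0 exists; for this all-ones input the spectrum lands in bin 0):
#   X,Y = OpenCLFFT(np.ones(1024).astype(np.float32),
#                   np.ones(1024).astype(np.float32),0)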

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)
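
# MyDFT computes, for each output bin k, with theta_kj = 2*pi*k*j/size:
#   X[k] + i*Y[k] = sum_j (x[j] + i*y[j]) * (cos(theta_kj) + i*sin(theta_kj))
# i.e. it uses the e^{+2i*pi*k*j/N} sign convention, while numpy.fft.fft uses
# e^{-2i*pi*k*j/N}. The two only coincide on inputs, such as the all-ones
# test vector used below, whose spectrum is insensitive to the sign choice.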

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
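
# A fully vectorized variant, shown for comparison only (hypothetical helper,
# not used by the benchmark below): precompute the size x size angle matrix
# once and let BLAS contract it. Same O(N^2) arithmetic as NumpyDFT, but the
# two trigonometric matrices also cost O(N^2) memory.
def MatrixDFT(x,y):
    size=x.shape[0]
    theta=(2.0*np.pi/size)*np.outer(np.arange(size),np.arange(size))
    C=np.cos(theta).astype(np.float32)
    S=np.sin(theta).astype(np.float32)
    return(C.dot(x)-S.dot(y),S.dot(x)+C.dot(y))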

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
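
# Caveat: the first call to NumbaDFT triggers JIT compilation, so the timing
# below mixes compile and compute time. A warm-up call on a small dummy input
# of the same dtype would exclude compilation from the measurement, e.g.:
#   NumbaDFT(np.zeros(8).astype(np.float32),np.zeros(8).astype(np.float32))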

# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")
        sys.exit()

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for results
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Retrieve the compiled MyDFT kernel
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)
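
# Since the queue is created with PROFILING_ENABLE, the pure device-side time
# of the kernel can also be read from the launch event (CallCL above); OpenCL
# reports these counters in nanoseconds:
#   Elapsed=1.e-9*(CallCL.profile.end-CallCL.profile.start)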

# CUDA complete operation
def CUDADFT(a_np,b_np,Device,Threads):
    # import pycuda.autoinit
    try:
        # For PyCUDA import
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule

        drv.init()
        for Id in range(drv.Device.count()):
            if Id==Device:
                XPU=drv.Device(Id)
                print("GPU selected %s" % XPU.name())

    except ImportError:
        print("Platform does not seem to support CUDA")
        sys.exit()

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  uint size = gridDim.x*blockDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        print("Impossible : %i is not a multiple of %i, falling back to 1 thread per block..." % (Size,Threads))
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)
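
# Note that drv.In/drv.Out make the launch above synchronous and fold the
# host<->device copies into the measured time. A sketch with explicit buffers
# would separate transfer from compute:
#   a_gpu=drv.mem_alloc(a_np.nbytes)
#   drv.memcpy_htod(a_gpu,a_np)
#   ... launch with the device pointers, then drv.memcpy_dtoh(A_np,A_gpu)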

if __name__=='__main__':

    SIZE=1024
    Device=0
    NaiveMethod=False
    NumpyFFTMethod=True
    OpenCLFFTMethod=True
    NumpyMethod=False
    NumbaMethod=False
    OpenCLMethod=False
    CUDAMethod=False
    Threads=1

    import getopt

    HowToUse='%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <threads>'
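
    # Example invocation (hypothetical device id and thread count): run the
    # naive, Numpy and Numba DFTs on a 4096-point vector, using device #0:
    #   ./MyDFT_10.py -n -y -a -s 4096 -d 0 -t 128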

    try:
        opts, args = getopt.getopt(sys.argv[1:],"nyaochs:d:t:",["size=","device=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            Threads = int(arg)
        elif opt == '-n':
            NaiveMethod=True
        elif opt == '-y':
            NumpyMethod=True
        elif opt == '-a':
            NumbaMethod=True
        elif opt == '-o':
            OpenCLMethod=True
        elif opt == '-c':
            CUDAMethod=True

    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod)
    print("DFT Numpy computation %s " % NumpyMethod)
    print("DFT Numba computation %s " % NumbaMethod)
    print("DFT OpenCL computation %s " % OpenCLMethod)
    print("DFT CUDA computation %s " % CUDAMethod)

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.strip(),deviceType,device.name.strip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)
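
    # Reference spectrum: the DFT of the constant vector (1+1j,...,1+1j) is
    # SIZE*(1+1j) in bin 0 and zero elsewhere, whatever the sign convention,
    # so every method below is checked against the same (C_np,D_np) pair.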

    # Native & Naive Implementation
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn=time.time()
        c_np,d_np=MyDFT(a_np,b_np)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ",np.linalg.norm(c_np-C_np),
              np.linalg.norm(d_np-D_np))

    # Native & Numpy Implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn=time.time()
        e_np,f_np=NumpyDFT(a_np,b_np)
        NumpyElapsed=time.time()-TimeIn
        NumpyRate=int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ",np.linalg.norm(e_np-C_np),
              np.linalg.norm(f_np-D_np))

    # Native & Numba Implementation
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn=time.time()
        g_np,h_np=NumbaDFT(a_np,b_np)
        NumbaElapsed=time.time()-TimeIn
        NumbaRate=int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ",np.linalg.norm(g_np-C_np),
              np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))

    # Numpy FFT Implementation
    if NumpyFFTMethod:
        print("Performing NumpyFFT implementation")
        TimeIn=time.time()
        m_np,n_np=NumpyFFT(a_np,b_np)
        NumpyFFTElapsed=time.time()-TimeIn
        NumpyFFTRate=int(SIZE/NumpyFFTElapsed)
        print("NumpyFFTRate: %i" % NumpyFFTRate)
        print("Precision: ",np.linalg.norm(m_np-C_np),
              np.linalg.norm(n_np-D_np))

    # OpenCL FFT Implementation
    if OpenCLFFTMethod:
        print("Performing OpenCL FFT implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLFFT(a_np,b_np,Device)
        OpenCLFFTElapsed=time.time()-TimeIn
        OpenCLFFTRate=int(SIZE/OpenCLFFTElapsed)
        print("OpenCLFFTRate: %i" % OpenCLFFTRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))