root / ETSN / MyDFT_10.py

#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# FFT with Numpy: pack the two real vectors into one complex vector
def NumpyFFT(x,y):
    xy=np.csingle(x+1.j*y)
    XY=np.fft.fft(xy)
    return(XY.real,XY.imag)

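# The packing in NumpyFFT relies on linearity of the DFT: for real x and y,
# FFT(x+1j*y) = FFT(x)+1j*FFT(y), so the real and imaginary parts of the
# packed transform are exactly the (X,Y) pair that the DFT routines below
# compute. Quick sanity check (for an all-ones input of length 8 the
# transform is 8+8j in bin 0 and zero elsewhere):
#   >>> x=np.ones(8).astype(np.float32); y=np.ones(8).astype(np.float32)
#   >>> X,Y=NumpyFFT(x,y)
#   >>> float(X[0]),float(Y[0])
#   (8.0, 8.0)
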
# FFT computed on an OpenCL device through gpyfft (clFFT bindings)
def OpenCLFFT(x,y,Device):
    import pyopencl.array as cla
    from gpyfft.fft import FFT

    TimeIn=time.time()
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()
    Elapsed=time.time()-TimeIn
    print("Selection of device : %.3f" % Elapsed)

    TimeIn=time.time()
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except Exception:
        print("Crash during context creation")
        sys.exit()
    Elapsed=time.time()-TimeIn
    print("Context initialisation : %.3f" % Elapsed)

    TimeIn=time.time()
    XY_gpu = cla.to_device(queue, np.csingle(x+1.j*y))
    Elapsed=time.time()-TimeIn
    print("Copy from Host to Device : %.3f" % Elapsed)

    TimeIn=time.time()
    transform = FFT(ctx, queue, XY_gpu)
    event, = transform.enqueue()
    event.wait()
    Elapsed=time.time()-TimeIn
    print("Compute FFT : %.3f" % Elapsed)

    TimeIn=time.time()
    XY = XY_gpu.get()
    Elapsed=time.time()-TimeIn
    print("Copy from Device to Host : %.3f" % Elapsed)
    return(XY.real,XY.imag)

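# Note on the gpyfft pattern above: FFT(ctx,queue,XY_gpu) plans the
# transform for the device array (in-place here, since no output array is
# passed), enqueue() returns a tuple of events, and event.wait() blocks
# until the transform has finished so that the "Compute FFT" timing is
# meaningful. The per-step timings make the usual GPU trade-off visible:
# the transform itself is fast, but device selection, context creation and
# host<->device copies can dominate for small vectors.
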
# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)+y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]-x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)

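# The two accumulations in MyDFT are the real and imaginary parts of the
# standard DFT of the packed signal z=x+1j*y:
#   Z[k] = sum_j z[j]*exp(-2j*pi*k*j/size)
#        = sum_j (x[j]*cos(t)+y[j]*sin(t)) + 1j*(-x[j]*sin(t)+y[j]*cos(t))
# with t=2*pi*k*j/size, so X collects the cosine/sine mix and Y the
# negated sine/cosine mix. The two nested loops make this O(N^2); it is
# the baseline the vectorized and GPU versions below try to beat.
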
# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.add(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(-np.subtract(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

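# NumpyDFT keeps the same O(N^2) arithmetic but replaces the inner loop
# over j with Numpy array operations: nj holds 2*pi*j/size for every j and
# each output sample becomes one vectorized sum. Only the outer loop over
# i remains in interpreted Python, which is where the speed-up over the
# naive version comes from.
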
# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.add(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(-np.subtract(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

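# NumbaDFT is the same vectorized body compiled to native code by
# numba.njit; parallel=True lets numba.prange distribute the outer loop
# across CPU cores, and each iteration writes a distinct X[i]/Y[i] so the
# threads never race. Note that the first call pays the JIT compilation
# cost, so the rate measured below includes compilation unless the
# function is called once beforehand to warm it up.
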
# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except Exception:
        print("Crash during context creation")
        sys.exit()

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host to Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++) 
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for results
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside kernel sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device to Host : %.3f" % Elapsed)

    # Release of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)

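# In the OpenCL kernel above, one work-item computes one output sample:
# gid=get_global_id(0) is the output frequency index and
# get_global_size(0) recovers the vector length, so launching with global
# size a_np.shape (one work-item per element) covers the whole O(N^2)
# computation, each work-item running the inner loop over the N inputs.
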
# CUDA complete operation
def CUDADFT(a_np,b_np,Device,Threads):
    try:
        # For PyCUDA import
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule

        drv.init()
        for Id in range(drv.Device.count()):
            if Id==Device:
                XPU=drv.Device(Id)
                print("GPU selected %s" % XPU.name())

    except ImportError:
        print("Platform does not seem to support CUDA")
        sys.exit()

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  uint size = gridDim.x*blockDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++) 
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFTKernel = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        print("Impossible : %i is not a multiple of %i, falling back to 1 thread per block..." % (Size,Threads))
        TimeIn=time.time()
        MyDFTKernel(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
                    block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        MyDFTKernel(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
                    block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)

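# The CUDA kernel mirrors the OpenCL one: gid=blockIdx.x*blockDim.x+threadIdx.x
# picks the output sample and size=gridDim.x*blockDim.x recovers the vector
# length, which is why the launch must satisfy Blocks*Threads==Size. The
# drv.In/drv.Out wrappers let PyCUDA allocate the device buffers and copy
# the arrays around the kernel call, so the "Execution of kernel" timing
# here also includes the transfers.
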
if __name__=='__main__':

    # Default settings: 1024-point vectors on device #0, with only the two
    # FFT reference implementations enabled
    SIZE=1024
    Device=0
    NaiveMethod=False
    NumpyFFTMethod=True
    OpenCLFFTMethod=True
    NumpyMethod=False
    NumbaMethod=False
    OpenCLMethod=False
    CUDAMethod=False
    Threads=1
    Verbose=True

    import getopt

    HowToUse='%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <Threads>'

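    # Example invocations (flags as parsed below; the two FFT references
    # always run since NumpyFFTMethod and OpenCLFFTMethod default to True):
    #   ./MyDFT_10.py -h                 list OpenCL and CUDA devices, then exit
    #   ./MyDFT_10.py -n -y -a -s 4096   add the Naive, Numpy and Numba DFTs on 4096 points
    #   ./MyDFT_10.py -o -c -d 1 -t 32   add the OpenCL and CUDA kernels on device #1, 32 threads per block
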
    try:
        opts, args = getopt.getopt(sys.argv[1:],"vnyaochs:d:t:",["size=","device=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            Threads = int(arg)
        elif opt == '-n':
            NaiveMethod=True
        elif opt == '-y':
            NumpyMethod=True
        elif opt == '-a':
            NumbaMethod=True
        elif opt == '-o':
            OpenCLMethod=True
        elif opt == '-c':
            CUDAMethod=True

    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod )
    print("DFT Numpy computation %s " % NumpyMethod )
    print("FFT Numpy computation %s " % NumpyFFTMethod )
    print("DFT Numba computation %s " % NumbaMethod )
    print("DFT OpenCL computation %s " % OpenCLMethod )
    print("FFT OpenCL computation %s " % OpenCLFFTMethod )
    print("DFT CUDA computation %s " % CUDAMethod )

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.strip(),deviceType,device.name.strip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)
    # a_np = np.random.rand(SIZE).astype(np.float32)
    # b_np = np.random.rand(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

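    # Why (C_np,D_np) is a valid reference: the input is the constant
    # complex vector 1+1j, and the DFT of a constant signal is SIZE*(1+1j)
    # in bin 0 and zero in every other bin, so each implementation should
    # reproduce C_np and D_np up to float32 rounding. The "Precision"
    # lines below print the L2 norm of the difference against this
    # reference.
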
    # Native & Naive Implementation
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn=time.time()
        c_np,d_np=MyDFT(a_np,b_np)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ",np.linalg.norm(c_np-C_np),
              np.linalg.norm(d_np-D_np))

    # Native & Numpy Implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn=time.time()
        e_np,f_np=NumpyDFT(a_np,b_np)
        NumpyElapsed=time.time()-TimeIn
        NumpyRate=int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ",np.linalg.norm(e_np-C_np),
              np.linalg.norm(f_np-D_np))

    # Native & Numba Implementation
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn=time.time()
        g_np,h_np=NumbaDFT(a_np,b_np)
        NumbaElapsed=time.time()-TimeIn
        NumbaRate=int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ",np.linalg.norm(g_np-C_np),
              np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))

    # Numpy FFT Implementation
    if NumpyFFTMethod:
        print("Performing NumpyFFT implementation")
        TimeIn=time.time()
        m_np,n_np=NumpyFFT(a_np,b_np)
        NumpyFFTElapsed=time.time()-TimeIn
        NumpyFFTRate=int(SIZE/NumpyFFTElapsed)
        print("NumpyFFTElapsed: %.3f" % NumpyFFTElapsed)
        print("NumpyFFTRate: %i" % NumpyFFTRate)
        print("Precision: ",np.linalg.norm(m_np-C_np),
              np.linalg.norm(n_np-D_np))

    # OpenCL FFT Implementation (results kept separate from the OpenCL DFT)
    if OpenCLFFTMethod:
        print("Performing OpenCLFFT implementation")
        TimeIn=time.time()
        o_np,p_np=OpenCLFFT(a_np,b_np,Device)
        OpenCLFFTElapsed=time.time()-TimeIn
        OpenCLFFTRate=int(SIZE/OpenCLFFTElapsed)
        print("OpenCLFFTElapsed: %.3f" % OpenCLFFTElapsed)
        print("OpenCLFFTRate: %i" % OpenCLFFTRate)
        print("Precision: ",np.linalg.norm(o_np-C_np),
              np.linalg.norm(p_np-D_np))

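    # All the *Rate figures above are SIZE/elapsed, i.e. output samples per
    # second measured around the whole call: for the OpenCL and CUDA paths
    # this includes device selection, context creation, kernel compilation
    # and transfers, so small sizes strongly favour the CPU implementations.
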
    # Cross-checks between the methods that actually ran
    if OpenCLMethod and NumpyFFTMethod:
        print("OpenCL DFT vs Numpy FFT:")
        print("Precision: ",np.linalg.norm(m_np-i_np),
              np.linalg.norm(n_np-j_np))
        print((m_np-i_np),(n_np-j_np))
        print(i_np,j_np)
        print(m_np,n_np)

    if CUDAMethod and NumpyFFTMethod:
        print("CUDA DFT vs Numpy FFT:")
        print("Precision: ",np.linalg.norm(m_np-k_np),
              np.linalg.norm(n_np-l_np))
        print((m_np-k_np),(n_np-l_np))
        print(k_np,l_np)
        print(m_np,n_np)

    if OpenCLMethod and NumpyMethod:
        print("OpenCL DFT vs Numpy DFT:")
        print("Precision: ",np.linalg.norm(e_np-i_np),
              np.linalg.norm(f_np-j_np))
        print((e_np-i_np),(f_np-j_np))

    if NumpyFFTMethod and NumpyMethod:
        print("Numpy FFT vs Numpy DFT:")
        print("Precision: ",np.linalg.norm(e_np-m_np),
              np.linalg.norm(f_np-n_np))
        print(e_np,f_np)
        print(m_np,n_np)
        print((e_np-m_np),(f_np-n_np))

    if NumpyFFTMethod and NaiveMethod:
        print("Numpy FFT vs Naive DFT:")
        print("Precision: ",np.linalg.norm(c_np-m_np),
              np.linalg.norm(d_np-n_np))
        print(c_np,d_np)
        print(m_np,n_np)
        print((c_np-m_np),(d_np-n_np))

    if NumpyFFTMethod and NumbaMethod:
        print("Numpy FFT vs Numba DFT:")
        print("Precision: ",np.linalg.norm(g_np-m_np),
              np.linalg.norm(h_np-n_np))
        print(g_np,h_np)
        print(m_np,n_np)
        print((g_np-m_np),(h_np-n_np))

    if OpenCLFFTMethod and NumpyFFTMethod:
        print("NumpyOpenCLRatio: %f" % (OpenCLFFTRate/NumpyFFTRate))