#!/usr/bin/env python3

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Reference FFT with Numpy: pack the two real vectors as the complex vector x+iy
def NumpyFFT(x,y):
    xy=np.csingle(x+1.j*y)
    XY=np.fft.fft(xy)
    print(XY)
    return(XY.real,XY.imag)
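
# Minimal usage sketch (illustrative values, not from the original script):
#   X,Y = NumpyFFT(np.ones(8,dtype=np.float32),np.zeros(8,dtype=np.float32))
#   # X[0]==8.0 and every other bin ~0: the FFT of a constant is N at bin 0.
# (XY.real,XY.imag) serves as the reference the DFT variants are checked against.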

# FFT on an OpenCL device through gpyfft (Python wrapper around clFFT)
def OpenCLFFT(x,y,Device):
    import pyopencl as cl
    import pyopencl.array as cla
    import time
    import gpyfft
    from gpyfft.fft import FFT

    TimeIn=time.time()
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()
    Elapsed=time.time()-TimeIn
    print("Selection of device : %.3f" % Elapsed)

    TimeIn=time.time()
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")
        sys.exit()
    Elapsed=time.time()-TimeIn
    print("Context initialisation : %.3f" % Elapsed)

    TimeIn=time.time()
    XY_gpu = cla.to_device(queue, np.csingle(x+1.j*y))
    Elapsed=time.time()-TimeIn
    print("Copy from Host to Device : %.3f" % Elapsed)

    TimeIn=time.time()
    transform = FFT(ctx, queue, XY_gpu)
    event, = transform.enqueue()
    event.wait()
    Elapsed=time.time()-TimeIn
    print("Compute FFT : %.3f" % Elapsed)
    TimeIn=time.time()
    XY = XY_gpu.get()
    Elapsed=time.time()-TimeIn
    print("Copy from Device to Host : %.3f" % Elapsed)
    print(XY)
    return(XY.real,XY.imag)
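
# Note: when no separate output array is given, gpyfft transforms XY_gpu in
# place, which is why XY_gpu.get() above already holds the spectrum.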

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)+y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]-x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)
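
# For z = x + i*y these two loops are the textbook DFT
#   Z[k] = sum_j z[j]*exp(-2*pi*i*j*k/N)
# split into real and imaginary parts, so (X,Y) should match
# (XY.real,XY.imag) from NumpyFFT up to float32 rounding.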

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.add(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(-np.subtract(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
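
# Same O(N^2) algorithm as MyDFT, but the inner loop over j is replaced by
# vectorised operations on the precomputed angle table nj, so each output
# sample costs a few C-level array passes instead of N Python iterations.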

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.add(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(-np.subtract(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
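
# numba.prange with parallel=True distributes the outer loop over CPU cores.
# Keep in mind that the first call triggers JIT compilation, so the timing
# measured in __main__ below includes the compile time.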

# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")
        sys.exit()

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for results
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)
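
# The kernel is enqueued with global size a_np.shape, i.e. one work-item per
# output sample; each work-item loops over all N inputs, so the kernel does
# the same O(N^2) work as MyDFT, just spread across the device.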

# CUDA complete operation
def CUDADFT(a_np,b_np,Device,Threads):
    # import pycuda.autoinit
    try:
        # For PyCUDA import
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule

        drv.init()
        HasXPU=False
        for Id in range(drv.Device.count()):
            if Id==Device:
                XPU=drv.Device(Id)
                print("GPU selected %s" % XPU.name())
                HasXPU=True

    except ImportError:
        print("Platform does not seem to support CUDA")
        sys.exit()

    if HasXPU==False:
        print("No GPU #%i found in all of %i devices, sorry..." % (Device,drv.Device.count()))
        sys.exit()

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  uint size = gridDim.x*blockDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        print("Impossible : %i not multiple of %i, falling back to one thread per block..." % (Size,Threads))
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)
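
# gridDim.x*blockDim.x equals the vector length in both launch configurations,
# which matches the kernel's own size computation; drv.In/drv.Out wrap the
# host arrays and perform the host<->device copies around the launch.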

import sys
import time

if __name__=='__main__':

    SIZE=1024
    Device=0
    NaiveMethod=False
    NumpyFFTMethod=True
    OpenCLFFTMethod=False
    NumpyMethod=False
    NumbaMethod=False
    OpenCLMethod=False
    CUDAMethod=True
    Threads=1

    import getopt

    HowToUse='%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -h [Help] -s <SizeOfVector> -d <DeviceId> -t <threads>'
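
    # Example invocations (device ids and sizes are illustrative):
    #   ./MyDFT_10.py -n -y -s 4096      # naive and Numpy DFT on 4096 points
    #   ./MyDFT_10.py -o -d 1 -s 16384   # OpenCL DFT on device #1
    #   ./MyDFT_10.py -c -d 0 -t 256     # CUDA DFT with 256 threads per block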

    try:
        opts, args = getopt.getopt(sys.argv[1:],"nyaochs:d:t:",["size=","device=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            Threads = int(arg)
        elif opt == '-n':
            NaiveMethod=True
        elif opt == '-y':
            NumpyMethod=True
        elif opt == '-a':
            NumbaMethod=True
        elif opt == '-o':
            OpenCLMethod=True
        elif opt == '-c':
            CUDAMethod=True

    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod )
    print("DFT Numpy computation %s " % NumpyMethod )
    print("FFT Numpy computation %s " % NumpyFFTMethod )
    print("DFT Numba computation %s " % NumbaMethod )
    print("DFT OpenCL computation %s " % OpenCLMethod )
    print("DFT CUDA computation %s " % CUDAMethod )

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)
    # a_np = np.random.rand(SIZE).astype(np.float32)
    # b_np = np.random.rand(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)
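
    # Reference spectrum: the input is the constant vector 1+1j, whose exact
    # DFT is SIZE*(1+1j) at bin 0 and zero elsewhere, so (C_np,D_np) is the
    # expected (real,imag) result and the norms below measure the error.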

    # Native & Naive Implementation
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn=time.time()
        c_np,d_np=MyDFT(a_np,b_np)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ",np.linalg.norm(c_np-C_np),
              np.linalg.norm(d_np-D_np))

    # Native & Numpy Implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn=time.time()
        e_np,f_np=NumpyDFT(a_np,b_np)
        NumpyElapsed=time.time()-TimeIn
        NumpyRate=int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ",np.linalg.norm(e_np-C_np),
              np.linalg.norm(f_np-D_np))

    # Native & Numba Implementation
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn=time.time()
        g_np,h_np=NumbaDFT(a_np,b_np)
        NumbaElapsed=time.time()-TimeIn
        NumbaRate=int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ",np.linalg.norm(g_np-C_np),
              np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))

    # Numpy FFT Implementation
    if NumpyFFTMethod:
        print("Performing NumpyFFT implementation")
        TimeIn=time.time()
        m_np,n_np=NumpyFFT(a_np,b_np)
        NumpyFFTElapsed=time.time()-TimeIn
        NumpyFFTRate=int(SIZE/NumpyFFTElapsed)
        print("NumpyFFTElapsed: %.3f" % NumpyFFTElapsed)
        print("NumpyFFTRate: %i" % NumpyFFTRate)
        print("Precision: ",np.linalg.norm(m_np-C_np),
              np.linalg.norm(n_np-D_np))

    # OpenCL FFT Implementation
    if OpenCLFFTMethod:
        print("Performing OpenCL FFT implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLFFT(a_np,b_np,Device)
        OpenCLFFTElapsed=time.time()-TimeIn
        OpenCLFFTRate=int(SIZE/OpenCLFFTElapsed)
        print("OpenCLFFTElapsed: %.3f" % OpenCLFFTElapsed)
        print("OpenCLFFTRate: %i" % OpenCLFFTRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))
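
    # Cross-checks: each pair of enabled methods is compared through the L2
    # norm of the difference between their results; values near zero (up to
    # float32 rounding) mean the implementations agree.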

    if OpenCLMethod and NumpyFFTMethod:
        print(OpenCLMethod,NumpyFFTMethod)
        print("Precision: ",np.linalg.norm(m_np-i_np),
              np.linalg.norm(n_np-j_np))
        print((m_np-i_np),(n_np-j_np))
        print(i_np,j_np)
        print(m_np,n_np)
        print((i_np-m_np),(j_np-n_np))

    if CUDAMethod and NumpyFFTMethod:
        print(CUDAMethod,NumpyFFTMethod)
        print("Precision: ",np.linalg.norm(m_np-k_np),
              np.linalg.norm(n_np-l_np))
        print((m_np-k_np),(n_np-l_np))
        print(k_np,l_np)
        print(m_np,n_np)
        print((k_np-m_np),(l_np-n_np))

    if OpenCLMethod and NumpyMethod:
        print(OpenCLMethod,NumpyMethod)
        print("Precision: ",np.linalg.norm(e_np-i_np),
              np.linalg.norm(f_np-j_np))
        print((e_np-i_np),(f_np-j_np))

    if NumpyFFTMethod and NumpyMethod:
        print(NumpyFFTMethod,NumpyMethod)
        print("Precision: ",np.linalg.norm(e_np-m_np),
              np.linalg.norm(f_np-n_np))
        print(e_np,f_np)
        print(m_np,n_np)
        print((e_np-m_np),(f_np-n_np))

    if NumpyFFTMethod and NaiveMethod:
        print(NumpyFFTMethod,NaiveMethod)
        print("Precision: ",np.linalg.norm(c_np-m_np),
              np.linalg.norm(d_np-n_np))
        print(c_np,d_np)
        print(m_np,n_np)
        print((c_np-m_np),(d_np-n_np))

    if NumpyFFTMethod and NumbaMethod:
        print(NumpyFFTMethod,NumbaMethod)
        print("Precision: ",np.linalg.norm(g_np-m_np),
              np.linalg.norm(h_np-n_np))
        print(g_np,h_np)
        print(m_np,n_np)
        print((g_np-m_np),(h_np-n_np))

    if OpenCLFFTMethod and NumpyFFTMethod:
        print("NumpyOpenCLRatio: %f" % (OpenCLFFTRate/NumpyFFTRate))