Statistiques
| Révision :

root / ETSN / MyDFT_10.py @ 297

Historique | Voir | Annoter | Télécharger (15,17 ko)

1 281 equemene
#!/usr/bin/env python3
2 281 equemene
3 281 equemene
import numpy as np
4 281 equemene
import pyopencl as cl
5 281 equemene
from numpy import pi,cos,sin
6 281 equemene
7 281 equemene
#
8 281 equemene
def NumpyFFT(x,y):
    """Reference FFT of the complex vector x + 1j*y using numpy.fft.

    Parameters: x, y -- array-like real and imaginary parts (float32).
    Returns (real, imag) parts of the transformed vector as numpy arrays.
    """
    # Pack the two real vectors into one single-precision complex vector.
    xy=np.csingle(x+1.j*y)
    XY=np.fft.fft(xy)
    # BUG FIX: removed the leftover debug print of the full result vector,
    # which flooded stdout on every call for large sizes.
    return(XY.real,XY.imag)
13 281 equemene
14 281 equemene
#
15 281 equemene
def OpenCLFFT(x,y,device):
    """FFT of the complex vector x + 1j*y on an OpenCL device via gpyfft/clFFT.

    Parameters: x, y -- float32 vectors (real and imaginary parts);
    device -- integer flat index of the OpenCL device to select
    (counted across all platforms, in enumeration order).
    Returns (real, imag) parts of the transformed vector.
    Prints per-stage timings (selection, context, copies, compute).
    """
    import sys
    import pyopencl as cl
    import pyopencl.array as cla
    import time
    import gpyfft
    from gpyfft.fft import FFT

    TimeIn=time.time()
    Id=0
    HasXPU=False
    # BUG FIX: the original loop variable shadowed the 'device' parameter
    # and the comparison used the *global* 'Device', so the argument was
    # silently ignored. Use the parameter and a distinct loop name.
    for platform in cl.get_platforms():
        for dev in platform.get_devices():
            if Id==device:
                XPU=dev
                print("CPU/GPU selected: ",dev.name.lstrip())
                HasXPU=True
            Id+=1

    if not HasXPU:
        print("No XPU #%i found in all of %i devices, sorry..." % (device,Id-1))
        sys.exit()
    Elapsed=time.time()-TimeIn
    print("Selection of device : %.3f" % Elapsed)

    TimeIn=time.time()
    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except Exception:
        # BUG FIX: the original bare 'except' swallowed the error and then
        # crashed later with a NameError on 'ctx'; report and propagate.
        print("Crash during context creation")
        raise
    Elapsed=time.time()-TimeIn
    print("Context initialisation : %.3f" % Elapsed)

    TimeIn=time.time()
    XY_gpu = cla.to_device(queue, np.csingle(x+1.j*y))
    Elapsed=time.time()-TimeIn
    print("Copy from Host to Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # In-place transform of the device array.
    transform = FFT(ctx, queue, XY_gpu)
    event, = transform.enqueue()
    event.wait()
    Elapsed=time.time()-TimeIn
    print("Compute FFT : %.3f" % Elapsed)
    TimeIn=time.time()
    XY = XY_gpu.get()
    Elapsed=time.time()-TimeIn
    print("Copy from Device to Host : %.3f" % Elapsed)
    # BUG FIX: removed the leftover debug print of the full result vector.
    return(XY.real,XY.imag)
66 281 equemene
67 281 equemene
# Naive Discrete Fourier Transform
68 281 equemene
def MyDFT(x,y):
    """Naive O(N^2) discrete Fourier transform of the vector x + 1j*y.

    Uses the e^{+2*pi*i*j*k/N} sign convention (the conjugate of
    numpy.fft's forward transform).
    Returns (X, Y): float32 arrays with the real and imaginary parts.
    """
    n=x.shape[0]
    X=np.zeros(n).astype(np.float32)
    Y=np.zeros(n).astype(np.float32)
    for k in range(n):
        for j in range(n):
            # Accumulate the contribution of sample j to output bin k.
            angle=2.*pi*k*j/n
            X[k]=X[k]+x[j]*cos(angle)-y[j]*sin(angle)
            Y[k]=Y[k]+x[j]*sin(angle)+y[j]*cos(angle)
    return(X,Y)
77 281 equemene
78 281 equemene
# Numpy Discrete Fourier Transform
79 281 equemene
def NumpyDFT(x,y):
    """Row-at-a-time vectorized DFT of x + 1j*y (same convention as MyDFT).

    The inner sum over samples is done with numpy ufuncs; only the loop
    over output bins remains in Python.
    Returns (X, Y): float32 arrays with the real and imaginary parts.
    """
    n=x.shape[0]
    X=np.zeros(n).astype(np.float32)
    Y=np.zeros(n).astype(np.float32)
    # nj[j] = 2*pi*j/n, precomputed once and scaled by k for each bin.
    nj=np.multiply(2.0*np.pi/n,np.arange(n)).astype(np.float32)
    for k in range(n):
        c=np.cos(k*nj)
        s=np.sin(k*nj)
        X[k]=np.sum(np.subtract(np.multiply(c,x),np.multiply(s,y)))
        Y[k]=np.sum(np.add(np.multiply(s,x),np.multiply(c,y)))
    return(X,Y)
88 281 equemene
89 281 equemene
# Numba Discrete Fourier Transform
90 281 equemene
# Numba must be importable at definition time for the decorator below.
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    # JIT-compiled DFT: same per-bin formulation as NumpyDFT, but the
    # outer loop uses numba.prange so bins are computed in parallel.
    # Returns (X, Y): float32 real and imaginary parts, e^{+i...}
    # convention as in MyDFT.
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    # nj[j] = 2*pi*j/size, precomputed once and reused for every bin i.
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
101 281 equemene
102 281 equemene
# OpenCL complete operation
103 281 equemene
def OpenCLDFT(a_np,b_np,Device):
    """Naive DFT of a_np + 1j*b_np executed as an OpenCL kernel.

    One work-item computes one output bin; same e^{+i...} sign convention
    as MyDFT.
    Parameters: a_np, b_np -- float32 vectors (real/imaginary parts);
    Device -- integer flat index of the OpenCL device to select.
    Returns (A, B): float32 arrays with the real and imaginary parts.
    Prints per-stage timings.
    """
    # Locate the requested device by flat index across all platforms.
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if not HasXPU:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except Exception:
        # BUG FIX: the original bare 'except' swallowed the error and then
        # crashed later with a NameError on 'ctx'; report and propagate.
        print("Crash during context creation")
        raise

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Host-side arrays that will receive the results.
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    # BUG FIX: the original reused the stale TimeIn from the host
    # allocation above, so this timing also included the previous stage.
    TimeIn=time.time()
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Fetch the kernel entry point once; reusable for repeated calls.
    knl = prg.MyDFT
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Launch one work-item per element; let the runtime pick the
    # work-group size (None).
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)
198 281 equemene
199 281 equemene
# CUDA complete operation
200 281 equemene
def CUDADFT(a_np,b_np,Device,Threads):
    """Naive DFT of a_np + 1j*b_np executed as a CUDA kernel via PyCUDA.

    One CUDA thread computes one output bin; same e^{+i...} convention as
    MyDFT.
    Parameters: a_np, b_np -- float32 vectors (real/imaginary parts);
    Device -- integer index of the CUDA device to use;
    Threads -- requested threads per block. If it does not divide the
    vector size, the launch falls back to one thread per block.
    Returns (A, B): float32 arrays with the real and imaginary parts.
    """
    try:
        # Deferred import so the rest of the script works without CUDA.
        # BUG FIX: the original imported PyCUDA twice under two aliases
        # ('drv' outside the try, 'cuda' inside); a single alias suffices.
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule
    except ImportError:
        print("Platform does not seem to support CUDA")
        # BUG FIX: the original fell through after the failed import and
        # crashed on an undefined 'XPU'; bail out explicitly instead.
        sys.exit()

    drv.init()
    for Id in range(drv.Device.count()):
        if Id==Device:
            XPU=drv.Device(Id)
            print("GPU selected %s" % XPU.name())

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  uint size = gridDim.x*blockDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        # BUG FIX: the format arguments were swapped in the original
        # message (it printed "Threads not multiple of Size").
        print("Impossible : %i not multiple of %i..." % (Size,Threads))
        TimeIn=time.time()
        # Fallback launch: one thread per block, one block per element.
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)
276 281 equemene
277 281 equemene
import sys
278 281 equemene
import time
279 281 equemene
280 281 equemene
if __name__=='__main__':

    # Command-line defaults: both FFT references (Numpy and OpenCL/clFFT)
    # are enabled by default; the explicit DFT implementations are opt-in
    # via the flags parsed below.
    SIZE=1024
    Device=0
    NaiveMethod=False
    NumpyFFTMethod=True
    OpenCLFFTMethod=True
    NumpyMethod=False
    NumbaMethod=False
    OpenCLMethod=False
    CUDAMethod=False
    Threads=1

    import getopt

    HowToUse='%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <threads>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"nyaochs:d:t:",["size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            # Help: print usage, enumerate OpenCL then CUDA devices, exit.
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                # NOTE(review): bare 'print' below is a Python-2 leftover;
                # in Python 3 it is a no-op expression statement.
                print
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            # NOTE(review): '-t' has no matching "threads=" entry in the
            # long-options list above; only the short form works.
            Threads = int(arg)
        elif opt in ("-n"):
            NaiveMethod=True
        elif opt in ("-y"):
            NumpyMethod=True
        elif opt in ("-a"):
            NumbaMethod=True
        elif opt in ("-o"):
            OpenCLMethod=True
        elif opt in ("-c"):
            CUDAMethod=True

    # Echo the effective configuration.
    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod )
    print("DFT Numpy computation %s " % NumpyMethod )
    print("DFT Numba computation %s " % NumbaMethod )
    print("DFT OpenCL computation %s " % OpenCLMethod )
    print("DFT CUDA computation %s " % CUDAMethod )

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                    # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")



    # Input signal: constant (all-ones) real and imaginary parts.
    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    # Reference for the precision checks below: the transform of a
    # constant vector is SIZE in bin 0 and zero everywhere else.
    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # Native & Naive Implementation
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn=time.time()
        c_np,d_np=MyDFT(a_np,b_np)
        NativeElapsed=time.time()-TimeIn
        # Rate = elements processed per second.
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ",np.linalg.norm(c_np-C_np),
              np.linalg.norm(d_np-D_np))

    # Native & Numpy Implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn=time.time()
        e_np,f_np=NumpyDFT(a_np,b_np)
        NumpyElapsed=time.time()-TimeIn
        NumpyRate=int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ",np.linalg.norm(e_np-C_np),
              np.linalg.norm(f_np-D_np))

    # Native & Numba Implementation
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn=time.time()
        g_np,h_np=NumbaDFT(a_np,b_np)
        # NOTE(review): this first call includes Numba's JIT compilation
        # time, so the reported rate underestimates steady-state speed.
        NumbaElapsed=time.time()-TimeIn
        NumbaRate=int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ",np.linalg.norm(g_np-C_np),
              np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))

    if NumpyFFTMethod:
        print("Performing NumpyFFT implementation")
        TimeIn=time.time()
        m_np,n_np=NumpyFFT(a_np,b_np)
        NumpyFFTElapsed=time.time()-TimeIn
        NumpyFFTRate=int(SIZE/NumpyFFTElapsed)
        # NOTE(review): '%i' truncates the elapsed time to whole seconds;
        # '%f' was probably intended.
        print("NumpyFFTElapsed: %i" % NumpyFFTElapsed)
        print("NumpyFFTRate: %i" % NumpyFFTRate)
        print("Precision: ",np.linalg.norm(m_np-C_np),
              np.linalg.norm(n_np-D_np))

    # OpenCL Implementation
    if OpenCLFFTMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLFFT(a_np,b_np,Device)
        OpenCLFFTElapsed=time.time()-TimeIn
        OpenCLFFTRate=int(SIZE/OpenCLFFTElapsed)
        # NOTE(review): '%i' truncates the elapsed time to whole seconds.
        print("OpenCLElapsed: %i" % OpenCLFFTElapsed)
        print("OpenCLRate: %i" % OpenCLFFTRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    if OpenCLFFTMethod and NumpyFFTMethod:
        # Speed-up of the OpenCL FFT relative to Numpy's FFT.
        print("NumpyOpenCLRatio: %f" % (OpenCLFFTRate/NumpyFFTRate))