Statistiques
| Révision :

root / ETSN / MyDFT2D.py

Historique | Voir | Annoter | Télécharger (16,44 ko)

1 303 equemene
#!/usr/bin/env python3
2 303 equemene
3 303 equemene
import numpy as np
4 303 equemene
import pyopencl as cl
5 303 equemene
from numpy import pi,cos,sin
6 303 equemene
7 303 equemene
# Naive Discrete Fourier Transform
8 303 equemene
def MyDFT(x,y):
9 303 equemene
    size=x.shape[0]
10 303 equemene
    X=np.zeros(x.shape).astype(np.float32)
11 303 equemene
    Y=np.zeros(x.shape).astype(np.float32)
12 303 equemene
    for k in range(size):
13 303 equemene
        for l in range(size):
14 303 equemene
            for i in range(size):
15 303 equemene
                for j in range(size):
16 303 equemene
                    t=np.float32(2*pi*((i*k)/size+(l*j)/size))
17 303 equemene
                    X[k,l]+=x[i,j]*cos(t)+y[i,j]*sin(t)
18 303 equemene
                    Y[k,l]+=-x[i,j]*sin(t)+y[i,j]*cos(t)
19 303 equemene
    return(X,Y)
20 303 equemene
21 303 equemene
#
22 303 equemene
def NumpyFFT(x,y):
23 303 equemene
    xy=np.csingle(x+1.j*y)
24 303 equemene
    XY=np.fft.fft2(xy)
25 303 equemene
    return(XY.real,XY.imag)
26 303 equemene
27 303 equemene
def OpenCLFFT(x,y,device):
28 303 equemene
    import pyopencl as cl
29 303 equemene
    import pyopencl.array as cla
30 303 equemene
    import time
31 303 equemene
    import gpyfft
32 303 equemene
    from gpyfft.fft import FFT
33 303 equemene
34 303 equemene
    TimeIn=time.time()
35 303 equemene
    Id=0
36 303 equemene
    HasXPU=False
37 303 equemene
    for platform in cl.get_platforms():
38 303 equemene
        for device in platform.get_devices():
39 303 equemene
            if Id==Device:
40 303 equemene
                XPU=device
41 303 equemene
                print("CPU/GPU selected: ",device.name.lstrip())
42 303 equemene
                HasXPU=True
43 303 equemene
            Id+=1
44 303 equemene
            # print(Id)
45 303 equemene
46 303 equemene
    if HasXPU==False:
47 303 equemene
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
48 303 equemene
        sys.exit()
49 303 equemene
    Elapsed=time.time()-TimeIn
50 303 equemene
    print("Selection of device : %.3f" % Elapsed)
51 303 equemene
52 303 equemene
    TimeIn=time.time()
53 303 equemene
    try:
54 303 equemene
        ctx = cl.Context(devices=[XPU])
55 303 equemene
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
56 303 equemene
    except:
57 303 equemene
        print("Crash during context creation")
58 303 equemene
    Elapsed=time.time()-TimeIn
59 303 equemene
    print("Context initialisation : %.3f" % Elapsed)
60 303 equemene
61 303 equemene
    TimeIn=time.time()
62 303 equemene
    XY_gpu = cla.to_device(queue, np.csingle(x+1.j*y))
63 303 equemene
    Elapsed=time.time()-TimeIn
64 303 equemene
    print("Copy from Host to Device : %.3f" % Elapsed)
65 303 equemene
66 303 equemene
    TimeIn=time.time()
67 303 equemene
    transform = FFT(ctx, queue, XY_gpu, axes=(0,1))
68 303 equemene
    event, = transform.enqueue()
69 303 equemene
    event.wait()
70 303 equemene
    Elapsed=time.time()-TimeIn
71 303 equemene
    print("Compute FFT : %.3f" % Elapsed)
72 303 equemene
    TimeIn=time.time()
73 303 equemene
    XY = XY_gpu.get()
74 303 equemene
    Elapsed=time.time()-TimeIn
75 303 equemene
    print("Copy from Device to Host : %.3f" % Elapsed)
76 303 equemene
77 303 equemene
    return(XY.real,XY.imag)
78 303 equemene
79 303 equemene
# # Numpy Discrete Fourier Transform
80 303 equemene
# def NumpyDFT(x,y):
81 303 equemene
#     size=x.shape[0]
82 303 equemene
#     X=np.zeros([size,size]).astype(np.float32)
83 303 equemene
#     Y=np.zeros([size,size]).astype(np.float32)
84 303 equemene
#     nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
85 303 equemene
#     for k in range(size):
86 303 equemene
#         for l in range(size):
87 303 equemene
#         X[k]=np.sum(np.subtract(np.multiply(np.cos(k*nj),x),np.multiply(np.sin(k*nj),y)))
88 303 equemene
#         Y[k]=np.sum(np.add(np.multiply(np.sin(k*nj),x),np.multiply(np.cos(k*nj),y)))
89 303 equemene
#     return(X,Y)
90 303 equemene
91 303 equemene
# Numba Discrete Fourier Transform
92 303 equemene
import numba
93 303 equemene
@numba.njit(parallel=True)
94 303 equemene
def NumbaDFT(x,y):
95 303 equemene
    size=x.shape[0]
96 303 equemene
    X=np.zeros(x.shape).astype(np.float32)
97 303 equemene
    Y=np.zeros(y.shape).astype(np.float32)
98 303 equemene
    for k in numba.prange(size):
99 303 equemene
        for l in numba.prange(size):
100 303 equemene
            for i in numba.prange(size):
101 303 equemene
                for j in numba.prange(size):
102 303 equemene
                    t=np.float32(2*pi*((i*k)/size+(l*j)/size))
103 303 equemene
                    X[k,l]+=x[i,j]*cos(t)+y[i,j]*sin(t)
104 303 equemene
                    Y[k,l]+=-x[i,j]*sin(t)+y[i,j]*cos(t)
105 303 equemene
    return(X,Y)
106 303 equemene
107 303 equemene
# OpenCL complete operation
108 303 equemene
def OpenCLDFT(a_np,b_np,Device):
109 303 equemene
110 303 equemene
    Id=0
111 303 equemene
    HasXPU=False
112 303 equemene
    for platform in cl.get_platforms():
113 303 equemene
        for device in platform.get_devices():
114 303 equemene
            if Id==Device:
115 303 equemene
                XPU=device
116 303 equemene
                print("CPU/GPU selected: ",device.name.lstrip())
117 303 equemene
                HasXPU=True
118 303 equemene
            Id+=1
119 303 equemene
            # print(Id)
120 303 equemene
121 303 equemene
    if HasXPU==False:
122 303 equemene
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1))
123 303 equemene
        sys.exit()
124 303 equemene
125 303 equemene
    try:
126 303 equemene
        ctx = cl.Context(devices=[XPU])
127 303 equemene
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
128 303 equemene
    except:
129 303 equemene
        print("Crash during context creation")
130 303 equemene
131 303 equemene
    TimeIn=time.time()
132 303 equemene
    # Copy from Host to Device using pointers
133 303 equemene
    mf = cl.mem_flags
134 303 equemene
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
135 303 equemene
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
136 303 equemene
    Elapsed=time.time()-TimeIn
137 303 equemene
    print("Copy from Host 2 Device : %.3f" % Elapsed)
138 303 equemene
139 303 equemene
    TimeIn=time.time()
140 303 equemene
    # Definition of kernel under OpenCL
141 303 equemene
    prg = cl.Program(ctx, """
142 303 equemene

143 303 equemene
#define PI 3.141592653589793
144 303 equemene

145 303 equemene
__kernel void MyDFT(
146 303 equemene
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
147 303 equemene
{
148 303 equemene
  int gidx = get_global_id(0);
149 303 equemene
  int gidy = get_global_id(1);
150 303 equemene
  uint size = get_global_size(0);
151 303 equemene
  float A=0.,B=0.;
152 303 equemene
  for (uint i=0; i<size;i++) for (uint j=0; j<size;j++)
153 303 equemene
  {
154 303 equemene
     float angle=2.*PI*((float)(gidx*i)/(float)size+
155 303 equemene
                        (float)(gidy*j)/(float)size);
156 303 equemene
     A+=a_g[i+size*j]*cos(angle)+b_g[i+size*j]*sin(angle);
157 303 equemene
     B+=-a_g[i+size*j]*sin(angle)+b_g[i+size*j]*cos(angle);
158 303 equemene
  }
159 303 equemene
  A_g[gidx+size*gidy]=A;
160 303 equemene
  B_g[gidx+size*gidy]=B;
161 303 equemene
}
162 303 equemene
""").build()
163 303 equemene
    Elapsed=time.time()-TimeIn
164 303 equemene
    print("Building kernels : %.3f" % Elapsed)
165 303 equemene
166 303 equemene
    TimeIn=time.time()
167 303 equemene
    # Memory allocation on Device for result
168 303 equemene
    A_ocl = np.empty_like(a_np)
169 303 equemene
    B_ocl = np.empty_like(a_np)
170 303 equemene
    Elapsed=time.time()-TimeIn
171 303 equemene
    print("Allocation on Host for results : %.3f" % Elapsed)
172 303 equemene
173 303 equemene
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
174 303 equemene
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
175 303 equemene
    Elapsed=time.time()-TimeIn
176 303 equemene
    print("Allocation on Device for results : %.3f" % Elapsed)
177 303 equemene
178 303 equemene
    TimeIn=time.time()
179 303 equemene
    # Synthesis of function "sillysum" inside Kernel Sources
180 303 equemene
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
181 303 equemene
    Elapsed=time.time()-TimeIn
182 303 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
183 303 equemene
184 303 equemene
    TimeIn=time.time()
185 303 equemene
    # Call of kernel previously defined
186 303 equemene
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
187 303 equemene
    #
188 303 equemene
    CallCL.wait()
189 303 equemene
    Elapsed=time.time()-TimeIn
190 303 equemene
    print("Execution of kernel : %.3f" % Elapsed)
191 303 equemene
192 303 equemene
    TimeIn=time.time()
193 303 equemene
    # Copy from Device to Host
194 303 equemene
    cl.enqueue_copy(queue, A_ocl, A_g)
195 303 equemene
    cl.enqueue_copy(queue, B_ocl, B_g)
196 303 equemene
    Elapsed=time.time()-TimeIn
197 303 equemene
    print("Copy from Device 2 Host : %.3f" % Elapsed)
198 303 equemene
199 303 equemene
    # Liberation of memory
200 303 equemene
    a_g.release()
201 303 equemene
    b_g.release()
202 303 equemene
    A_g.release()
203 303 equemene
    B_g.release()
204 303 equemene
205 303 equemene
    return(A_ocl,B_ocl)
206 303 equemene
207 303 equemene
# CUDA complete operation
208 303 equemene
def CUDADFT(a_np,b_np,Device,Threads):
209 303 equemene
    # import pycuda.autoinit
210 303 equemene
    import pycuda.driver as drv
211 303 equemene
    from pycuda.compiler import SourceModule
212 303 equemene
213 303 equemene
    try:
214 303 equemene
        # For PyCUDA import
215 303 equemene
        import pycuda.driver as cuda
216 303 equemene
        from pycuda.compiler import SourceModule
217 303 equemene
218 303 equemene
        cuda.init()
219 303 equemene
        for Id in range(cuda.Device.count()):
220 303 equemene
            if Id==Device:
221 303 equemene
                XPU=cuda.Device(Id)
222 303 equemene
                print("GPU selected %s" % XPU.name())
223 303 equemene
        print
224 303 equemene
225 303 equemene
    except ImportError:
226 303 equemene
        print("Platform does not seem to support CUDA")
227 303 equemene
228 303 equemene
    Context=XPU.make_context()
229 303 equemene
230 303 equemene
    TimeIn=time.time()
231 303 equemene
    mod = SourceModule("""
232 303 equemene

233 303 equemene
#define PI 3.141592653589793
234 303 equemene

235 303 equemene
__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
236 303 equemene
{
237 303 equemene
  const int gidx = blockIdx.x*blockDim.x+threadIdx.x;
238 303 equemene
  const int gidy = blockIdx.y*blockDim.y+threadIdx.y;
239 303 equemene
  uint sizex = gridDim.x*blockDim.x;
240 303 equemene
  uint sizey = gridDim.y*blockDim.y;
241 303 equemene
  uint size = gridDim.x*blockDim.x*gridDim.y*blockDim.y;
242 303 equemene
  float A=0.,B=0.;
243 303 equemene
  for (uint i=0; i<sizex;i++) for (uint j=0; j<sizey;j++)
244 303 equemene
  {
245 303 equemene
     float angle=2.*PI*((float)(gidx*i)/(float)sizex+
246 303 equemene
                        (float)(gidy*j)/(float)sizey);
247 303 equemene
     A+=a_g[i+sizex*j]*cos(angle)+b_g[i+sizex*j]*sin(angle);
248 303 equemene
     B+=-a_g[i+sizex*j]*sin(angle)+b_g[i+sizex*j]*cos(angle);
249 303 equemene
  }
250 303 equemene
  A_g[gidx+sizey*gidy]=A;
251 303 equemene
  B_g[gidx+sizey*gidy]=B;
252 303 equemene
}
253 303 equemene

254 303 equemene
""")
255 303 equemene
    Elapsed=time.time()-TimeIn
256 303 equemene
    print("Definition of kernel : %.3f" % Elapsed)
257 303 equemene
258 303 equemene
    TimeIn=time.time()
259 303 equemene
    MyDFT = mod.get_function("MyDFT")
260 303 equemene
    Elapsed=time.time()-TimeIn
261 303 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
262 303 equemene
263 303 equemene
    TimeIn=time.time()
264 303 equemene
    A_np = np.zeros_like(a_np)
265 303 equemene
    B_np = np.zeros_like(a_np)
266 303 equemene
    Elapsed=time.time()-TimeIn
267 303 equemene
    print("Allocation on Host for results : %.3f" % Elapsed)
268 303 equemene
269 303 equemene
    Size=a_np.shape
270 303 equemene
    if (Size[0] % Threads != 0):
271 303 equemene
        print("Impossible : %i not multiple of %i..." % (Threads,Size[0]) )
272 303 equemene
        TimeIn=time.time()
273 303 equemene
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
274 303 equemene
              block=(1,1,1), grid=Size)
275 303 equemene
        Elapsed=time.time()-TimeIn
276 303 equemene
        print("Execution of kernel : %.3f" % Elapsed)
277 303 equemene
    else:
278 303 equemene
        Blocks=(int(Size[0]/Threads),int(Size[1]/Threads));
279 303 equemene
        TimeIn=time.time()
280 303 equemene
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
281 303 equemene
              block=(Threads,Threads,1), grid=Blocks)
282 303 equemene
        Elapsed=time.time()-TimeIn
283 303 equemene
        print("Execution of kernel : %.3f" % Elapsed)
284 303 equemene
285 303 equemene
    Context.pop()
286 303 equemene
    Context.detach()
287 303 equemene
288 303 equemene
    return(A_np,B_np)
289 303 equemene
290 303 equemene
import sys
291 303 equemene
import time
292 303 equemene
293 303 equemene
if __name__=='__main__':
294 303 equemene
295 303 equemene
    SIZE=4
296 303 equemene
    Device=0
297 303 equemene
    NaiveMethod=False
298 303 equemene
    NumpyMethod=False
299 303 equemene
    NumpyFFTMethod=True
300 303 equemene
    NumbaMethod=False
301 303 equemene
    OpenCLMethod=False
302 303 equemene
    OpenCLFFTMethod=True
303 303 equemene
    CUDAMethod=False
304 303 equemene
    Threads=1
305 303 equemene
    Verbose=False
306 303 equemene
307 303 equemene
    import getopt
308 303 equemene
309 303 equemene
    HowToUse='%s -v [Verbose] -n [Naive] -y [numpYFFT] -a [numbA] -o [OpenCL] -g [OpenCLFFT] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <threads>'
310 303 equemene
311 303 equemene
    try:
312 303 equemene
        opts, args = getopt.getopt(sys.argv[1:],"gvnyaochs:d:t:",["size=","device="])
313 303 equemene
    except getopt.GetoptError:
314 303 equemene
        print(HowToUse % sys.argv[0])
315 303 equemene
        sys.exit(2)
316 303 equemene
317 303 equemene
    # List of Devices
318 303 equemene
    Devices=[]
319 303 equemene
    Alu={}
320 303 equemene
321 303 equemene
    for opt, arg in opts:
322 303 equemene
        if opt == '-h':
323 303 equemene
            print(HowToUse % sys.argv[0])
324 303 equemene
325 303 equemene
            print("\nInformations about devices detected under OpenCL API:")
326 303 equemene
            # For PyOpenCL import
327 303 equemene
            try:
328 303 equemene
                import pyopencl as cl
329 303 equemene
                Id=0
330 303 equemene
                for platform in cl.get_platforms():
331 303 equemene
                    for device in platform.get_devices():
332 303 equemene
                        #deviceType=cl.device_type.to_string(device.type)
333 303 equemene
                        deviceType="xPU"
334 303 equemene
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
335 303 equemene
                        Id=Id+1
336 303 equemene
337 303 equemene
            except:
338 303 equemene
                print("Your platform does not seem to support OpenCL")
339 303 equemene
340 303 equemene
            print("\nInformations about devices detected under CUDA API:")
341 303 equemene
            # For PyCUDA import
342 303 equemene
            try:
343 303 equemene
                import pycuda.driver as cuda
344 303 equemene
                cuda.init()
345 303 equemene
                for Id in range(cuda.Device.count()):
346 303 equemene
                    device=cuda.Device(Id)
347 303 equemene
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
348 303 equemene
                print
349 303 equemene
            except:
350 303 equemene
                print("Your platform does not seem to support CUDA")
351 303 equemene
352 303 equemene
            sys.exit()
353 303 equemene
354 303 equemene
        elif opt in ("-d", "--device"):
355 303 equemene
            Device=int(arg)
356 303 equemene
        elif opt in ("-s", "--size"):
357 303 equemene
            SIZE = int(arg)
358 303 equemene
        elif opt in ("-t", "--threads"):
359 303 equemene
            Threads = int(arg)
360 303 equemene
        elif opt in ("-n"):
361 303 equemene
            NaiveMethod=True
362 303 equemene
        elif opt in ("-y"):
363 303 equemene
            NumpyFFTMethod=True
364 303 equemene
        elif opt in ("-a"):
365 303 equemene
            NumbaMethod=True
366 303 equemene
        elif opt in ("-o"):
367 303 equemene
            OpenCLMethod=True
368 303 equemene
        elif opt in ("-g"):
369 303 equemene
            OpenCLFFTMethod=True
370 303 equemene
        elif opt in ("-c"):
371 303 equemene
            CUDAMethod=True
372 303 equemene
        elif opt in ("-v"):
373 303 equemene
            Verbose=True
374 303 equemene
375 303 equemene
    print("Device Selection : %i" % Device)
376 303 equemene
    print("Size of complex vector : %i" % SIZE)
377 303 equemene
    print("Verbosity %s " % Verbose )
378 303 equemene
    print("DFT Naive computation %s " % NaiveMethod )
379 303 equemene
    print("DFT Numpy computation %s " % NumpyMethod )
380 303 equemene
    print("FFT Numpy computation %s " % NumpyFFTMethod )
381 303 equemene
    print("DFT Numba computation %s " % NumbaMethod )
382 303 equemene
    print("DFT OpenCL computation %s " % OpenCLMethod )
383 303 equemene
    print("FFT OpenCL computation %s " % OpenCLFFTMethod )
384 303 equemene
    print("DFT CUDA computation %s " % CUDAMethod )
385 303 equemene
386 303 equemene
    if CUDAMethod:
387 303 equemene
        try:
388 303 equemene
            # For PyCUDA import
389 303 equemene
            import pycuda.driver as cuda
390 303 equemene
391 303 equemene
            cuda.init()
392 303 equemene
            for Id in range(cuda.Device.count()):
393 303 equemene
                device=cuda.Device(Id)
394 303 equemene
                print("Device #%i of type GPU : %s" % (Id,device.name()))
395 303 equemene
                if Id in Devices:
396 303 equemene
                    Alu[Id]='GPU'
397 303 equemene
398 303 equemene
        except ImportError:
399 303 equemene
            print("Platform does not seem to support CUDA")
400 303 equemene
401 303 equemene
    if OpenCLMethod:
402 303 equemene
        try:
403 303 equemene
            # For PyOpenCL import
404 303 equemene
            import pyopencl as cl
405 303 equemene
            Id=0
406 303 equemene
            for platform in cl.get_platforms():
407 303 equemene
                for device in platform.get_devices():
408 303 equemene
                    #deviceType=cl.device_type.to_string(device.type)
409 303 equemene
                    deviceType="xPU"
410 303 equemene
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
411 303 equemene
412 303 equemene
                    if Id in Devices:
413 303 equemene
                    # Set the Alu as detected Device Type
414 303 equemene
                        Alu[Id]=deviceType
415 303 equemene
                    Id=Id+1
416 303 equemene
        except ImportError:
417 303 equemene
            print("Platform does not seem to support OpenCL")
418 303 equemene
419 303 equemene
420 303 equemene
421 303 equemene
    a_np = np.ones([SIZE,SIZE]).astype(np.float32)
422 303 equemene
    b_np = np.ones([SIZE,SIZE]).astype(np.float32)
423 303 equemene
    # a_np = np.zeros([SIZE,SIZE]).astype(np.float32)
424 303 equemene
    # b_np = np.zeros([SIZE,SIZE]).astype(np.float32)
425 303 equemene
    # a_np[0,0]=1;
426 303 equemene
427 303 equemene
    np.set_printoptions(precision=1,suppress=True)
428 303 equemene
429 303 equemene
    # print(a_np+1.j*b_np)
430 303 equemene
431 303 equemene
    # print(np.fft.fft2(a_np+1.j*b_np))
432 303 equemene
433 303 equemene
    C_np = np.zeros([SIZE,SIZE]).astype(np.float32)
434 303 equemene
    D_np = np.zeros([SIZE,SIZE]).astype(np.float32)
435 303 equemene
    C_np[0,0] = np.float32(SIZE*SIZE)
436 303 equemene
    D_np[0,0] = np.float32(SIZE*SIZE)
437 303 equemene
438 303 equemene
    # Native & Naive Implementation
439 303 equemene
    if NaiveMethod:
440 303 equemene
        print("Performing naive implementation")
441 303 equemene
        TimeIn=time.time()
442 303 equemene
        c_np,d_np=MyDFT(a_np,b_np)
443 303 equemene
        NativeElapsed=time.time()-TimeIn
444 303 equemene
        NativeRate=int(SIZE*SIZE/NativeElapsed)
445 303 equemene
        print("NativeElapsed: %i" % NativeElapsed)
446 303 equemene
        print("NativeRate: %i" % NativeRate)
447 303 equemene
        print("Precision: ",np.linalg.norm(c_np-C_np),
448 303 equemene
              np.linalg.norm(d_np-D_np))
449 303 equemene
        if Verbose:
450 303 equemene
            print(c_np+1.j*d_np)
451 303 equemene
452 303 equemene
    # Native & Numpy Implementation
453 303 equemene
    if NumpyFFTMethod:
454 303 equemene
        print("Performing Numpy FFT implementation")
455 303 equemene
        TimeIn=time.time()
456 303 equemene
        e_np,f_np=NumpyFFT(a_np,b_np)
457 303 equemene
        NumpyFFTElapsed=time.time()-TimeIn
458 303 equemene
        NumpyFFTRate=int(SIZE*SIZE/NumpyFFTElapsed)
459 303 equemene
        print("NumpyFFTElapsed: %i" % NumpyFFTElapsed)
460 303 equemene
        print("NumpyFFTRate: %i" % NumpyFFTRate)
461 303 equemene
        print("Precision: ",np.linalg.norm(e_np-C_np),
462 303 equemene
              np.linalg.norm(f_np-D_np))
463 303 equemene
        if Verbose:
464 303 equemene
            print(e_np+1.j*f_np)
465 303 equemene
466 303 equemene
    # Native & Numba Implementation
467 303 equemene
    if NumbaMethod:
468 303 equemene
        print("Performing Numba implementation")
469 303 equemene
        TimeIn=time.time()
470 303 equemene
        g_np,h_np=NumbaDFT(a_np,b_np)
471 303 equemene
        NumbaElapsed=time.time()-TimeIn
472 303 equemene
        NumbaRate=int(SIZE*SIZE/NumbaElapsed)
473 303 equemene
        print("NumbaElapsed: %i" % NumbaElapsed)
474 303 equemene
        print("NumbaRate: %i" % NumbaRate)
475 303 equemene
        print("Precision: ",np.linalg.norm(g_np-C_np),
476 303 equemene
              np.linalg.norm(h_np-D_np))
477 303 equemene
        if Verbose:
478 303 equemene
            print(g_np+1.j*h_np)
479 303 equemene
480 303 equemene
    # OpenCL Implementation
481 303 equemene
    if OpenCLMethod:
482 303 equemene
        print("Performing OpenCL implementation")
483 303 equemene
        TimeIn=time.time()
484 303 equemene
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
485 303 equemene
        OpenCLElapsed=time.time()-TimeIn
486 303 equemene
        OpenCLRate=int(SIZE*SIZE/OpenCLElapsed)
487 303 equemene
        print("OpenCLElapsed: %i" % OpenCLElapsed)
488 303 equemene
        print("OpenCLRate: %i" % OpenCLRate)
489 303 equemene
        print("Precision: ",np.linalg.norm(i_np-C_np),
490 303 equemene
              np.linalg.norm(j_np-D_np))
491 303 equemene
        if Verbose:
492 303 equemene
            print(i_np+1.j*j_np)
493 303 equemene
494 303 equemene
    # CUDA Implementation
495 303 equemene
    if CUDAMethod:
496 303 equemene
        print("Performing CUDA implementation")
497 303 equemene
        TimeIn=time.time()
498 303 equemene
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
499 303 equemene
        CUDAElapsed=time.time()-TimeIn
500 303 equemene
        CUDARate=int(SIZE*SIZE/CUDAElapsed)
501 303 equemene
        print("CUDAElapsed: %i" % CUDAElapsed)
502 303 equemene
        print("CUDARate: %i" % CUDARate)
503 303 equemene
        print("Precision: ",np.linalg.norm(k_np-C_np),
504 303 equemene
              np.linalg.norm(l_np-D_np))
505 303 equemene
        if Verbose:
506 303 equemene
            print(k_np+1.j*l_np)
507 303 equemene
508 303 equemene
    # OpenCL Implementation
509 303 equemene
    if OpenCLFFTMethod:
510 303 equemene
        print("Performing OpenCL FFT implementation")
511 303 equemene
        TimeIn=time.time()
512 303 equemene
        m_np,n_np=OpenCLFFT(a_np,b_np,Device)
513 303 equemene
        OpenCLFFTElapsed=time.time()-TimeIn
514 303 equemene
        OpenCLFFTRate=int(SIZE*SIZE/OpenCLFFTElapsed)
515 303 equemene
        print("OpenCLFFTElapsed: %i" % OpenCLFFTElapsed)
516 303 equemene
        print("OpenCLFFTRate: %i" % OpenCLFFTRate)
517 303 equemene
        print("Precision: ",np.linalg.norm(m_np-C_np),
518 303 equemene
              np.linalg.norm(n_np-D_np))
519 303 equemene
        if Verbose:
520 303 equemene
            print(m_np+1.j*n_np)