Statistiques
| Révision :

root / ETSN / MyDFT_9.py @ 299

Historique | Voir | Annoter | Télécharger (12,52 ko)

1 274 equemene
#!/usr/bin/env python3
2 274 equemene
3 274 equemene
import numpy as np
4 274 equemene
import pyopencl as cl
5 274 equemene
from numpy import pi,cos,sin
6 274 equemene
7 274 equemene
# Naive Discrete Fourier Transform
def MyDFT(x,y):
    """Compute the DFT of the complex signal x + i*y with explicit loops.

    x, y: 1-D numpy arrays holding the real and imaginary parts.
    Returns (X, Y): float32 arrays with the real and imaginary parts of
    the transform. O(n^2) reference implementation.
    """
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    # Hoist the loop-invariant angular step; the original recomputed
    # 2*pi*i*j/size and its cos/sin four times per inner iteration.
    w=2.*pi/size
    for i in range(size):
        for j in range(size):
            angle=w*i*j
            c=cos(angle)
            s=sin(angle)
            # Row i of the DFT matrix applied to x + i*y:
            # real part cos*x - sin*y, imaginary part sin*x + cos*y
            X[i]=X[i]+x[j]*c-y[j]*s
            Y[i]=Y[i]+x[j]*s+y[j]*c
    return(X,Y)
# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    """Compute the DFT of x + i*y using numpy vector operations per row.

    x, y: 1-D numpy arrays holding the real and imaginary parts.
    Returns (X, Y): float32 arrays with the transform.
    """
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    # nj[j] = 2*pi*j/size: angular step for each sample index j
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        # Evaluate each trigonometric row once; the original computed
        # both np.cos(i*nj) and np.sin(i*nj) twice per iteration.
        ci=np.cos(i*nj)
        si=np.sin(i*nj)
        X[i]=np.sum(np.subtract(np.multiply(ci,x),np.multiply(si,y)))
        Y[i]=np.sum(np.add(np.multiply(si,x),np.multiply(ci,y)))
    return(X,Y)
# Numba Discrete Fourier Transform
import numba
# JIT-compile the same per-row numpy formulation as NumpyDFT and let
# Numba parallelise the outer loop across threads via prange.
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    # nj[j] = 2*pi*j/size: angular step for each sample index j
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    # prange: iterations are independent, so rows run in parallel
    for i in numba.prange(size):
        # Row i of the DFT matrix applied to x + i*y:
        # real part cos*x - sin*y, imaginary part sin*x + cos*y
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):
    """Perform the naive DFT on OpenCL device number `Device`.

    a_np, b_np: float32 vectors holding the real and imaginary parts.
    Returns (A_ocl, B_ocl): float32 vectors with the transform computed
    on the device. Exits the process when the requested device does not
    exist or the OpenCL context cannot be created. Prints timing for
    every stage (transfer, build, allocation, execution).
    """

    # Select the device whose linear enumeration index matches Device
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        # Id now holds the total number of enumerated devices
        # (the original printed Id-1, an off-by-one)
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except Exception:
        # Nothing below can work without a context: abort instead of
        # falling through to a NameError (original swallowed the error)
        print("Crash during context creation")
        sys.exit()

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    # Restart the timer: the original reused the previous TimeIn here,
    # so the device-allocation timing included the host allocation
    TimeIn=time.time()
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Extract the compiled kernel object once
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Launch one work-item per output sample and wait for completion
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)
# CUDA complete operation
def CUDADFT(a_np,b_np,Device,Threads):
    """Perform the naive DFT with PyCUDA on GPU number `Device`.

    a_np, b_np: float32 vectors holding the real and imaginary parts.
    Threads: CUDA block size; when the vector size is not a multiple of
    Threads the kernel falls back to one thread per block.
    Returns (A_np, B_np): float32 vectors with the transform.
    Exits the process when PyCUDA is not available.
    """
    try:
        # For PyCUDA import (the original imported the same modules
        # twice under two aliases; a single import is enough)
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule

        drv.init()
        for Id in range(drv.Device.count()):
            if Id==Device:
                XPU=drv.Device(Id)
                print("GPU selected %s" % XPU.name())

    except ImportError:
        # Without PyCUDA nothing below can run: abort instead of
        # falling through to a NameError on XPU
        print("Platform does not seem to support CUDA")
        sys.exit()

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  uint size = gridDim.x*blockDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        # Argument order fixed: it is Size that is not a multiple of
        # Threads (the original printed them swapped)
        print("Impossible : %i not multiple of %i..." % (Size,Threads) )
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    # Release the context created above
    Context.pop()
    Context.detach()

    return(A_np,B_np)
217 274 equemene
import sys
218 274 equemene
import time
219 274 equemene
220 274 equemene
if __name__=='__main__':

    # Default settings, overridable from the command line below
    SIZE=1024
    Device=0
    NaiveMethod=False
    NumpyMethod=False
    NumbaMethod=False
    OpenCLMethod=True
    CUDAMethod=False
    Threads=1

    import getopt

    HowToUse='%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <threads>'

    try:
        # "threads=" added: --threads was dispatched below but missing
        # from the long-options list, so it raised GetoptError
        opts, args = getopt.getopt(sys.argv[1:],"nyaochs:d:t:",["size=","device=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import (best effort: listing only)
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import (best effort: listing only)
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            Threads = int(arg)
        # Flag options: plain equality instead of the original
        # `opt in ("-n")` substring test on a bare string
        elif opt == '-n':
            NaiveMethod=True
        elif opt == '-y':
            NumpyMethod=True
        elif opt == '-a':
            NumbaMethod=True
        elif opt == '-o':
            OpenCLMethod=True
        elif opt == '-c':
            CUDAMethod=True

    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod )
    print("DFT Numpy computation %s " % NumpyMethod )
    print("DFT Numba computation %s " % NumbaMethod )
    print("DFT OpenCL computation %s " % OpenCLMethod )
    print("DFT CUDA computation %s " % CUDAMethod )

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                    # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")


    # Input signal: constant complex vector 1+1j at every sample
    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    # Expected DFT of a constant signal: all energy in bin 0
    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # Native & Naive Implementation
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn=time.time()
        c_np,d_np=MyDFT(a_np,b_np)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ",np.linalg.norm(c_np-C_np),
              np.linalg.norm(d_np-D_np))

    # Native & Numpy Implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn=time.time()
        e_np,f_np=NumpyDFT(a_np,b_np)
        NumpyElapsed=time.time()-TimeIn
        NumpyRate=int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ",np.linalg.norm(e_np-C_np),
              np.linalg.norm(f_np-D_np))

    # Native & Numba Implementation
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn=time.time()
        g_np,h_np=NumbaDFT(a_np,b_np)
        NumbaElapsed=time.time()-TimeIn
        NumbaRate=int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ",np.linalg.norm(g_np-C_np),
              np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))