Statistiques
| Révision :

root / ETSN / MyDFT_9.py @ 274

Historique | Voir | Annoter | Télécharger (12,47 ko)

#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin
# Naive Discrete Fourier Transform
def MyDFT(x,y):
    """Naive O(n^2) DFT of the complex vector (x + i*y).

    x, y : float32 vectors with the real and imaginary parts of the input.
    Returns (X, Y) : float32 vectors of the same length, using the
    exp(+i*theta) sign convention (X_k = sum_j x_j*cos - y_j*sin, ...).
    """
    n = x.shape[0]
    X = np.zeros(n).astype(np.float32)
    Y = np.zeros(n).astype(np.float32)
    for k in range(n):
        for j in range(n):
            # Same angle for all four trig terms of this (k, j) pair.
            angle = 2.*pi*k*j/n
            c = cos(angle)
            s = sin(angle)
            X[k] = X[k] + x[j]*c - y[j]*s
            Y[k] = Y[k] + x[j]*s + y[j]*c
    return (X, Y)
# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    """Row-vectorized DFT of the complex vector (x + i*y).

    One NumPy-vectorized inner sum per output sample; same exp(+i*theta)
    sign convention as MyDFT. Returns (X, Y) as float32 vectors.
    """
    n = x.shape[0]
    X = np.zeros(n).astype(np.float32)
    Y = np.zeros(n).astype(np.float32)
    # Angular step per input index j: 2*pi*j/n, kept in float32.
    base = np.multiply(2.0*np.pi/n, np.arange(n)).astype(np.float32)
    for k in range(n):
        c = np.cos(k*base)
        s = np.sin(k*base)
        X[k] = np.sum(c*x - s*y)
        Y[k] = np.sum(s*x + c*y)
    return (X, Y)
# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    """Numba-compiled DFT of (x + i*y); output rows computed in parallel.

    Same exp(+i*theta) convention and float32 results as MyDFT/NumpyDFT.
    """
    n = x.shape[0]
    X = np.zeros(n).astype(np.float32)
    Y = np.zeros(n).astype(np.float32)
    # Angular step per input index j: 2*pi*j/n, kept in float32.
    base = np.multiply(2.0*np.pi/n, np.arange(n)).astype(np.float32)
    # prange lets Numba distribute the independent output rows over threads.
    for k in numba.prange(n):
        c = np.cos(k*base)
        s = np.sin(k*base)
        X[k] = np.sum(np.subtract(np.multiply(c, x), np.multiply(s, y)))
        Y[k] = np.sum(np.add(np.multiply(s, x), np.multiply(c, y)))
    return (X, Y)
# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):
    """DFT of (a_np + i*b_np) computed by a naive OpenCL kernel.

    a_np, b_np : float32 vectors (real and imaginary parts of the input).
    Device     : flat index of the OpenCL device to use; all devices of all
                 platforms are enumerated in order.
    Returns (A, B) : float32 vectors with the real and imaginary parts of
    the transform, same exp(+i*theta) convention as MyDFT.
    Exits the process if the device does not exist or no context can be made.
    """
    # Walk every device of every platform; the flat counter Id must reach
    # the requested Device index for a selection to happen.
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if not HasXPU:
        # BUGFIX: Id is already the total number of devices seen, so report
        # Id rather than Id-1.
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except Exception:
        # BUGFIX: the original printed and fell through, then crashed later
        # on the undefined ctx/queue; abort cleanly instead.
        print("Crash during context creation")
        sys.exit()

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Kernel: each work-item gid computes one output frequency by summing
    # over the whole input vector (O(n) work per item, O(n^2) total).
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Host-side arrays that will receive the results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    # Device-side output buffers
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Bind the MyDFT kernel defined in the program above
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # One work-item per output sample; None lets the implementation pick
    # the work-group size.
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    # Block until the kernel has finished before timing and reading back
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release device buffers explicitly rather than waiting for GC
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)
# CUDA complete operation
def CUDADFT(a_np,b_np,Device,Threads):
    """DFT of (a_np + i*b_np) computed by a naive CUDA kernel.

    a_np, b_np : float32 vectors (real and imaginary parts of the input).
    Device     : CUDA device index.
    Threads    : threads per block; if it does not divide a_np.size the
                 kernel falls back to one thread per block.
    Returns (A, B) : float32 vectors of the transform.
    Exits the process if PyCUDA or the requested device is unavailable.

    BUGFIX: the parameter was spelled 'THreads' while the body used
    'Threads', raising NameError on every call; callers pass it
    positionally, so the rename is safe.
    """
    try:
        # For PyCUDA import (single consolidated import instead of the
        # original duplicated drv/cuda pair)
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule
    except ImportError:
        # BUGFIX: abort instead of falling through to an undefined XPU
        print("Platform does not seem to support CUDA")
        sys.exit()

    drv.init()
    HasXPU=False
    for Id in range(drv.Device.count()):
        if Id==Device:
            XPU=drv.Device(Id)
            print("GPU selected %s" % XPU.name())
            HasXPU=True

    if not HasXPU:
        # Guard added for consistency with OpenCLDFT: the requested index
        # does not exist among the detected devices.
        print("No GPU #%i found in all of %i devices, sorry..." % (Device,drv.Device.count()))
        sys.exit()

    Context=XPU.make_context()

    TimeIn=time.time()
    # BUGFIX: the kernel now derives its global index and total size from
    # block AND thread coordinates; the original used blockIdx.x/gridDim.x
    # only, which produced wrong results whenever Threads > 1.
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x*blockDim.x+threadIdx.x;
  uint size = gridDim.x*blockDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Host-side arrays that will receive the results
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        # BUGFIX: format arguments were swapped (the message read
        # "<Threads> not multiple of <Size>").
        print("Impossible : %i not multiple of %i..." % (Size,Threads) )
        # Fallback launch: one thread per block, one block per sample
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    # Tear down the context created above
    Context.pop()
    Context.detach()

    return(A_np,B_np)
import sys
import time

if __name__=='__main__':

    # Default settings; all overridable from the command line below.
    SIZE=1024
    Device=0
    NaiveMethod=False
    NumpyMethod=True
    NumbaMethod=False
    OpenCLMethod=True
    CUDAMethod=False
    Threads=1

    import getopt

    HowToUse='%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <threads>'

    try:
        # BUGFIX: '--threads' was handled in the option loop below but
        # missing from the long-options list, so getopt rejected it.
        opts, args = getopt.getopt(sys.argv[1:],"nyaochs:d:t:",["size=","device=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import: best-effort device listing for the help
            # screen; missing OpenCL is reported, not fatal.
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import: same best-effort listing for CUDA
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                # BUGFIX: bare 'print' (Python 2 leftover) printed nothing
                print()
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            Threads = int(arg)
        elif opt in ("-n"):
            NaiveMethod=True
        elif opt in ("-y"):
            NumpyMethod=True
        elif opt in ("-a"):
            NumbaMethod=True
        elif opt in ("-o"):
            OpenCLMethod=True
        elif opt in ("-c"):
            CUDAMethod=True

    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod )
    print("DFT Numpy computation %s " % NumpyMethod )
    print("DFT Numba computation %s " % NumbaMethod )
    print("DFT OpenCL computation %s " % OpenCLMethod )
    print("DFT CUDA computation %s " % CUDAMethod )

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                # NOTE(review): Devices is never populated, so this branch
                # currently never fires — kept for compatibility.
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    # Set the Alu as detected Device Type (see NOTE above:
                    # Devices is always empty today)
                    if Id in Devices:
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")


    # Test input: the constant complex vector (1 + 1j) everywhere...
    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    # ...whose exact DFT is SIZE*(1 + 1j) at frequency 0 and zero elsewhere;
    # used below as the reference for the precision norms.
    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # Native & Naive Implementation
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn=time.time()
        c_np,d_np=MyDFT(a_np,b_np)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ",np.linalg.norm(c_np-C_np),
              np.linalg.norm(d_np-D_np))

    # Native & Numpy Implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn=time.time()
        e_np,f_np=NumpyDFT(a_np,b_np)
        NumpyElapsed=time.time()-TimeIn
        NumpyRate=int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ",np.linalg.norm(e_np-C_np),
              np.linalg.norm(f_np-D_np))

    # Native & Numba Implementation (first call includes JIT compilation)
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn=time.time()
        g_np,h_np=NumbaDFT(a_np,b_np)
        NumbaElapsed=time.time()-TimeIn
        NumbaRate=int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ",np.linalg.norm(g_np-C_np),
              np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))