root / ETSN / MyDFT_8.py

#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)+y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]-x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)

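# Added sketch (not in the original revision): with z = x + i*y, the pair
# (X,Y) computed above matches the real and imaginary parts of NumPy's
# forward FFT, so the naive loop can be cross-checked against np.fft.fft.
# The helper name CheckDFT is hypothetical; tolerances are loose to allow
# float32 accumulation error.
def CheckDFT(x,y):
    z=np.fft.fft(x+1j*y)
    X,Y=MyDFT(x,y)
    return (np.allclose(X,z.real,rtol=1e-3,atol=1e-1) and
            np.allclose(Y,z.imag,rtol=1e-3,atol=1e-1))
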
# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.add(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),y),np.multiply(np.sin(i*nj),x)))
    return(X,Y)

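# Added sketch (assumption, not in the original revision): the remaining
# Python loop can also be removed by materialising the full angle matrix
# with np.outer, trading O(size^2) memory for a single pair of mat-vec
# products. The helper name NumpyDFTMatrix is hypothetical.
def NumpyDFTMatrix(x,y):
    size=x.shape[0]
    angles=(2.0*np.pi/size)*np.outer(np.arange(size),np.arange(size))
    C=np.cos(angles).astype(np.float32)
    S=np.sin(angles).astype(np.float32)
    return((C@x+S@y).astype(np.float32),(C@y-S@x).astype(np.float32))
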
# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.add(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),y),np.multiply(np.sin(i*nj),x)))
    return(X,Y)

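# Added note: the first NumbaDFT call pays the JIT compilation cost, so any
# timing should discard it. A warm-up call on a tiny input (sketch):
#   _=NumbaDFT(np.ones(8).astype(np.float32),np.ones(8).astype(np.float32))
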
# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")
        sys.exit()

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for results
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    # Wait for end of execution
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)

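# Added note: the command queue above is created with PROFILING_ENABLE but the
# event timestamps are never read; device-side kernel time could be taken from
# the event itself, e.g. (sketch):
#   elapsed=1e-9*(CallCL.profile.end-CallCL.profile.start)
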
# CUDA complete operation
def CUDADFT(a_np,b_np,Device):
    # import pycuda.autoinit
    try:
        # For PyCUDA import
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule

        drv.init()
        for Id in range(drv.Device.count()):
            if Id==Device:
                XPU=drv.Device(Id)
                print("GPU selected %s" % XPU.name())

    except ImportError:
        print("Platform does not seem to support CUDA")
        sys.exit()

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=-a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)
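
# Added note: block=(1,1,1) launches a single thread per block, so each output
# bin occupies a whole block on its own; a multithreaded variant would compute
# gid as blockIdx.x*blockDim.x+threadIdx.x and pass the vector length as an
# explicit kernel argument instead of reading gridDim.x.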
if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:",["gpustyle=","size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.strip(),deviceType,device.name.strip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    # Expected result: the DFT of a constant input puts all energy in bin 0
    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # # Native & Naive Implementation
318 274 equemene
    # print("Performing naive implementation")
319 274 equemene
    # TimeIn=time.time()
320 274 equemene
    # c_np,d_np=MyDFT(a_np,b_np)
321 274 equemene
    # NativeElapsed=time.time()-TimeIn
322 274 equemene
    # NativeRate=int(SIZE/NativeElapsed)
323 274 equemene
    # print("NativeRate: %i" % NativeRate)
324 274 equemene
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))
325 274 equemene
326 274 equemene
    # # Native & Numpy Implementation
327 274 equemene
    # print("Performing Numpy implementation")
328 274 equemene
    # TimeIn=time.time()
329 274 equemene
    # e_np,f_np=NumpyDFT(a_np,b_np)
330 274 equemene
    # NumpyElapsed=time.time()-TimeIn
331 274 equemene
    # NumpyRate=int(SIZE/NumpyElapsed)
332 274 equemene
    # print("NumpyRate: %i" % NumpyRate)
333 274 equemene
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))
334 274 equemene
335 274 equemene
    # # Native & Numba Implementation
336 274 equemene
    # print("Performing Numba implementation")
337 274 equemene
    # TimeIn=time.time()
338 274 equemene
    # g_np,h_np=NumbaDFT(a_np,b_np)
339 274 equemene
    # NumbaElapsed=time.time()-TimeIn
340 274 equemene
    # NumbaRate=int(SIZE/NumbaElapsed)
341 274 equemene
    # print("NumbaRate: %i" % NumbaRate)
342 274 equemene
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))
343 274 equemene
344 274 equemene
    # OpenCL Implementation
    if GpuStyle=='OpenCL':
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if GpuStyle=='CUDA':
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))
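
# Usage examples (added; flags as parsed by getopt above):
#   python3 MyDFT_8.py -h
#   python3 MyDFT_8.py -g OpenCL -s 1024 -d 0
#   python3 MyDFT_8.py -g CUDA -s 1024 -d 0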