root / ETSN / MyDFT_7.py @ 275

#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)
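
# Added sanity-check sketch (not called by the benchmark below): every kernel
# in this file uses the e^{+2i.pi.jk/N} sign convention, so (X,Y) matches an
# unnormalised inverse DFT and can be validated against numpy as
# size*np.fft.ifft(x+1j*y). The default tolerance is loose because the
# kernels accumulate in float32; it may need loosening for large sizes.
def CheckDFT(X,Y,x,y,atol=1e-2):
    reference=x.shape[0]*np.fft.ifft(x+1j*y)
    return(np.allclose(X,reference.real,atol=atol) and
           np.allclose(Y,reference.imag,atol=atol))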

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
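
# Added vectorised variant (a sketch, not used by the benchmark below): the
# whole angle table is built once with an outer product, so both loops
# collapse into matrix-vector products. Memory cost is O(size^2) floats,
# which limits it to moderate sizes.
def NumpyDFTMatrix(x,y):
    size=x.shape[0]
    nij=np.multiply(2.0*np.pi/size,np.outer(np.arange(size),np.arange(size))).astype(np.float32)
    C,S=np.cos(nij),np.sin(nij)
    X=np.subtract(np.dot(C,x),np.dot(S,y)).astype(np.float32)
    Y=np.add(np.dot(S,x),np.dot(C,y)).astype(np.float32)
    return(X,Y)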

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
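
# Added note: the first call to NumbaDFT includes JIT compilation, so timing
# a single call mixes compile and compute time. A hedged warm-up sketch, to
# run once before benchmarking if steady-state rates are wanted:
def WarmUpNumbaDFT():
    small=np.ones(8).astype(np.float32)
    NumbaDFT(small,small)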

# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        # Without a context there is nothing to run on: abort instead of
        # falling through with ctx and queue undefined
        print("Crash during context creation")
        sys.exit(1)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for results
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    # Wait for the kernel to complete before stopping the clock
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

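    # The queue was created with PROFILING_ENABLE, so the event returned by
    # the launch also carries device-side timestamps. A hedged reading sketch
    # (times in nanoseconds), kept commented to leave the output unchanged:
    # print("Device-side execution : %.3f" % (1.e-9*(CallCL.profile.end-CallCL.profile.start)))
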
    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)

# CUDA complete operation
def CUDADFT(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)
    return(A_np,B_np)
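
# Note on the launch above (added): block=(1,1,1) runs one thread per block
# and the kernel derives both gid (blockIdx.x) and size (gridDim.x) from the
# grid. A hedged multi-thread sketch, assuming SIZE is a multiple of 32,
# would compute in the kernel
#   gid  = blockIdx.x*blockDim.x+threadIdx.x
#   size = gridDim.x*blockDim.x
# and launch with:
#   MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
#         block=(32,1,1), grid=(a_np.size//32,1))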
if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:",["gpustyle=","size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    # Expected DFT of the constant (1+1j) vector: only the DC term is non zero
    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # # Native & Naive Implementation
    # print("Performing naive implementation")
    # TimeIn=time.time()
    # c_np,d_np=MyDFT(a_np,b_np)
    # NativeElapsed=time.time()-TimeIn
    # NativeRate=int(SIZE/NativeElapsed)
    # print("NativeRate: %i" % NativeRate)
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))

    # # Native & Numpy Implementation
    # print("Performing Numpy implementation")
    # TimeIn=time.time()
    # e_np,f_np=NumpyDFT(a_np,b_np)
    # NumpyElapsed=time.time()-TimeIn
    # NumpyRate=int(SIZE/NumpyElapsed)
    # print("NumpyRate: %i" % NumpyRate)
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))

    # # Native & Numba Implementation
    # print("Performing Numba implementation")
    # TimeIn=time.time()
    # g_np,h_np=NumbaDFT(a_np,b_np)
    # NumbaElapsed=time.time()-TimeIn
    # NumbaRate=int(SIZE/NumbaElapsed)
    # print("NumbaRate: %i" % NumbaRate)
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))

324 274 equemene
    if GpuStyle=='OpenCL':
325 274 equemene
        print("Performing OpenCL implementation")
326 274 equemene
        TimeIn=time.time()
327 274 equemene
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
328 274 equemene
        OpenCLElapsed=time.time()-TimeIn
329 274 equemene
        OpenCLRate=int(SIZE/OpenCLElapsed)
330 274 equemene
        print("OpenCLRate: %i" % OpenCLRate)
331 274 equemene
        print("Precision: ",np.linalg.norm(i_np-C_np),
332 274 equemene
              np.linalg.norm(j_np-D_np))
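        # Optional cross-check against a numpy reference (added sketch, uses
        # the CheckDFT helper defined above; commented out to keep the
        # original benchmark output unchanged):
        # print("Consistent with numpy:",CheckDFT(i_np,j_np,a_np,b_np))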
    # # CUDA Implementation
    # print("Performing CUDA implementation")
    # TimeIn=time.time()
    # k_np,l_np=CUDADFT(a_np,b_np)
    # CUDAElapsed=time.time()-TimeIn
    # CUDARate=int(SIZE/CUDAElapsed)
    # print("CUDARate: %i" % CUDARate)
    # print("Precision: ",np.linalg.norm(k_np-C_np),np.linalg.norm(l_np-D_np))