root / ETSN / MyDFT_8.py @ 274

#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Naive Discrete Fourier Transform
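# Convention note: for k=0..N-1 these routines evaluate
#   X_k = sum_j x_j*cos(2*pi*k*j/N) - y_j*sin(2*pi*k*j/N)
#   Y_k = sum_j x_j*sin(2*pi*k*j/N) + y_j*cos(2*pi*k*j/N)
# i.e. X+iY = sum_j (x_j+i*y_j)*exp(+2i*pi*k*j/N): a DFT with the e^{+i}
# sign convention (N times numpy.fft.ifft), computed in O(N^2) operations.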
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
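
# Reference implementation (added sketch, not part of the original revision):
# with the e^{+i} convention above, X+iY equals SIZE*np.fft.ifft(x+1j*y),
# so NumPy's FFT provides an O(N*log N) ground truth for the kernels below.
def NumpyFFT(x,y):
    size=x.shape[0]
    result=np.multiply(size,np.fft.ifft(x+1j*y))
    return(result.real.astype(np.float32),result.imag.astype(np.float32))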

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)
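
# Note: the first NumbaDFT call includes JIT compilation time; for a fair
# benchmark, issue a warm-up call first, e.g.
#   NumbaDFT(np.zeros(4,dtype=np.float32),np.zeros(4,dtype=np.float32))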

# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    # Walk all OpenCL platforms and devices to find the requested Device id
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if not HasXPU:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except Exception:
        print("Crash during context creation")
        sys.exit()

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host to Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  // Each work-item computes one output frequency bin gid;
  // the global size is the vector length
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for results
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined: one work-item per output element
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device to Host : %.3f" % Elapsed)

    # Release device buffers
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)
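
# Note: the command queue above enables profiling, but the event timing is
# never read back; CallCL.profile.end-CallCL.profile.start (in nanoseconds)
# would report the kernel execution time without host-side overhead.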

# CUDA complete operation
def CUDADFT(a_np,b_np,Device):
    try:
        # For PyCUDA import
        import pycuda.driver as drv
        from pycuda.compiler import SourceModule

        drv.init()
        for Id in range(drv.Device.count()):
            if Id==Device:
                XPU=drv.Device(Id)
                print("GPU selected %s" % XPU.name())
        print()

    except ImportError:
        print("Platform does not seem to support CUDA")
        sys.exit()

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  // One block per output frequency bin: gid indexes the bin,
  // gridDim.x is the vector length
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # drv.In/drv.Out wrap the NumPy arrays and handle the transfers around the call
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)
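
# Note: block=(1,1,1) launches one thread per block, so each frequency bin is
# computed by a single thread and intra-block parallelism goes unused; mapping
# bins to threads of larger blocks (e.g. block=(256,1,1) with
# gid=blockIdx.x*blockDim.x+threadIdx.x and an explicit size argument) is the
# usual next optimization step.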
if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'
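
    # Example invocations:
    #   python3 MyDFT_8.py -g OpenCL -s 4096 -d 0
    #   python3 MyDFT_8.py -g CUDA -s 4096 -d 0
    #   python3 MyDFT_8.py -h   (lists devices detected under OpenCL and CUDA)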

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:",["gpustyle=","size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.strip(),deviceType,device.name.strip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")
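
    # Note: nothing in this revision appends to Devices, so the Alu type tags
    # above are never set and both containers stay empty.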

    # Input signal: constant 1+1j; its transform is SIZE*(1+1j) in bin 0 and
    # zero elsewhere, stored in C_np/D_np as the expected result
    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # # Native & Naive Implementation
    # print("Performing naive implementation")
    # TimeIn=time.time()
    # c_np,d_np=MyDFT(a_np,b_np)
    # NativeElapsed=time.time()-TimeIn
    # NativeRate=int(SIZE/NativeElapsed)
    # print("NativeRate: %i" % NativeRate)
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))

    # # Native & Numpy Implementation
    # print("Performing Numpy implementation")
    # TimeIn=time.time()
    # e_np,f_np=NumpyDFT(a_np,b_np)
    # NumpyElapsed=time.time()-TimeIn
    # NumpyRate=int(SIZE/NumpyElapsed)
    # print("NumpyRate: %i" % NumpyRate)
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))

    # # Native & Numba Implementation
    # print("Performing Numba implementation")
    # TimeIn=time.time()
    # g_np,h_np=NumbaDFT(a_np,b_np)
    # NumbaElapsed=time.time()-TimeIn
    # NumbaRate=int(SIZE/NumbaElapsed)
    # print("NumbaRate: %i" % NumbaRate)
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if GpuStyle=='OpenCL':
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if GpuStyle=='CUDA':
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))
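
    # Note: the reported Rate is SIZE/Elapsed, i.e. transformed elements per
    # second over the whole pipeline (transfers, build, execution), not a
    # FLOP/s figure; since the DFT costs O(SIZE^2) operations, the work behind
    # each element grows with SIZE and the Rate is expected to drop accordingly.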