# Repository viewer header (export residue): root / ETSN / MyDFT_6.py @ revision 280
#!/usr/bin/env python3

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin
# Naive Discrete Fourier Transform
def MyDFT(x,y):
    """Compute the DFT of the complex signal (x + i*y) with explicit Python loops.

    Parameters:
        x, y: 1-D arrays of equal length holding the real and imaginary parts.

    Returns:
        (X, Y): float32 arrays with the real and imaginary parts of the spectrum.

    O(n^2) by design (reference implementation); the angle and its trig values
    are now computed once per inner iteration instead of four/two times.
    """
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            # Hoist the shared angle; evaluate cos/sin once each (same numerics)
            angle=2.*pi*i*j/size
            c=cos(angle)
            s=sin(angle)
            X[i]=X[i]+x[j]*c-y[j]*s
            Y[i]=Y[i]+x[j]*s+y[j]*c
    return(X,Y)
# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    """Compute the DFT of (x + i*y) using NumPy vector operations per output bin.

    Parameters:
        x, y: 1-D arrays of equal length (real and imaginary parts).

    Returns:
        (X, Y): float32 arrays with the real and imaginary spectrum parts.

    Each row's cos/sin vectors are now evaluated once instead of twice,
    halving the trig work without changing the results.
    """
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    # Base angles 2*pi*j/size for j = 0..size-1
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        cos_i=np.cos(i*nj)
        sin_i=np.sin(i*nj)
        X[i]=np.sum(cos_i*x-sin_i*y)
        Y[i]=np.sum(sin_i*x+cos_i*y)
    return(X,Y)
# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    """Compute the DFT of (x + i*y), JIT-compiled and parallelized over bins.

    Same contract as NumpyDFT: returns (X, Y) float32 arrays. The outer loop
    runs in parallel via numba.prange; the cos/sin vectors for each bin are
    now evaluated once instead of twice (identical numerics, half the trig).
    """
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    # Base angles 2*pi*j/size for j = 0..size-1
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        cos_i=np.cos(i*nj)
        sin_i=np.sin(i*nj)
        X[i]=np.sum(cos_i*x-sin_i*y)
        Y[i]=np.sum(sin_i*x+cos_i*y)
    return(X,Y)
# OpenCL complete operation
def OpenCLDFT(a_np,b_np):
    """Compute the DFT of (a_np + i*b_np) on an OpenCL device.

    Builds the kernel, copies inputs to the device, launches one work-item per
    output bin, copies results back, and prints elapsed time for each phase.

    Parameters:
        a_np, b_np: float32 1-D arrays (real and imaginary input parts).

    Returns:
        (A_ocl, B_ocl): float32 arrays with the real and imaginary spectrum.
    """

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    # FIX: reset the timer here — previously the "Allocation on Device" figure
    # reused the timestamp taken before the host allocation above.
    TimeIn=time.time()
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Retrieve the compiled "MyDFT" kernel object (reusable for repeated calls)
    knl = prg.MyDFT
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Launch one work-item per output bin; wait for completion before timing
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    #
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)
# CUDA complete operation
def CUDADFT(a_np,b_np):
    """Compute the DFT of (a_np + i*b_np) on a CUDA device via PyCUDA.

    Compiles the kernel with SourceModule, launches one block per output bin
    with a single thread each (block=(1,1,1), grid=(a_np.size,1)), and prints
    elapsed time for each phase.

    Parameters:
        a_np, b_np: float32 1-D arrays (real and imaginary input parts).

    Returns:
        (A_np, B_np): float32 arrays with the real and imaginary spectrum.
    """
    # Imports are local so the module loads on machines without PyCUDA
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    # Kernel: each block computes one bin; gridDim.x doubles as the vector size
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = numpy.zeros_like(a_np)
    B_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # drv.In/drv.Out handle the host<->device transfers around the launch
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)
    return(A_np,B_np)
import sys
import time

if __name__=='__main__':

    # Defaults: OpenCL backend, 1024-point vector, device #0
    GpuStyle='OpenCL'
    SIZE=1024
    Device=0

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:",["gpustyle=","size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            # FIX: was a bare "except:" that also swallowed SystemExit and
            # KeyboardInterrupt; narrow to Exception.
            except Exception:
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                # FIX: bare "print" was a Python 2 leftover; in Python 3 it
                # evaluated the function object and printed nothing.
                print()
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    # FIX: register the selected device; Devices was never populated, so the
    # "if Id in Devices" checks below could never record anything in Alu.
    Devices.append(Device)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                    # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")


    # Input: the constant complex vector (1 + 1j, ...)
    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    # Expected DFT of a constant vector: all energy concentrated in bin 0
    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # # Native & Naive Implementation
    # print("Performing naive implementation")
    # TimeIn=time.time()
    # c_np,d_np=MyDFT(a_np,b_np)
    # NativeElapsed=time.time()-TimeIn
    # NativeRate=int(SIZE/NativeElapsed)
    # print("NativeRate: %i" % NativeRate)
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))

    # # Native & Numpy Implementation
    # print("Performing Numpy implementation")
    # TimeIn=time.time()
    # e_np,f_np=NumpyDFT(a_np,b_np)
    # NumpyElapsed=time.time()-TimeIn
    # NumpyRate=int(SIZE/NumpyElapsed)
    # print("NumpyRate: %i" % NumpyRate)
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))

    # # Native & Numba Implementation
    # print("Performing Numba implementation")
    # TimeIn=time.time()
    # g_np,h_np=NumbaDFT(a_np,b_np)
    # NumbaElapsed=time.time()-TimeIn
    # NumbaRate=int(SIZE/NumbaElapsed)
    # print("NumbaRate: %i" % NumbaRate)
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))

    # # OpenCL Implementation
    # print("Performing OpenCL implementation")
    # TimeIn=time.time()
    # i_np,j_np=OpenCLDFT(a_np,b_np)
    # OpenCLElapsed=time.time()-TimeIn
    # OpenCLRate=int(SIZE/OpenCLElapsed)
    # print("OpenCLRate: %i" % OpenCLRate)
    # print("Precision: ",np.linalg.norm(i_np-C_np),np.linalg.norm(j_np-D_np))

    # # CUDA Implementation
    # print("Performing CUDA implementation")
    # TimeIn=time.time()
    # k_np,l_np=CUDADFT(a_np,b_np)
    # CUDAElapsed=time.time()-TimeIn
    # CUDARate=int(SIZE/CUDAElapsed)
    # print("CUDARate: %i" % CUDARate)
    # print("Precision: ",np.linalg.norm(k_np-C_np),np.linalg.norm(l_np-D_np))