Statistiques
| Révision :

root / ETSN / MyDFT_5.py @ 277

Historique | Voir | Annoter | Télécharger (7,22 ko)

1 271 equemene
#!/usr/bin/env python3
2 271 equemene
3 271 equemene
import numpy as np
4 271 equemene
import pyopencl as cl
5 271 equemene
from numpy import pi,cos,sin
6 271 equemene
7 271 equemene
# Naive Discrete Fourier Transform
8 271 equemene
def MyDFT(x,y):
9 271 equemene
    size=x.shape[0]
10 271 equemene
    X=np.zeros(size).astype(np.float32)
11 271 equemene
    Y=np.zeros(size).astype(np.float32)
12 271 equemene
    for i in range(size):
13 271 equemene
        for j in range(size):
14 271 equemene
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
15 271 equemene
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
16 271 equemene
    return(X,Y)
17 271 equemene
18 271 equemene
# Numpy Discrete Fourier Transform
19 271 equemene
def NumpyDFT(x,y):
20 271 equemene
    size=x.shape[0]
21 271 equemene
    X=np.zeros(size).astype(np.float32)
22 271 equemene
    Y=np.zeros(size).astype(np.float32)
23 271 equemene
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
24 271 equemene
    for i in range(size):
25 271 equemene
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
26 271 equemene
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
27 271 equemene
    return(X,Y)
28 271 equemene
29 271 equemene
# Numba Discrete Fourier Transform
30 271 equemene
import numba
31 271 equemene
@numba.njit(parallel=True)
32 271 equemene
def NumbaDFT(x,y):
33 271 equemene
    size=x.shape[0]
34 271 equemene
    X=np.zeros(size).astype(np.float32)
35 271 equemene
    Y=np.zeros(size).astype(np.float32)
36 271 equemene
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
37 271 equemene
    for i in numba.prange(size):
38 271 equemene
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
39 271 equemene
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
40 271 equemene
    return(X,Y)
41 271 equemene
42 271 equemene
# OpenCL complete operation
43 271 equemene
def OpenCLDFT(a_np,b_np):
44 271 equemene
45 271 equemene
    # Context creation
46 271 equemene
    ctx = cl.create_some_context()
47 271 equemene
    # Every process is stored in a queue
48 271 equemene
    queue = cl.CommandQueue(ctx)
49 271 equemene
50 271 equemene
    TimeIn=time.time()
51 271 equemene
    # Copy from Host to Device using pointers
52 271 equemene
    mf = cl.mem_flags
53 271 equemene
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
54 271 equemene
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
55 271 equemene
    Elapsed=time.time()-TimeIn
56 271 equemene
    print("Copy from Host 2 Device : %.3f" % Elapsed)
57 271 equemene
58 271 equemene
    TimeIn=time.time()
59 271 equemene
    # Definition of kernel under OpenCL
60 271 equemene
    prg = cl.Program(ctx, """
61 271 equemene

62 271 equemene
#define PI 3.141592653589793
63 271 equemene

64 271 equemene
__kernel void MyDFT(
65 271 equemene
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
66 271 equemene
{
67 271 equemene
  int gid = get_global_id(0);
68 271 equemene
  uint size = get_global_size(0);
69 271 equemene
  float A=0.,B=0.;
70 271 equemene
  for (uint i=0; i<size;i++)
71 271 equemene
  {
72 271 equemene
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
73 271 equemene
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
74 271 equemene
  }
75 271 equemene
  A_g[gid]=A;
76 271 equemene
  B_g[gid]=B;
77 271 equemene
}
78 271 equemene
""").build()
79 271 equemene
    Elapsed=time.time()-TimeIn
80 271 equemene
    print("Building kernels : %.3f" % Elapsed)
81 271 equemene
82 271 equemene
    TimeIn=time.time()
83 271 equemene
    # Memory allocation on Device for result
84 271 equemene
    A_ocl = np.empty_like(a_np)
85 271 equemene
    B_ocl = np.empty_like(a_np)
86 271 equemene
    Elapsed=time.time()-TimeIn
87 271 equemene
    print("Allocation on Host for results : %.3f" % Elapsed)
88 271 equemene
89 271 equemene
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
90 271 equemene
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
91 271 equemene
    Elapsed=time.time()-TimeIn
92 271 equemene
    print("Allocation on Device for results : %.3f" % Elapsed)
93 271 equemene
94 271 equemene
    TimeIn=time.time()
95 271 equemene
    # Synthesis of function "sillysum" inside Kernel Sources
96 271 equemene
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
97 271 equemene
    Elapsed=time.time()-TimeIn
98 271 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
99 271 equemene
100 271 equemene
    TimeIn=time.time()
101 271 equemene
    # Call of kernel previously defined
102 271 equemene
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
103 271 equemene
    #
104 271 equemene
    CallCL.wait()
105 271 equemene
    Elapsed=time.time()-TimeIn
106 271 equemene
    print("Execution of kernel : %.3f" % Elapsed)
107 271 equemene
108 271 equemene
    TimeIn=time.time()
109 271 equemene
    # Copy from Device to Host
110 271 equemene
    cl.enqueue_copy(queue, A_ocl, A_g)
111 271 equemene
    cl.enqueue_copy(queue, B_ocl, B_g)
112 271 equemene
    Elapsed=time.time()-TimeIn
113 271 equemene
    print("Copy from Device 2 Host : %.3f" % Elapsed)
114 271 equemene
115 275 equemene
    # Liberation of memory
116 275 equemene
    a_g.release()
117 275 equemene
    b_g.release()
118 275 equemene
    A_g.release()
119 275 equemene
    B_g.release()
120 275 equemene
121 271 equemene
    return(A_ocl,B_ocl)
122 271 equemene
123 271 equemene
# CUDA Silly complete operation
124 271 equemene
def CUDADFT(a_np,b_np):
125 271 equemene
    import pycuda.autoinit
126 271 equemene
    import pycuda.driver as drv
127 271 equemene
    import numpy
128 271 equemene
129 271 equemene
    from pycuda.compiler import SourceModule
130 271 equemene
    TimeIn=time.time()
131 271 equemene
    mod = SourceModule("""
132 271 equemene

133 271 equemene
#define PI 3.141592653589793
134 271 equemene

135 271 equemene
__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
136 271 equemene
{
137 271 equemene
  const int gid = blockIdx.x;
138 271 equemene
  uint size = gridDim.x;
139 271 equemene
  float A=0.,B=0.;
140 271 equemene
  for (uint i=0; i<size;i++)
141 271 equemene
  {
142 271 equemene
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
143 271 equemene
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
144 271 equemene
  }
145 271 equemene
  A_g[gid]=A;
146 271 equemene
  B_g[gid]=B;
147 271 equemene
}
148 271 equemene

149 271 equemene
""")
150 271 equemene
    Elapsed=time.time()-TimeIn
151 271 equemene
    print("Definition of kernel : %.3f" % Elapsed)
152 271 equemene
153 271 equemene
    TimeIn=time.time()
154 271 equemene
    MyDFT = mod.get_function("MyDFT")
155 271 equemene
    Elapsed=time.time()-TimeIn
156 271 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
157 271 equemene
158 271 equemene
    TimeIn=time.time()
159 271 equemene
    A_np = numpy.zeros_like(a_np)
160 271 equemene
    B_np = numpy.zeros_like(a_np)
161 271 equemene
    Elapsed=time.time()-TimeIn
162 271 equemene
    print("Allocation on Host for results : %.3f" % Elapsed)
163 271 equemene
164 271 equemene
    TimeIn=time.time()
165 271 equemene
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
166 271 equemene
          block=(1,1,1), grid=(a_np.size,1))
167 271 equemene
    Elapsed=time.time()-TimeIn
168 271 equemene
    print("Execution of kernel : %.3f" % Elapsed)
169 271 equemene
    return(A_np,B_np)
170 271 equemene
171 271 equemene
import sys
172 271 equemene
import time
173 271 equemene
174 271 equemene
if __name__=='__main__':
175 271 equemene
176 271 equemene
    # Size of input vectors definition based on stdin
177 271 equemene
    import sys
178 271 equemene
    try:
179 271 equemene
        SIZE=int(sys.argv[1])
180 271 equemene
        print("Size of vectors set to %i" % SIZE)
181 271 equemene
    except:
182 271 equemene
        SIZE=256
183 271 equemene
        print("Size of vectors set to default size %i" % SIZE)
184 271 equemene
185 271 equemene
    a_np = np.ones(SIZE).astype(np.float32)
186 271 equemene
    b_np = np.ones(SIZE).astype(np.float32)
187 271 equemene
188 271 equemene
    C_np = np.zeros(SIZE).astype(np.float32)
189 271 equemene
    D_np = np.zeros(SIZE).astype(np.float32)
190 271 equemene
    C_np[0] = np.float32(SIZE)
191 271 equemene
    D_np[0] = np.float32(SIZE)
192 271 equemene
193 272 equemene
    # # Native & Naive Implementation
194 272 equemene
    # print("Performing naive implementation")
195 272 equemene
    # TimeIn=time.time()
196 272 equemene
    # c_np,d_np=MyDFT(a_np,b_np)
197 272 equemene
    # NativeElapsed=time.time()-TimeIn
198 272 equemene
    # NativeRate=int(SIZE/NativeElapsed)
199 272 equemene
    # print("NativeRate: %i" % NativeRate)
200 272 equemene
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))
201 271 equemene
202 271 equemene
    # Native & Numpy Implementation
203 271 equemene
    print("Performing Numpy implementation")
204 271 equemene
    TimeIn=time.time()
205 271 equemene
    e_np,f_np=NumpyDFT(a_np,b_np)
206 271 equemene
    NumpyElapsed=time.time()-TimeIn
207 271 equemene
    NumpyRate=int(SIZE/NumpyElapsed)
208 271 equemene
    print("NumpyRate: %i" % NumpyRate)
209 271 equemene
    print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))
210 271 equemene
211 271 equemene
    # Native & Numba Implementation
212 271 equemene
    print("Performing Numba implementation")
213 271 equemene
    TimeIn=time.time()
214 271 equemene
    g_np,h_np=NumbaDFT(a_np,b_np)
215 271 equemene
    NumbaElapsed=time.time()-TimeIn
216 271 equemene
    NumbaRate=int(SIZE/NumbaElapsed)
217 271 equemene
    print("NumbaRate: %i" % NumbaRate)
218 271 equemene
    print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))
219 271 equemene
220 271 equemene
    # OpenCL Implementation
221 273 equemene
    print("Performing OpenCL implementation")
222 271 equemene
    TimeIn=time.time()
223 271 equemene
    i_np,j_np=OpenCLDFT(a_np,b_np)
224 271 equemene
    OpenCLElapsed=time.time()-TimeIn
225 271 equemene
    OpenCLRate=int(SIZE/OpenCLElapsed)
226 271 equemene
    print("OpenCLRate: %i" % OpenCLRate)
227 271 equemene
    print("Precision: ",np.linalg.norm(i_np-C_np),np.linalg.norm(j_np-D_np))
228 271 equemene
229 271 equemene
    # CUDA Implementation
230 273 equemene
    print("Performing CUDA implementation")
231 271 equemene
    TimeIn=time.time()
232 271 equemene
    k_np,l_np=CUDADFT(a_np,b_np)
233 271 equemene
    CUDAElapsed=time.time()-TimeIn
234 271 equemene
    CUDARate=int(SIZE/CUDAElapsed)
235 271 equemene
    print("CUDARate: %i" % CUDARate)
236 271 equemene
    print("Precision: ",np.linalg.norm(k_np-C_np),np.linalg.norm(l_np-D_np))