Statistiques
| Révision :

root / ETSN / MyDFT_5b.py @ 275

Historique | Voir | Annoter | Télécharger (7,33 ko)

1 271 equemene
#!/usr/bin/env python3
2 271 equemene
3 271 equemene
import numpy as np
4 271 equemene
import pyopencl as cl
5 271 equemene
from numpy import pi,cos,sin
6 271 equemene
7 271 equemene
# Naive Discrete Fourier Transform
8 271 equemene
def MyDFT(x,y):
9 271 equemene
    size=x.shape[0]
10 271 equemene
    X=np.zeros(size).astype(np.float32)
11 271 equemene
    Y=np.zeros(size).astype(np.float32)
12 271 equemene
    for i in range(size):
13 271 equemene
        for j in range(size):
14 271 equemene
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
15 271 equemene
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
16 271 equemene
    return(X,Y)
17 271 equemene
18 271 equemene
# Numpy Discrete Fourier Transform
19 271 equemene
def NumpyDFT(x,y):
20 271 equemene
    size=x.shape[0]
21 271 equemene
    X=np.zeros(size).astype(np.float32)
22 271 equemene
    Y=np.zeros(size).astype(np.float32)
23 271 equemene
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
24 271 equemene
    for i in range(size):
25 271 equemene
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
26 271 equemene
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
27 271 equemene
    return(X,Y)
28 271 equemene
29 271 equemene
# Numba Discrete Fourier Transform
30 271 equemene
import numba
31 271 equemene
@numba.njit(parallel=True)
32 271 equemene
def NumbaDFT(x,y):
33 271 equemene
    size=x.shape[0]
34 271 equemene
    X=np.zeros(size).astype(np.float32)
35 271 equemene
    Y=np.zeros(size).astype(np.float32)
36 271 equemene
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
37 271 equemene
    for i in numba.prange(size):
38 271 equemene
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
39 271 equemene
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
40 271 equemene
    return(X,Y)
41 271 equemene
42 271 equemene
# OpenCL complete operation
43 271 equemene
def OpenCLDFT(a_np,b_np):
44 271 equemene
45 271 equemene
    # Context creation
46 271 equemene
    ctx = cl.create_some_context()
47 271 equemene
    # Every process is stored in a queue
48 271 equemene
    queue = cl.CommandQueue(ctx)
49 271 equemene
50 271 equemene
    TimeIn=time.time()
51 271 equemene
    # Copy from Host to Device using pointers
52 271 equemene
    mf = cl.mem_flags
53 271 equemene
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
54 271 equemene
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
55 271 equemene
    Elapsed=time.time()-TimeIn
56 271 equemene
    print("Copy from Host 2 Device : %.3f" % Elapsed)
57 271 equemene
58 271 equemene
    TimeIn=time.time()
59 271 equemene
    # Definition of kernel under OpenCL
60 271 equemene
    prg = cl.Program(ctx, """
61 271 equemene

62 271 equemene
#define PI 3.141592653589793
63 271 equemene

64 271 equemene
__kernel void MyDFT(
65 271 equemene
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
66 271 equemene
{
67 271 equemene
  int gid = get_global_id(0);
68 271 equemene
  uint size = get_global_size(0);
69 271 equemene
  float A=0.,B=0.;
70 271 equemene
  for (uint i=0; i<size;i++)
71 271 equemene
  {
72 271 equemene
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
73 271 equemene
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
74 271 equemene
  }
75 271 equemene
  A_g[gid]=A;
76 271 equemene
  B_g[gid]=B;
77 271 equemene
}
78 271 equemene
""").build()
79 271 equemene
    Elapsed=time.time()-TimeIn
80 271 equemene
    print("Building kernels : %.3f" % Elapsed)
81 271 equemene
82 271 equemene
    TimeIn=time.time()
83 271 equemene
    # Memory allocation on Device for result
84 271 equemene
    A_ocl = np.empty_like(a_np)
85 271 equemene
    B_ocl = np.empty_like(a_np)
86 271 equemene
    Elapsed=time.time()-TimeIn
87 271 equemene
    print("Allocation on Host for results : %.3f" % Elapsed)
88 271 equemene
89 271 equemene
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
90 271 equemene
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
91 271 equemene
    Elapsed=time.time()-TimeIn
92 271 equemene
    print("Allocation on Device for results : %.3f" % Elapsed)
93 271 equemene
94 271 equemene
    TimeIn=time.time()
95 271 equemene
    # Synthesis of function "sillysum" inside Kernel Sources
96 271 equemene
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
97 271 equemene
    Elapsed=time.time()-TimeIn
98 271 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
99 271 equemene
100 271 equemene
    TimeIn=time.time()
101 271 equemene
    # Call of kernel previously defined
102 271 equemene
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
103 271 equemene
    #
104 271 equemene
    CallCL.wait()
105 271 equemene
    Elapsed=time.time()-TimeIn
106 271 equemene
    print("Execution of kernel : %.3f" % Elapsed)
107 271 equemene
108 271 equemene
    TimeIn=time.time()
109 271 equemene
    # Copy from Device to Host
110 271 equemene
    cl.enqueue_copy(queue, A_ocl, A_g)
111 271 equemene
    cl.enqueue_copy(queue, B_ocl, B_g)
112 271 equemene
    Elapsed=time.time()-TimeIn
113 271 equemene
    print("Copy from Device 2 Host : %.3f" % Elapsed)
114 271 equemene
115 271 equemene
    return(A_ocl,B_ocl)
116 271 equemene
117 271 equemene
# CUDA Silly complete operation
118 271 equemene
def CUDADFT(a_np,b_np):
119 271 equemene
    import pycuda.autoinit
120 271 equemene
    import pycuda.driver as drv
121 271 equemene
    import numpy
122 271 equemene
123 271 equemene
    from pycuda.compiler import SourceModule
124 271 equemene
    TimeIn=time.time()
125 271 equemene
    mod = SourceModule("""
126 271 equemene

127 271 equemene
#define PI 3.141592653589793
128 271 equemene

129 271 equemene
__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
130 271 equemene
{
131 271 equemene
//  const int gid = blockIdx.x;
132 271 equemene
//  uint size = gridDim.x;
133 271 equemene
  const int gid = threadIdx.x+blockIdx.x*blockDim.x;
134 271 equemene
  uint size = gridDim.x*blockDim.x;
135 271 equemene
  float A=0.,B=0.;
136 271 equemene
  for (uint i=0; i<size;i++)
137 271 equemene
  {
138 271 equemene
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
139 271 equemene
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
140 271 equemene
  }
141 271 equemene
  A_g[gid]=A;
142 271 equemene
  B_g[gid]=B;
143 271 equemene
}
144 271 equemene

145 271 equemene
""")
146 271 equemene
    Elapsed=time.time()-TimeIn
147 271 equemene
    print("Definition of kernel : %.3f" % Elapsed)
148 271 equemene
149 271 equemene
    TimeIn=time.time()
150 271 equemene
    MyDFT = mod.get_function("MyDFT")
151 271 equemene
    Elapsed=time.time()-TimeIn
152 271 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
153 271 equemene
154 271 equemene
    TimeIn=time.time()
155 273 equemene
    A_np = np.zeros_like(a_np)
156 273 equemene
    B_np = np.zeros_like(a_np)
157 271 equemene
    Elapsed=time.time()-TimeIn
158 271 equemene
    print("Allocation on Host for results : %.3f" % Elapsed)
159 271 equemene
160 271 equemene
    TimeIn=time.time()
161 271 equemene
    # MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
162 271 equemene
    #       block=(1,1,1), grid=(a_np.size,1))
163 271 equemene
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
164 271 equemene
          block=(1024,1,1), grid=(int(a_np.size/1024),1))
165 271 equemene
    Elapsed=time.time()-TimeIn
166 271 equemene
    print("Execution of kernel : %.3f" % Elapsed)
167 271 equemene
    return(A_np,B_np)
168 271 equemene
169 271 equemene
import sys
170 271 equemene
import time
171 271 equemene
172 271 equemene
if __name__=='__main__':
173 271 equemene
174 271 equemene
    # Size of input vectors definition based on stdin
175 271 equemene
    import sys
176 271 equemene
    try:
177 271 equemene
        SIZE=int(sys.argv[1])
178 271 equemene
        print("Size of vectors set to %i" % SIZE)
179 271 equemene
    except:
180 271 equemene
        SIZE=256
181 271 equemene
        print("Size of vectors set to default size %i" % SIZE)
182 271 equemene
183 271 equemene
    a_np = np.ones(SIZE).astype(np.float32)
184 271 equemene
    b_np = np.ones(SIZE).astype(np.float32)
185 271 equemene
186 271 equemene
    C_np = np.zeros(SIZE).astype(np.float32)
187 271 equemene
    D_np = np.zeros(SIZE).astype(np.float32)
188 271 equemene
    C_np[0] = np.float32(SIZE)
189 271 equemene
    D_np[0] = np.float32(SIZE)
190 271 equemene
191 272 equemene
    # # Native & Naive Implementation
192 272 equemene
    # print("Performing naive implementation")
193 272 equemene
    # TimeIn=time.time()
194 272 equemene
    # c_np,d_np=MyDFT(a_np,b_np)
195 272 equemene
    # NativeElapsed=time.time()-TimeIn
196 272 equemene
    # NativeRate=int(SIZE/NativeElapsed)
197 272 equemene
    # print("NativeRate: %i" % NativeRate)
198 272 equemene
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))
199 271 equemene
200 271 equemene
    # Native & Numpy Implementation
201 271 equemene
    print("Performing Numpy implementation")
202 271 equemene
    TimeIn=time.time()
203 271 equemene
    e_np,f_np=NumpyDFT(a_np,b_np)
204 271 equemene
    NumpyElapsed=time.time()-TimeIn
205 271 equemene
    NumpyRate=int(SIZE/NumpyElapsed)
206 271 equemene
    print("NumpyRate: %i" % NumpyRate)
207 271 equemene
    print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))
208 271 equemene
209 271 equemene
    # Native & Numba Implementation
210 271 equemene
    print("Performing Numba implementation")
211 271 equemene
    TimeIn=time.time()
212 271 equemene
    g_np,h_np=NumbaDFT(a_np,b_np)
213 271 equemene
    NumbaElapsed=time.time()-TimeIn
214 271 equemene
    NumbaRate=int(SIZE/NumbaElapsed)
215 271 equemene
    print("NumbaRate: %i" % NumbaRate)
216 271 equemene
    print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))
217 271 equemene
218 271 equemene
    # OpenCL Implementation
219 273 equemene
    print("Performing OpenCL implementation")
220 271 equemene
    TimeIn=time.time()
221 271 equemene
    i_np,j_np=OpenCLDFT(a_np,b_np)
222 271 equemene
    OpenCLElapsed=time.time()-TimeIn
223 271 equemene
    OpenCLRate=int(SIZE/OpenCLElapsed)
224 271 equemene
    print("OpenCLRate: %i" % OpenCLRate)
225 271 equemene
    print("Precision: ",np.linalg.norm(i_np-C_np),np.linalg.norm(j_np-D_np))
226 271 equemene
227 271 equemene
    # CUDA Implementation
228 273 equemene
    print("Performing CUDA implementation")
229 271 equemene
    TimeIn=time.time()
230 271 equemene
    k_np,l_np=CUDADFT(a_np,b_np)
231 271 equemene
    CUDAElapsed=time.time()-TimeIn
232 271 equemene
    CUDARate=int(SIZE/CUDAElapsed)
233 271 equemene
    print("CUDARate: %i" % CUDARate)
234 271 equemene
    print("Precision: ",np.linalg.norm(k_np-C_np),np.linalg.norm(l_np-D_np))