Statistiques
| Révision :

root / ETSN / MyDFT_6.py @ 274

Historique | Voir | Annoter | Télécharger (10,1 ko)

1 274 equemene
#!/usr/bin/env python3
2 274 equemene
3 274 equemene
import numpy as np
4 274 equemene
import pyopencl as cl
5 274 equemene
from numpy import pi,cos,sin
6 274 equemene
7 274 equemene
# Naive Discrete Fourier Transform
def MyDFT(x, y):
    """Reference O(n^2) DFT of the complex signal (x + i*y).

    Parameters:
        x, y: 1-D arrays of equal length holding the real and imaginary
              parts of the input signal.
    Returns:
        (X, Y): float32 arrays with the real and imaginary parts of the
                transform, same length as the input.

    Kept deliberately simple as the correctness baseline for the Numpy,
    Numba, OpenCL and CUDA variants below.
    """
    size = x.shape[0]
    X = np.zeros(size).astype(np.float32)
    Y = np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            # Hoist the angle: it was recomputed four times per iteration.
            angle = 2. * pi * i * j / size
            X[i] = X[i] + x[j] * cos(angle) - y[j] * sin(angle)
            Y[i] = Y[i] + x[j] * sin(angle) + y[j] * cos(angle)
    return (X, Y)
17 274 equemene
18 274 equemene
# Numpy Discrete Fourier Transform
def NumpyDFT(x, y):
    """DFT of (x + i*y) with the inner loop vectorized by NumPy.

    Parameters:
        x, y: 1-D arrays of equal length (real and imaginary parts).
    Returns:
        (X, Y): float32 arrays, real and imaginary parts of the transform.

    Same O(n^2) algorithm as MyDFT, but each output sample is a single
    vectorized reduction instead of a Python inner loop.
    """
    size = x.shape[0]
    X = np.zeros(size).astype(np.float32)
    Y = np.zeros(size).astype(np.float32)
    # nj[j] = 2*pi*j/size, so i*nj is the angle vector for output sample i.
    nj = np.multiply(2.0 * np.pi / size, np.arange(size)).astype(np.float32)
    for i in range(size):
        # Hoist the trig vectors: each was evaluated twice per row before.
        c = np.cos(i * nj)
        s = np.sin(i * nj)
        X[i] = np.sum(np.subtract(np.multiply(c, x), np.multiply(s, y)))
        Y[i] = np.sum(np.add(np.multiply(s, x), np.multiply(c, y)))
    return (X, Y)
28 274 equemene
29 274 equemene
# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x, y):
    """DFT of (x + i*y) JIT-compiled by Numba with a parallel outer loop.

    Parameters:
        x, y: 1-D arrays of equal length (real and imaginary parts).
    Returns:
        (X, Y): float32 arrays, real and imaginary parts of the transform.

    Each output sample is independent of the others, so the outer loop is
    a numba.prange and rows are computed concurrently.
    """
    size = x.shape[0]
    X = np.zeros(size).astype(np.float32)
    Y = np.zeros(size).astype(np.float32)
    # nj[j] = 2*pi*j/size; i*nj is the angle vector for output sample i.
    nj = np.multiply(2.0 * np.pi / size, np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        # Hoist the trig vectors: each was evaluated twice per row before.
        c = np.cos(i * nj)
        s = np.sin(i * nj)
        X[i] = np.sum(np.subtract(np.multiply(c, x), np.multiply(s, y)))
        Y[i] = np.sum(np.add(np.multiply(s, x), np.multiply(c, y)))
    return (X, Y)
41 274 equemene
42 274 equemene
# OpenCL complete operation
def OpenCLDFT(a_np, b_np):
    """DFT of (a_np + i*b_np) executed on an OpenCL device.

    Parameters:
        a_np, b_np: 1-D float32 numpy arrays (real and imaginary parts).
    Returns:
        (A_ocl, B_ocl): float32 arrays, real and imaginary parts of the
                        transform, copied back from the device.

    Each phase (transfers, build, allocation, execution) is timed and
    printed individually.
    """
    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn = time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed = time.time() - TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn = time.time()
    # Definition of kernel under OpenCL: one work-item per output sample,
    # each work-item reduces over the whole input (O(n) per item).
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed = time.time() - TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn = time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed = time.time() - TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    # BUG FIX: the timer was not reset here, so the "Allocation on Device"
    # figure previously included the host-allocation phase above.
    TimeIn = time.time()
    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed = time.time() - TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn = time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed = time.time() - TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Call of kernel previously defined: global size = a_np.shape,
    # work-group size left to the implementation (None).
    CallCL = knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    # Block until the kernel has finished so the timing is meaningful.
    CallCL.wait()
    Elapsed = time.time() - TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed = time.time() - TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return (A_ocl, B_ocl)
116 274 equemene
117 274 equemene
# CUDA complete operation
def CUDADFT(a_np, b_np):
    """DFT of (a_np + i*b_np) executed on a CUDA device via PyCUDA.

    Parameters:
        a_np, b_np: 1-D float32 numpy arrays (real and imaginary parts).
    Returns:
        (A_np, B_np): float32 arrays, real and imaginary parts of the
                      transform, copied back from the device.

    NOTE(review): the "Execution of kernel" timing below also includes the
    host<->device transfers performed implicitly by drv.In/drv.Out.
    """
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn = time.time()
    # One block per output sample (grid of a_np.size blocks of 1 thread);
    # each block reduces over the whole input.
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed = time.time() - TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed = time.time() - TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    A_np = numpy.zeros_like(a_np)
    B_np = numpy.zeros_like(a_np)
    Elapsed = time.time() - TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn = time.time()
    # drv.In/drv.Out allocate device buffers and copy to/from the host
    # around the launch.
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1, 1, 1), grid=(a_np.size, 1))
    Elapsed = time.time() - TimeIn
    print("Execution of kernel : %.3f" % Elapsed)
    return (A_np, B_np)
164 274 equemene
165 274 equemene
import sys
import time

if __name__ == '__main__':

    # Defaults: OpenCL backend, 1024-sample vector, first device.
    GpuStyle = 'OpenCL'
    SIZE = 1024
    Device = 0

    import getopt

    HowToUse = '%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hg:s:d:",
                                   ["gpustyle=", "size=", "device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices = []
    Alu = {}

    for opt, arg in opts:
        if opt == '-h':
            # -h: print usage plus every device visible to OpenCL and CUDA,
            # then exit.
            print(HowToUse % sys.argv[0])

            print("\nInformations about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id = 0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType = "xPU"
                        print("Device #%i from %s of type %s : %s" % (Id, platform.vendor.lstrip(), deviceType, device.name.lstrip()))
                        Id = Id + 1

            except Exception:
                # Narrowed from a bare except: still best-effort, but no
                # longer swallows SystemExit/KeyboardInterrupt.
                print("Your platform does not seem to support OpenCL")

            print("\nInformations about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device = cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id, device.name()))
                # BUG FIX: bare `print` was a Python-2 leftover and printed
                # nothing under Python 3.
                print()
            except Exception:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device = int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle == 'CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device = cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id, device.name()))
                if Id in Devices:
                    Alu[Id] = 'GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle == 'OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id = 0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType = "xPU"
                    print("Device #%i from %s of type %s : %s" % (Id, platform.vendor.lstrip().rstrip(), deviceType, device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id] = deviceType
                    Id = Id + 1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    # Input signal: constant (1 + 1i) everywhere, so its DFT is known in
    # closed form and usable as the reference below.
    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    # Expected transform of the constant signal: SIZE at bin 0, zero elsewhere.
    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # The benchmark sections below are intentionally commented out; enable
    # the implementations you want to compare.

    # # Native & Naive Implementation
    # print("Performing naive implementation")
    # TimeIn=time.time()
    # c_np,d_np=MyDFT(a_np,b_np)
    # NativeElapsed=time.time()-TimeIn
    # NativeRate=int(SIZE/NativeElapsed)
    # print("NativeRate: %i" % NativeRate)
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))

    # # Native & Numpy Implementation
    # print("Performing Numpy implementation")
    # TimeIn=time.time()
    # e_np,f_np=NumpyDFT(a_np,b_np)
    # NumpyElapsed=time.time()-TimeIn
    # NumpyRate=int(SIZE/NumpyElapsed)
    # print("NumpyRate: %i" % NumpyRate)
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))

    # # Native & Numba Implementation
    # print("Performing Numba implementation")
    # TimeIn=time.time()
    # g_np,h_np=NumbaDFT(a_np,b_np)
    # NumbaElapsed=time.time()-TimeIn
    # NumbaRate=int(SIZE/NumbaElapsed)
    # print("NumbaRate: %i" % NumbaRate)
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))

    # # OpenCL Implementation
    # print("Performing OpenCL implementation")
    # TimeIn=time.time()
    # i_np,j_np=OpenCLDFT(a_np,b_np)
    # OpenCLElapsed=time.time()-TimeIn
    # OpenCLRate=int(SIZE/OpenCLElapsed)
    # print("OpenCLRate: %i" % OpenCLRate)
    # print("Precision: ",np.linalg.norm(i_np-C_np),np.linalg.norm(j_np-D_np))

    # # CUDA Implementation
    # print("Performing CUDA implementation")
    # TimeIn=time.time()
    # k_np,l_np=CUDADFT(a_np,b_np)
    # CUDAElapsed=time.time()-TimeIn
    # CUDARate=int(SIZE/CUDAElapsed)
    # print("CUDARate: %i" % CUDARate)
    # print("Precision: ",np.linalg.norm(k_np-C_np),np.linalg.norm(l_np-D_np))