Statistiques
| Révision :

root / ETSN / MySteps_5c.py @ 297

Historique | Voir | Annoter | Télécharger (10,36 ko)

1 269 equemene
#!/usr/bin/env python3
2 269 equemene
3 269 equemene
import numpy as np
4 269 equemene
import pyopencl as cl
5 269 equemene
6 269 equemene
# piling 16 arithmetical functions
7 269 equemene
def MySillyFunction(x):
8 269 equemene
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
9 269 equemene
10 269 equemene
# Native Operation under Numpy (for prototyping & tests
11 269 equemene
def NativeAddition(a_np,b_np):
12 269 equemene
    return(a_np+b_np)
13 269 equemene
14 269 equemene
# Native Operation with MySillyFunction under Numpy (for prototyping & tests
15 269 equemene
def NativeSillyAddition(a_np,b_np):
16 269 equemene
    return(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_np))))))))))))))))+MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_np)))))))))))))))))
17 269 equemene
18 269 equemene
# CUDA complete operation
19 269 equemene
def CUDAAddition(a_np,b_np):
20 269 equemene
    import pycuda.autoinit
21 269 equemene
    import pycuda.driver as drv
22 269 equemene
    import numpy
23 269 equemene
24 269 equemene
    from pycuda.compiler import SourceModule
25 269 equemene
    mod = SourceModule("""
26 269 equemene
    __global__ void sum(float *dest, float *a, float *b)
27 269 equemene
{
28 269 equemene
  // const int i = threadIdx.x;
29 269 equemene
  const int i = blockIdx.x;
30 269 equemene
  dest[i] = a[i] + b[i];
31 269 equemene
}
32 269 equemene
""")
33 269 equemene
34 269 equemene
    # sum = mod.get_function("sum")
35 269 equemene
    sum = mod.get_function("sum")
36 269 equemene
37 269 equemene
    res_np = numpy.zeros_like(a_np)
38 269 equemene
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
39 269 equemene
        block=(1,1,1), grid=(a_np.size,1))
40 269 equemene
    return(res_np)
41 269 equemene
42 269 equemene
# CUDA Silly complete operation
43 269 equemene
def CUDASillyAddition(a_np,b_np):
44 269 equemene
    import pycuda.autoinit
45 269 equemene
    import pycuda.driver as drv
46 269 equemene
    import numpy
47 269 equemene
48 269 equemene
    from pycuda.compiler import SourceModule
49 269 equemene
    mod = SourceModule("""
50 269 equemene
__device__ float MySillyFunction(float x)
51 269 equemene
{
52 269 equemene
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
53 269 equemene
}
54 269 equemene

55 269 equemene
__global__ void sillysum(float *dest, float *a, float *b)
56 269 equemene
{
57 269 equemene
  const int i = blockIdx.x;
58 269 equemene
  dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
59 269 equemene
}
60 269 equemene

61 269 equemene
__global__ void hybridsillysum(float *dest, float *a, float *b)
62 269 equemene
{
63 269 equemene
  const int i = threadIdx.x+blockDim.x*blockIdx.x;
64 269 equemene
  dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
65 269 equemene
}
66 269 equemene
""")
67 269 equemene
68 269 equemene
    # sum = mod.get_function("sum")
69 269 equemene
    # sillysum = mod.get_function("sillysum")
70 269 equemene
    hybridsillysum = mod.get_function("hybridsillysum")
71 269 equemene
72 269 equemene
    res_np = numpy.zeros_like(a_np)
73 269 equemene
    threads=1024
74 269 equemene
    blocks=int(a_np.size/threads)
75 269 equemene
    # sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
76 269 equemene
    #          block=(threads,1,1), grid=(blocks,1))
77 269 equemene
    hybridsillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
78 269 equemene
                   block=(threads,1,1), grid=(blocks,1))
79 269 equemene
    return(res_np)
80 269 equemene
81 269 equemene
# OpenCL complete operation
82 269 equemene
def OpenCLAddition(a_np,b_np):
83 269 equemene
84 269 equemene
    # Context creation
85 269 equemene
    ctx = cl.create_some_context()
86 269 equemene
    # Every process is stored in a queue
87 269 equemene
    queue = cl.CommandQueue(ctx)
88 269 equemene
89 269 equemene
    TimeIn=time.time()
90 269 equemene
    # Copy from Host to Device using pointers
91 269 equemene
    mf = cl.mem_flags
92 269 equemene
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
93 269 equemene
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
94 269 equemene
    Elapsed=time.time()-TimeIn
95 269 equemene
    print("Copy from Host 2 Device : %.3f" % Elapsed)
96 269 equemene
97 269 equemene
    TimeIn=time.time()
98 269 equemene
    # Definition of kernel under OpenCL
99 269 equemene
    prg = cl.Program(ctx, """
100 269 equemene
__kernel void sum(
101 269 equemene
    __global const float *a_g, __global const float *b_g, __global float *res_g)
102 269 equemene
{
103 269 equemene
  int gid = get_global_id(0);
104 269 equemene
  res_g[gid] = a_g[gid] + b_g[gid];
105 269 equemene
}
106 269 equemene
""").build()
107 269 equemene
    Elapsed=time.time()-TimeIn
108 269 equemene
    print("Building kernels : %.3f" % Elapsed)
109 269 equemene
110 269 equemene
    TimeIn=time.time()
111 269 equemene
    # Memory allocation on Device for result
112 269 equemene
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
113 269 equemene
    Elapsed=time.time()-TimeIn
114 269 equemene
    print("Allocation on Device for results : %.3f" % Elapsed)
115 269 equemene
116 269 equemene
    TimeIn=time.time()
117 269 equemene
    # Synthesis of function "sum" inside Kernel Sources
118 269 equemene
    knl = prg.sum  # Use this Kernel object for repeated calls
119 269 equemene
    Elapsed=time.time()-TimeIn
120 269 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
121 269 equemene
122 269 equemene
    TimeIn=time.time()
123 269 equemene
    # Call of kernel previously defined
124 269 equemene
    knl(queue, a_np.shape, None, a_g, b_g, res_g)
125 269 equemene
    Elapsed=time.time()-TimeIn
126 269 equemene
    print("Execution of kernel : %.3f" % Elapsed)
127 269 equemene
128 269 equemene
    TimeIn=time.time()
129 269 equemene
    # Creation of vector for result with same size as input vectors
130 269 equemene
    res_np = np.empty_like(a_np)
131 269 equemene
    Elapsed=time.time()-TimeIn
132 269 equemene
    print("Allocation on Host for results: %.3f" % Elapsed)
133 269 equemene
134 269 equemene
    TimeIn=time.time()
135 269 equemene
    # Copy from Device to Host
136 269 equemene
    cl.enqueue_copy(queue, res_np, res_g)
137 269 equemene
    Elapsed=time.time()-TimeIn
138 269 equemene
    print("Copy from Device 2 Host : %.3f" % Elapsed)
139 269 equemene
140 275 equemene
    # Liberation of memory
141 275 equemene
    a_g.release()
142 275 equemene
    b_g.release()
143 275 equemene
    res_g.release()
144 275 equemene
145 269 equemene
    return(res_np)
146 269 equemene
147 269 equemene
# OpenCL complete operation
148 269 equemene
def OpenCLSillyAddition(a_np,b_np):
149 269 equemene
150 269 equemene
    # Context creation
151 269 equemene
    ctx = cl.create_some_context()
152 269 equemene
    # Every process is stored in a queue
153 269 equemene
    queue = cl.CommandQueue(ctx)
154 269 equemene
155 269 equemene
    TimeIn=time.time()
156 269 equemene
    # Copy from Host to Device using pointers
157 269 equemene
    mf = cl.mem_flags
158 269 equemene
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
159 269 equemene
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
160 269 equemene
    Elapsed=time.time()-TimeIn
161 269 equemene
    print("Copy from Host 2 Device : %.3f" % Elapsed)
162 269 equemene
163 269 equemene
    TimeIn=time.time()
164 269 equemene
    # Definition of kernel under OpenCL
165 269 equemene
    prg = cl.Program(ctx, """
166 269 equemene

167 269 equemene
float MySillyFunction(float x)
168 269 equemene
{
169 269 equemene
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
170 269 equemene
}
171 269 equemene

172 269 equemene
__kernel void sillysum(
173 269 equemene
    __global const float *a_g, __global const float *b_g, __global float *res_g)
174 269 equemene
{
175 269 equemene
  int gid = get_global_id(0);
176 269 equemene
  res_g[gid] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_g[gid])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_g[gid]))))))))))))))));
177 269 equemene
}
178 269 equemene

179 269 equemene
__kernel void sum(
180 269 equemene
    __global const float *a_g, __global const float *b_g, __global float *res_g)
181 269 equemene
{
182 269 equemene
  int gid = get_global_id(0);
183 269 equemene
  res_g[gid] = a_g[gid] + b_g[gid];
184 269 equemene
}
185 269 equemene
""").build()
186 269 equemene
    Elapsed=time.time()-TimeIn
187 269 equemene
    print("Building kernels : %.3f" % Elapsed)
188 269 equemene
189 269 equemene
    TimeIn=time.time()
190 269 equemene
    # Memory allocation on Device for result
191 269 equemene
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
192 269 equemene
    Elapsed=time.time()-TimeIn
193 269 equemene
    print("Allocation on Device for results : %.3f" % Elapsed)
194 269 equemene
195 269 equemene
    TimeIn=time.time()
196 269 equemene
    # Synthesis of function "sillysum" inside Kernel Sources
197 269 equemene
    knl = prg.sillysum  # Use this Kernel object for repeated calls
198 269 equemene
    Elapsed=time.time()-TimeIn
199 269 equemene
    print("Synthesis of kernel : %.3f" % Elapsed)
200 269 equemene
201 269 equemene
    TimeIn=time.time()
202 269 equemene
    # Call of kernel previously defined
203 269 equemene
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
204 269 equemene
    #
205 269 equemene
    CallCL.wait()
206 269 equemene
    Elapsed=time.time()-TimeIn
207 269 equemene
    print("Execution of kernel : %.3f" % Elapsed)
208 269 equemene
209 269 equemene
    TimeIn=time.time()
210 269 equemene
    # Creation of vector for result with same size as input vectors
211 269 equemene
    res_np = np.empty_like(a_np)
212 269 equemene
    Elapsed=time.time()-TimeIn
213 269 equemene
    print("Allocation on Host for results: %.3f" % Elapsed)
214 269 equemene
215 269 equemene
    TimeIn=time.time()
216 269 equemene
    # Copy from Device to Host
217 269 equemene
    cl.enqueue_copy(queue, res_np, res_g)
218 269 equemene
    Elapsed=time.time()-TimeIn
219 269 equemene
    print("Copy from Device 2 Host : %.3f" % Elapsed)
220 269 equemene
221 275 equemene
    # Liberation of memory
222 275 equemene
    a_g.release()
223 275 equemene
    b_g.release()
224 275 equemene
    res_g.release()
225 275 equemene
226 269 equemene
    return(res_np)
227 269 equemene
228 269 equemene
import sys
229 269 equemene
import time
230 269 equemene
231 269 equemene
if __name__=='__main__':
232 269 equemene
233 269 equemene
    # Size of input vectors definition based on stdin
234 269 equemene
    import sys
235 269 equemene
    try:
236 269 equemene
        SIZE=int(sys.argv[1])
237 269 equemene
        print("Size of vectors set to %i" % SIZE)
238 269 equemene
    except:
239 269 equemene
        SIZE=50000
240 269 equemene
        print("Size of vectors set to default size %i" % SIZE)
241 269 equemene
242 269 equemene
    a_np = np.random.rand(SIZE).astype(np.float32)
243 269 equemene
    b_np = np.random.rand(SIZE).astype(np.float32)
244 269 equemene
245 269 equemene
    # Native Implementation
246 269 equemene
    TimeIn=time.time()
247 269 equemene
    # res_np=NativeAddition(a_np,b_np)
248 269 equemene
    res_np=NativeSillyAddition(a_np,b_np)
249 269 equemene
    NativeElapsed=time.time()-TimeIn
250 269 equemene
    NativeRate=int(SIZE/NativeElapsed)
251 269 equemene
    print("NativeRate: %i" % NativeRate)
252 269 equemene
253 269 equemene
    # OpenCL Implementation
254 269 equemene
    TimeIn=time.time()
255 269 equemene
    # res_cl=OpenCLAddition(a_np,b_np)
256 269 equemene
    res_cl=OpenCLSillyAddition(a_np,b_np)
257 269 equemene
    OpenCLElapsed=time.time()-TimeIn
258 269 equemene
    OpenCLRate=int(SIZE/OpenCLElapsed)
259 269 equemene
    print("OpenCLRate: %i" % OpenCLRate)
260 269 equemene
261 269 equemene
    # CUDA Implementation
262 269 equemene
    TimeIn=time.time()
263 269 equemene
    # res_cuda=CUDAAddition(a_np,b_np)
264 269 equemene
    res_cuda=CUDASillyAddition(a_np,b_np)
265 269 equemene
    CUDAElapsed=time.time()-TimeIn
266 269 equemene
    CUDARate=int(SIZE/CUDAElapsed)
267 269 equemene
    print("CUDARate: %i" % CUDARate)
268 269 equemene
269 269 equemene
    print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))
270 269 equemene
    print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))
271 269 equemene
272 269 equemene
    # Check on CPU with Numpy:
273 269 equemene
    print(res_cl - res_np)
274 269 equemene
    print(np.linalg.norm(res_cl - res_np))
275 269 equemene
    try:
276 269 equemene
        assert np.allclose(res_np, res_cl)
277 269 equemene
    except:
278 269 equemene
        print("Results between Native & OpenCL seem to be too different!")
279 269 equemene
280 269 equemene
    # Check on CPU with Numpy:
281 269 equemene
    print(res_cuda - res_np)
282 269 equemene
    print(np.linalg.norm(res_cuda - res_np))
283 269 equemene
    try:
284 269 equemene
        assert np.allclose(res_np, res_cuda)
285 269 equemene
    except:
286 269 equemene
        print("Results between Native & CUDA seem to be too different!")