root / ETSN / MySteps_3.py @ 288

#!/usr/bin/env python3

import numpy as np
import pyopencl as cl

# piling 16 arithmetical functions
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
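
# The 16 chained calls are numerically an identity for inputs in [0, 1) (the
# range of the random vectors used below): each inverse undoes the call it
# wraps, so MySillyFunction(x) ~ x up to floating-point rounding, and the
# function only piles arithmetic work onto every element.  A minimal,
# illustrative sanity check, assuming float32 inputs away from the interval
# ends:
#
#   x = np.linspace(0.1, 0.9, 9, dtype=np.float32)
#   assert np.allclose(MySillyFunction(x), x, atol=1e-4)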

# Native Operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native Operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np):
    return(MySillyFunction(a_np)+MySillyFunction(b_np))

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    # One thread per element, spread over as many blocks as needed:
    # a single block is limited to 1024 threads on current GPUs, so a
    # one-block launch of a_np.size threads fails for large vectors.
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b, const int size)
{
  const int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < size)
    dest[i] = a[i] + b[i];
}
""")

    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    threads = 1024
    blocks = (a_np.size+threads-1)//threads
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), numpy.int32(a_np.size),
        block=(threads,1,1), grid=(blocks,1))
    return(res_np)
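
# A CUDA counterpart of OpenCLSillyAddition below: the same piled arithmetic
# applied to every element, written as a __device__ function.  This is an
# illustrative sketch (CUDASillyAddition is not called by the main block); it
# assumes the same pycuda toolchain as CUDAAddition and the single-precision
# CUDA math functions (cosf, acosf, ..., powf).
def CUDASillyAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
  return(powf(sqrtf(logf(expf(atanhf(tanhf(asinhf(sinhf(acoshf(coshf(atanf(tanf(asinf(sinf(acosf(cosf(x))))))))))))))),2.0f));
}

__global__ void sillysum(float *dest, float *a, float *b, const int size)
{
  const int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < size)
    dest[i] = MySillyFunction(a[i]) + MySillyFunction(b[i]);
}
""")

    sillysum = mod.get_function("sillysum")

    res_np = numpy.zeros_like(a_np)
    threads = 1024
    blocks = (a_np.size+threads-1)//threads
    sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), numpy.int32(a_np.size),
             block=(threads,1,1), grid=(blocks,1))
    return(res_np)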

# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation is pushed into a command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined (the call only enqueues the kernel)
    knl(queue, a_np.shape, None, a_g, b_g, res_g)
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release of device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
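
# The same addition can be written with pyopencl.array, which hides the explicit
# Buffer handling above behind a Numpy-like container.  A minimal sketch
# (OpenCLArrayAddition is illustrative and not called by the main block):
def OpenCLArrayAddition(a_np,b_np):
    import pyopencl.array as cl_array
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    # Copies from Host to Device
    a_g = cl_array.to_device(queue, a_np)
    b_g = cl_array.to_device(queue, b_np)
    # Elementwise addition runs on the device; get() copies the result back
    return((a_g+b_g).get())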

# OpenCL complete operation with MySillyFunction
def OpenCLSillyAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation is pushed into a command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernels under OpenCL
    prg = cl.Program(ctx, """

float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = MySillyFunction(a_g[gid]) + MySillyFunction(b_g[gid]);
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    # Wait for the kernel to complete before stopping the clock
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release of device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
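
# Another way to time the kernel alone is OpenCL event profiling, assuming the
# command queue is created with profiling enabled (sketch only, not used above):
#
#   queue = cl.CommandQueue(ctx,
#       properties=cl.command_queue_properties.PROFILING_ENABLE)
#   CallCL = knl(queue, a_np.shape, None, a_g, b_g, res_g)
#   CallCL.wait()
#   print("Kernel time: %.9f" % (1e-9*(CallCL.profile.end-CallCL.profile.start)))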

import sys
import time

if __name__=='__main__':

    # Size of input vectors defined from the first command line argument
    try:
        SIZE=int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        SIZE=50000
        print("Size of vectors set to default size %i" % SIZE)

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn=time.time()
    # res_np=NativeSillyAddition(a_np,b_np)
    res_np=NativeAddition(a_np,b_np)
    NativeElapsed=time.time()-TimeIn
    NativeRate=int(SIZE/NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    TimeIn=time.time()
    # res_cl=OpenCLSillyAddition(a_np,b_np)
    res_cl=OpenCLAddition(a_np,b_np)
    OpenCLElapsed=time.time()-TimeIn
    OpenCLRate=int(SIZE/OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA Implementation
    TimeIn=time.time()
    res_cuda=CUDAAddition(a_np,b_np)
    CUDAElapsed=time.time()-TimeIn
    CUDARate=int(SIZE/CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))

    # Check the OpenCL result against the Numpy reference:
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    assert np.allclose(res_np, res_cl)

    # Check the CUDA result against the Numpy reference:
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    assert np.allclose(res_np, res_cuda)
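
# Example run: the vector size comes from the first argument, and the OpenCL
# device used by create_some_context can be preselected through PyOpenCL's
# PYOPENCL_CTX environment variable (its value depends on the local devices):
#   PYOPENCL_CTX=0 python3 MySteps_3.py 100000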