#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl

# Pile up 16 arithmetical functions: each inverse undoes the call before it
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
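
# Each forward/inverse pair above cancels, so for x in [0, 1) -- the range of
# np.random.rand used below -- MySillyFunction(x) returns x up to float32
# rounding error. The chain exists purely as an arithmetically heavy,
# embarrassingly parallel workload; the np.allclose checks at the end of the
# script rely on this identity.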

# Native operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np):
    return(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_np))))))))))))))))+MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_np)))))))))))))))))
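
# The 16-fold nesting above can also be written as a loop. A minimal
# equivalent sketch (the helper name IteratedSilly is illustrative and is
# not used elsewhere in this script):
def IteratedSilly(x, depth=16):
    # Apply MySillyFunction `depth` times in sequence
    for _ in range(depth):
        x = MySillyFunction(x)
    return(x)
# NativeSillyAddition(a, b) is then IteratedSilly(a) + IteratedSilly(b).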

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;  // alternative: a single block of many threads
  const int i = blockIdx.x;      // here: one element per block
  dest[i] = a[i] + b[i];
}
""")

    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)
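
# Launching one thread per block, as above, is the simplest possible mapping
# but badly underutilizes the GPU; the hybridsillysum kernel below uses the
# usual threadIdx.x + blockDim.x*blockIdx.x indexing instead.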

# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__global__ void sillysum(float *dest, float *a, float *b)
{
  const int i = blockIdx.x;
  dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
}

__global__ void hybridsillysum(float *dest, float *a, float *b, const int n)
{
  const int i = threadIdx.x+blockDim.x*blockIdx.x;
  // Guard against the padding threads of the last block
  if (i < n)
    dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
}
""")

    # sillysum = mod.get_function("sillysum")
    hybridsillysum = mod.get_function("hybridsillysum")

    res_np = numpy.zeros_like(a_np)
    threads=1024
    # Round the grid up so sizes that are not a multiple of `threads` are
    # still fully covered; the kernel guards against overrun with `n`
    blocks=(a_np.size+threads-1)//threads
    # sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
    #          block=(threads,1,1), grid=(blocks,1))
    hybridsillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
                   numpy.int32(a_np.size),
                   block=(threads,1,1), grid=(blocks,1))
    return(res_np)
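
# A sketch of device-side timing with CUDA events, as an alternative to the
# host-side time.time() calls used elsewhere in this script (illustrative
# helper, not called anywhere; it assumes a CUDA context is already active,
# e.g. via pycuda.autoinit):
def TimeWithCUDAEvents(launch):
    # `launch` is any zero-argument callable that enqueues GPU work
    import pycuda.driver as drv
    start, end = drv.Event(), drv.Event()
    start.record()
    launch()
    end.record()
    end.synchronize()
    return(start.time_till(end))  # elapsed time in milliseconds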

# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation (set the PYOPENCL_CTX environment variable to pick
    # a device non-interactively)
    ctx = cl.create_some_context()
    # Every operation on the device goes through a command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host to Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of the kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for the result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Extraction of the "sum" kernel from the program
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of the kernel defined above; wait for completion so the measured
    # time covers execution, not just the enqueue
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of the result vector, same size as the input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device to Host : %.3f" % Elapsed)

    # Release device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
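
# A sketch of device-side timing with OpenCL event profiling, as an
# alternative to the host-side time.time() calls above (illustrative, not
# used by this script); it assumes a program `prg` built as above and
# buffers already allocated on the device:
def OpenCLKernelSeconds(ctx, prg, shape, a_g, b_g, res_g):
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
    evt = prg.sum(queue, shape, None, a_g, b_g, res_g)
    evt.wait()
    # Event profiling counters are in nanoseconds
    return((evt.profile.end - evt.profile.start)*1e-9)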

# OpenCL Silly complete operation
def OpenCLSillyAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation on the device goes through a command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host to Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of the kernels under OpenCL
    prg = cl.Program(ctx, """
float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_g[gid])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_g[gid]))))))))))))))));
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for the result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Extraction of the "sillysum" kernel from the program
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of the kernel defined above
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    # Wait for completion so the measured time covers execution
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of the result vector, same size as the input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device to Host : %.3f" % Elapsed)

    # Release device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)

if __name__=='__main__':

    # Size of the input vectors, read from the command line
    try:
        SIZE=int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        SIZE=50000
        print("Size of vectors set to default size %i" % SIZE)

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native implementation
    TimeIn=time.time()
    # res_np=NativeAddition(a_np,b_np)
    res_np=NativeSillyAddition(a_np,b_np)
    NativeElapsed=time.time()-TimeIn
    NativeRate=int(SIZE/NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL implementation
    TimeIn=time.time()
    # res_cl=OpenCLAddition(a_np,b_np)
    res_cl=OpenCLSillyAddition(a_np,b_np)
    OpenCLElapsed=time.time()-TimeIn
    OpenCLRate=int(SIZE/OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA implementation
    TimeIn=time.time()
    # res_cuda=CUDAAddition(a_np,b_np)
    res_cuda=CUDASillyAddition(a_np,b_np)
    CUDAElapsed=time.time()-TimeIn
    CUDARate=int(SIZE/CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))

    # Check OpenCL result against Numpy on CPU
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    try:
        assert np.allclose(res_np, res_cl)
    except AssertionError:
        print("Results between Native & OpenCL seem to be too different!")

    # Check CUDA result against Numpy on CPU
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    try:
        assert np.allclose(res_np, res_cuda)
    except AssertionError:
        print("Results between Native & CUDA seem to be too different!")
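
    # Note: hundreds of chained float32 transcendental calls accumulate
    # rounding error, so np.allclose with its default tolerances can fail
    # even when the implementations agree qualitatively; the try/except
    # blocks above report, rather than abort on, such differences.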