Statistiques
| Révision :

root / ETSN / MySteps_5b.py @ 280

Historique | Voir | Annoter | Télécharger (9,45 ko)

1
#!/usr/bin/env python3
2

    
3
import numpy as np
4
import pyopencl as cl
5

    
6
# piling 16 arithmetical functions
def MySillyFunction(x):
    """Apply a chain of 16 mutually-inverse numpy operations to x.

    Each forward/inverse pair cancels out, so on the valid domain
    (e.g. x in (0, 1)) this is mathematically the identity — it exists
    purely as an artificial arithmetic workload for benchmarking.
    """
    # same operation order as the original nested expression,
    # innermost first
    steps = (np.cos, np.arccos, np.sin, np.arcsin,
             np.tan, np.arctan, np.cosh, np.arccosh,
             np.sinh, np.arcsinh, np.tanh, np.arctanh,
             np.exp, np.log, np.sqrt)
    result = x
    for step in steps:
        result = step(result)
    return np.power(result, 2)
9

    
10
# Native Operation under Numpy (for prototyping & tests
def NativeAddition(a_np, b_np):
    """Return the element-wise sum of the two inputs via numpy."""
    return np.add(a_np, b_np)
13

    
14
# Native Operation with MySillyFunction under Numpy (for prototyping & tests
def NativeSillyAddition(a_np, b_np):
    """Apply MySillyFunction 16 times to each operand, then add them.

    Replaces the former hand-written 16-deep nested call expression
    with an explicit loop; the sequence of operations is identical.
    """
    def _silly16(x):
        # 16 successive applications of the (near-identity) silly chain
        for _ in range(16):
            x = MySillyFunction(x)
        return x

    return _silly16(a_np) + _silly16(b_np)
17

    
18
# CUDA complete operation
def CUDAAddition(a_np, b_np):
    """Element-wise addition of two float32 vectors on the GPU via PyCUDA.

    Launches one single-thread block per element (grid = a_np.size) and
    returns a new array with the same shape and dtype as a_np.
    """
    import pycuda.autoinit  # side effect: creates/activates a CUDA context
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;
  const int i = blockIdx.x;
  dest[i] = a[i] + b[i];
}
""")

    # local name chosen so we don't shadow the builtin `sum`
    gpu_sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    # drv.In/drv.Out handle the host<->device transfers around the launch
    gpu_sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
            block=(1, 1, 1), grid=(a_np.size, 1))
    return res_np
41

    
42
# CUDA Silly complete operation
def CUDASillyAddition(a_np, b_np):
    """CUDA port of NativeSillyAddition.

    Applies the device-side MySillyFunction 16 times to each operand
    element and sums the results; one single-thread block per element.
    Returns a new array with the same shape and dtype as a_np.
    """
    import pycuda.autoinit  # side effect: creates/activates a CUDA context
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2)); 
}

__global__ void sillysum(float *dest, float *a, float *b)
{
  const int i = blockIdx.x;
  dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
}
""")

    sillysum = mod.get_function("sillysum")

    res_np = numpy.zeros_like(a_np)
    # drv.In/drv.Out handle the host<->device transfers around the launch
    sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
             block=(1, 1, 1), grid=(a_np.size, 1))
    return res_np
69

    
70
# OpenCL complete operation
def OpenCLAddition(a_np, b_np):
    """Element-wise addition of two float32 vectors via PyOpenCL.

    Prints a timing for each stage (host->device copy, build, device
    allocation, kernel fetch, execution, device->host copy) and returns
    the result as a new numpy array shaped like a_np.
    """

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn = time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed = time.time() - TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn = time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed = time.time() - TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn = time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed = time.time() - TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn = time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed = time.time() - TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Call of kernel previously defined. Wait on the returned event so
    # the measured time covers the execution, not just the asynchronous
    # enqueue — consistent with OpenCLSillyAddition.
    CallCL = knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed = time.time() - TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed = time.time() - TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn = time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed = time.time() - TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return res_np
135

    
136
# OpenCL complete operation
137
def OpenCLSillyAddition(a_np,b_np):
138

    
139
    # Context creation
140
    ctx = cl.create_some_context()
141
    # Every process is stored in a queue
142
    queue = cl.CommandQueue(ctx)
143

    
144
    TimeIn=time.time()
145
    # Copy from Host to Device using pointers
146
    mf = cl.mem_flags
147
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
148
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
149
    Elapsed=time.time()-TimeIn
150
    print("Copy from Host 2 Device : %.3f" % Elapsed)
151

    
152
    TimeIn=time.time()
153
    # Definition of kernel under OpenCL
154
    prg = cl.Program(ctx, """
155

156
float MySillyFunction(float x)
157
{
158
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2)); 
159
}
160

161
__kernel void sillysum(
162
    __global const float *a_g, __global const float *b_g, __global float *res_g)
163
{
164
  int gid = get_global_id(0);
165
  res_g[gid] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_g[gid])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_g[gid]))))))))))))))));
166
}
167

168
__kernel void sum(
169
    __global const float *a_g, __global const float *b_g, __global float *res_g)
170
{
171
  int gid = get_global_id(0);
172
  res_g[gid] = a_g[gid] + b_g[gid];
173
}
174
""").build()
175
    Elapsed=time.time()-TimeIn
176
    print("Building kernels : %.3f" % Elapsed)
177
    
178
    TimeIn=time.time()
179
    # Memory allocation on Device for result
180
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
181
    Elapsed=time.time()-TimeIn
182
    print("Allocation on Device for results : %.3f" % Elapsed)
183

    
184
    TimeIn=time.time()
185
    # Synthesis of function "sillysum" inside Kernel Sources
186
    knl = prg.sillysum  # Use this Kernel object for repeated calls
187
    Elapsed=time.time()-TimeIn
188
    print("Synthesis of kernel : %.3f" % Elapsed)
189

    
190
    TimeIn=time.time()
191
    # Call of kernel previously defined 
192
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
193
    # 
194
    CallCL.wait()
195
    Elapsed=time.time()-TimeIn
196
    print("Execution of kernel : %.3f" % Elapsed)
197

    
198
    TimeIn=time.time()
199
    # Creation of vector for result with same size as input vectors
200
    res_np = np.empty_like(a_np)
201
    Elapsed=time.time()-TimeIn
202
    print("Allocation on Host for results: %.3f" % Elapsed)
203

    
204
    TimeIn=time.time()
205
    # Copy from Device to Host
206
    cl.enqueue_copy(queue, res_np, res_g)
207
    Elapsed=time.time()-TimeIn
208
    print("Copy from Device 2 Host : %.3f" % Elapsed)
209

    
210
    # Liberation of memory
211
    a_g.release()
212
    b_g.release()
213
    res_g.release()    
214
    
215
    return(res_np)
216

    
217
import sys
218
import time
219

    
220
if __name__=='__main__':

    # Size of input vectors definition based on the first CLI argument.
    # (sys and time are imported at module level above.)
    try:
        SIZE = int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        # no argument, or a non-integer one: fall back to the default
        SIZE = 50000
        print("Size of vectors set to default size %i" % SIZE)

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn = time.time()
    res_np = NativeSillyAddition(a_np, b_np)
    NativeElapsed = time.time() - TimeIn
    NativeRate = int(SIZE / NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    TimeIn = time.time()
    res_cl = OpenCLSillyAddition(a_np, b_np)
    OpenCLElapsed = time.time() - TimeIn
    OpenCLRate = int(SIZE / OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA Implementation
    TimeIn = time.time()
    res_cuda = CUDASillyAddition(a_np, b_np)
    CUDAElapsed = time.time() - TimeIn
    CUDARate = int(SIZE / CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))

    # Check on CPU with Numpy:
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    try:
        assert np.allclose(res_np, res_cl)
    except AssertionError:
        print("Results between Native & OpenCL seem to be too different!")

    # Check on CPU with Numpy:
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    try:
        assert np.allclose(res_np, res_cuda)
    except AssertionError:
        print("Results between Native & CUDA seem to be too different!")
276