Statistiques
| Révision :

root / ETSN / MySteps_5.py @ 271

Historique | Voir | Annoter | Télécharger (8,21 ko)

1
#!/usr/bin/env python3
2

    
3
import numpy as np
4
import pyopencl as cl
5

    
6
# piling 16 arithmetical functions
def MySillyFunction(x):
    """Chain 16 elementary operations that cancel out pairwise.

    Each forward/inverse pair (cos/arccos ... exp/log, then sqrt followed
    by squaring) undoes the previous step, so the result is numerically
    ~x; the pile only exists to generate arithmetic load for benchmarking.
    """
    result = x
    for forward, inverse in ((np.cos, np.arccos),
                             (np.sin, np.arcsin),
                             (np.tan, np.arctan),
                             (np.cosh, np.arccosh),
                             (np.sinh, np.arcsinh),
                             (np.tanh, np.arctanh),
                             (np.exp, np.log)):
        result = inverse(forward(result))
    # sqrt then square closes the pile (same operation order as before)
    return np.power(np.sqrt(result), 2)
9

    
10
# Native Operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    """Return the element-wise sum of the two input arrays via NumPy."""
    total = a_np + b_np
    return total
13

    
14
# Native Operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np):
    """Apply MySillyFunction to each input, then sum them element-wise."""
    left = MySillyFunction(a_np)
    right = MySillyFunction(b_np)
    return left + right
17

    
18
# CUDA complete operation
def CUDAAddition(a_np,b_np):
    """Element-wise a_np + b_np computed on the GPU through PyCUDA.

    Parameters
    ----------
    a_np, b_np : 1-D float32 NumPy arrays of identical size.

    Returns
    -------
    numpy.ndarray
        New host array holding the element-wise sum.
    """
    # pycuda.autoinit initializes the CUDA context as an import side effect.
    import pycuda.autoinit
    import pycuda.driver as drv

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;
  const int i = blockIdx.x;
  dest[i] = a[i] + b[i];
}
""")

    # Fetch the compiled kernel; renamed so it no longer shadows the
    # Python builtin `sum`.
    gpu_sum = mod.get_function("sum")

    # Use the module-level `np` alias instead of a redundant local
    # `import numpy`.
    res_np = np.zeros_like(a_np)
    # One single-thread block per element: block=(1,1,1), grid spans
    # the whole vector via blockIdx.x.
    gpu_sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
            block=(1,1,1), grid=(a_np.size,1))
    return(res_np)
41

    
42
# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np):
    """Element-wise MySillyFunction(a)+MySillyFunction(b) on the GPU via PyCUDA.

    Prints the wall-clock time of each stage (kernel definition, kernel
    lookup, host allocation, execution).  Uses the module-level `time`
    import (declared near the bottom of this file).

    Parameters: a_np, b_np -- 1-D float32 NumPy arrays of identical size.
    Returns: a new host array with the per-element result.
    """
    # pycuda.autoinit initializes the CUDA context as an import side effect.
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    # CUDA C source: a __device__ helper piling the same 16 mutually
    # inverse operations as the Python MySillyFunction, plus the
    # __global__ entry point addressed by blockIdx.x (one block/element).
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2)); 
}

__global__ void sillysum(float *dest, float *a, float *b)
{
  const int i = blockIdx.x;
  dest[i] = MySillyFunction(a[i]) + MySillyFunction(b[i]);
}
""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # sum = mod.get_function("sum")
    sillysum = mod.get_function("sillysum")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Host-side result buffer, same size/dtype as the inputs.
    res_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # drv.In/drv.Out wrap the host arrays for automatic transfer around
    # the launch; grid=(size,1) launches one single-thread block per element.
    sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
             block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)
    return(res_np)
82

    
83
# OpenCL complete operation
def OpenCLAddition(a_np,b_np):
    """Element-wise a_np + b_np on an OpenCL device, printing step timings.

    Parameters
    ----------
    a_np, b_np : 1-D float32 NumPy arrays of identical size.

    Returns
    -------
    numpy.ndarray
        New host array holding the element-wise sum.
    """
    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Kernel launches are asynchronous: wait on the returned event so the
    # measured time covers the execution, not just the enqueue (same
    # pattern as OpenCLSillyAddition).
    CallCL = knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return(res_np)
143

    
144
# OpenCL complete operation
def OpenCLSillyAddition(a_np,b_np):
    """Element-wise MySillyFunction(a)+MySillyFunction(b) on an OpenCL device.

    Prints the wall-clock time of each stage (host->device copy, build,
    device allocation, kernel lookup, execution, device->host copy).
    Uses the module-level `time` import (declared near the bottom of
    this file).

    Parameters: a_np, b_np -- 1-D float32 NumPy arrays of identical size.
    Returns: a new host array with the per-element result.
    """
    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # OpenCL C source: the helper piles the same 16 mutually inverse
    # operations as the Python MySillyFunction.  NOTE(review): the plain
    # `sum` kernel is compiled here but never launched by this function.
    prg = cl.Program(ctx, """

float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2)); 
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = MySillyFunction(a_g[gid]) + MySillyFunction(b_g[gid]);
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined 
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    # Block until the kernel event completes so the timing below covers
    # the execution, not just the enqueue.
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return(res_np)
219

    
220
import sys
221
import time
222

    
223
if __name__=='__main__':

    # Size of input vectors taken from the first CLI argument.
    # (`sys` is already imported at module level; the duplicate local
    # import was removed.)
    try:
        SIZE = int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        # Missing or non-integer argument: keep the default size.
        SIZE = 50000
        print("Size of vectors set to default size %i" % SIZE)

    # float32 inputs: the CUDA/OpenCL kernels operate on float.
    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn = time.time()
    # res_np=NativeAddition(a_np,b_np)
    res_np = NativeSillyAddition(a_np, b_np)
    NativeElapsed = time.time() - TimeIn
    NativeRate = int(SIZE / NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    TimeIn = time.time()
    # res_cl=OpenCLAddition(a_np,b_np)
    res_cl = OpenCLSillyAddition(a_np, b_np)
    OpenCLElapsed = time.time() - TimeIn
    OpenCLRate = int(SIZE / OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA Implementation
    TimeIn = time.time()
    # res_cuda=CUDAAddition(a_np,b_np)
    res_cuda = CUDASillyAddition(a_np, b_np)
    CUDAElapsed = time.time() - TimeIn
    CUDARate = int(SIZE / CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    # Elements-per-second speedups relative to the NumPy implementation.
    print("OpenCLvsNative ratio: %f" % (OpenCLRate / NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate / NativeRate))

    # Check on OpenCL with Numpy.  An explicit test replaces the former
    # `assert` inside a bare `except:` (asserts vanish under `python -O`
    # and the bare except also swallowed unrelated errors).
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    if not np.allclose(res_np, res_cl):
        print("Results between Native & OpenCL seem to be too different!")

    # Check on CUDA with Numpy:
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    if not np.allclose(res_np, res_cuda):
        print("Results between Native & CUDA seem to be too different!")
279

    
280