Statistiques
| Révision :

root / ETSN / MySteps_5b.py @ 274

Historique | Voir | Annoter | Télécharger (9,28 ko)

1
#!/usr/bin/env python3
2

    
3
import numpy as np
4
import pyopencl as cl
5

    
6
# piling 16 arithmetical functions
def MySillyFunction(x):
    """Run x through 16 chained elementary functions and return the result.

    Each consecutive pair (f, f^-1) cancels analytically, so the output
    equals x up to floating-point rounding — the point is the work, not
    the value.  Applying the ufuncs in sequence reproduces the original
    nested-call order exactly.
    """
    stages = (np.cos, np.arccos, np.sin, np.arcsin,
              np.tan, np.arctan, np.cosh, np.arccosh,
              np.sinh, np.arcsinh, np.tanh, np.arctanh,
              np.exp, np.log, np.sqrt)
    y = x
    for stage in stages:
        y = stage(y)
    return np.power(y, 2)
9

    
10
# Native Operation under Numpy (for prototyping & tests)
def NativeAddition(a_np, b_np):
    """Element-wise sum of two arrays on the host, via the Numpy ufunc."""
    total = np.add(a_np, b_np)
    return total
13

    
14
# Native Operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np, b_np):
    """Apply MySillyFunction 16 times to each operand, then add them.

    Equivalent to the 16-deep nested-call version: the loop applies the
    same function the same number of times in the same order, so the
    floating-point result is identical.
    """
    left = a_np
    right = b_np
    for _ in range(16):
        left = MySillyFunction(left)
        right = MySillyFunction(right)
    return left + right
17

    
18
# CUDA complete operation
def CUDAAddition(a_np, b_np):
    """Element-wise sum of two float32 vectors on a CUDA device.

    Compiles a trivial kernel, copies both inputs host-to-device,
    launches one single-thread block per element (indexed through
    blockIdx.x) and returns the summed vector as a Numpy array.

    Requires a working CUDA device; pycuda.autoinit creates the context.
    """
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;
  const int i = blockIdx.x;
  dest[i] = a[i] + b[i];
}
""")

    # Named sum_kernel to avoid shadowing the builtin `sum`.
    sum_kernel = mod.get_function("sum")

    res_np = np.zeros_like(a_np)
    # One thread per block: the grid spans the elements via blockIdx.x.
    sum_kernel(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
               block=(1, 1, 1), grid=(a_np.size, 1))
    return res_np
41

    
42
# CUDA Silly complete operation
def CUDASillyAddition(a_np, b_np):
    """Silly element-wise sum of two float32 vectors on a CUDA device.

    Device-side mirror of NativeSillyAddition: each operand goes through
    16 applications of MySillyFunction (itself a pile of 16 mutually
    inverse operations) before the addition.  One single-thread block per
    element, indexed through blockIdx.x.

    Requires a working CUDA device; pycuda.autoinit creates the context.
    """
    import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2)); 
}

__global__ void sillysum(float *dest, float *a, float *b)
{
  const int i = blockIdx.x;
  dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
}
""")

    sillysum = mod.get_function("sillysum")

    res_np = np.zeros_like(a_np)
    # One thread per block: the grid spans the elements via blockIdx.x.
    sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
             block=(1, 1, 1), grid=(a_np.size, 1))
    return res_np
69

    
70
# OpenCL complete operation
def OpenCLAddition(a_np, b_np):
    """Element-wise sum of two float32 vectors on an OpenCL device.

    Prints the elapsed wall-clock time of every stage (host-to-device
    copy, program build, device allocation, kernel retrieval, execution,
    host allocation, device-to-host copy) and returns the result array.
    """
    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn = time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed = time.time() - TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn = time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed = time.time() - TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn = time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed = time.time() - TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn = time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed = time.time() - TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Call of kernel previously defined.  Wait on the returned event so
    # the timing covers actual execution, not just the enqueue — this
    # matches OpenCLSillyAddition, which already waits.
    CallCL = knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed = time.time() - TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn = time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed = time.time() - TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn = time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed = time.time() - TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return res_np
130

    
131
# OpenCL complete operation
def OpenCLSillyAddition(a_np,b_np):
    """Silly element-wise sum of two float32 vectors on an OpenCL device.

    Device-side mirror of NativeSillyAddition: each operand goes through
    16 applications of MySillyFunction (itself a pile of 16 mutually
    inverse operations) before the addition.  Prints the elapsed time of
    every stage and returns the result as a Numpy array.
    """
    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL.  The program holds two kernels:
    # "sillysum" (used below) and a plain "sum" kept for comparison runs.
    prg = cl.Program(ctx, """

float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2)); 
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_g[gid])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_g[gid]))))))))))))))));
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined 
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    # Wait on the event so the timing covers execution, not just enqueue
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return(res_np)
206

    
207
import sys
import time

if __name__=='__main__':

    # Size of input vectors definition based on command-line argument.
    # Catch only the two expected failures (no argument / not an int)
    # instead of a bare except that would also swallow SystemExit etc.
    try:
        SIZE=int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        SIZE=50000
        print("Size of vectors set to default size %i" % SIZE)

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn=time.time()
    # res_np=NativeAddition(a_np,b_np)
    res_np=NativeSillyAddition(a_np,b_np)
    NativeElapsed=time.time()-TimeIn
    NativeRate=int(SIZE/NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    TimeIn=time.time()
    # res_cl=OpenCLAddition(a_np,b_np)
    res_cl=OpenCLSillyAddition(a_np,b_np)
    OpenCLElapsed=time.time()-TimeIn
    OpenCLRate=int(SIZE/OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA Implementation
    TimeIn=time.time()
    # res_cuda=CUDAAddition(a_np,b_np)
    res_cuda=CUDASillyAddition(a_np,b_np)
    CUDAElapsed=time.time()-TimeIn
    CUDARate=int(SIZE/CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))

    # Check on CPU with Numpy: OpenCL result vs Native result.
    # Catch only the assertion, so real errors still propagate.
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    try:
        assert np.allclose(res_np, res_cl)
    except AssertionError:
        print("Results between Native & OpenCL seem to be too different!")

    # Check on CPU with Numpy: CUDA result vs Native result.
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    try:
        assert np.allclose(res_np, res_cuda)
    except AssertionError:
        print("Results between Native & CUDA seem to be too different!")
266