root / ETSN / MySteps_5c.py @ 269
#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl

# Pile up 16 elementary transcendental functions: each direct/inverse pair
# cancels, so the whole chain is numerically the identity (up to float32
# rounding) and serves purely as a compute load.
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
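# Illustrative check (a sketch, not executed by the benchmark below): the
# chain should hand back its input, although float32 rounding is amplified
# for inputs near 0, where arccos(cos(x)) is ill-conditioned:
#
#   x = np.random.rand(4).astype(np.float32)
#   print(MySillyFunction(x) - x)   # small, but not exactly zero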

# Native operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native operation with MySillyFunction under Numpy (for prototyping & tests):
# 16 nested calls per operand, i.e. 16x16=256 transcendental evaluations per
# element before the final addition.
def NativeSillyAddition(a_np,b_np):
    return(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_np))))))))))))))))+MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_np)))))))))))))))))
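# Equivalent loop-based formulation (a sketch with a hypothetical name; the
# explicit nesting above is kept because it mirrors the GPU kernel sources):
#
#   def NativeSillyAdditionLoop(a_np, b_np):
#       for _ in range(16):
#           a_np, b_np = MySillyFunction(a_np), MySillyFunction(b_np)
#       return a_np + b_np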

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void sum(float *dest, float *a, float *b)
{
  // One element per block: blockIdx.x indexes the vector directly.
  const int i = blockIdx.x;
  dest[i] = a[i] + b[i];
}
""")

    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    # One single-thread block per element: simple, but it leaves the other
    # SIMT lanes of every block idle.
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)
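# Hedged sketch: the block size used further down (1024 threads) can be
# checked against the actual device limit with PyCUDA's attribute API;
# MAX_THREADS_PER_BLOCK is 1024 on most recent NVIDIA GPUs, but not all:
#
#   import pycuda.autoinit
#   import pycuda.driver as drv
#   limit = pycuda.autoinit.device.get_attribute(
#       drv.device_attribute.MAX_THREADS_PER_BLOCK)
#   print("Max threads per block on this device: %i" % limit)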

# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__global__ void sillysum(float *dest, float *a, float *b)
{
  // One element per block, as in the simple sum kernel.
  const int i = blockIdx.x;
  dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
}

__global__ void hybridsillysum(float *dest, float *a, float *b, int size)
{
  // Global index over all blocks; the guard keeps the threads of the last,
  // partially filled block from writing past the end of the vectors.
  const int i = threadIdx.x+blockDim.x*blockIdx.x;
  if (i < size)
    dest[i] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a[i])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b[i]))))))))))))))));
}
""")

    # sillysum = mod.get_function("sillysum")
    hybridsillysum = mod.get_function("hybridsillysum")

    res_np = numpy.zeros_like(a_np)
    threads=1024
    # Round the grid up so the whole vector is covered even when its size
    # is not a multiple of the block size (the default 50000 is not).
    blocks=(a_np.size+threads-1)//threads
    # sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
    #          block=(threads,1,1), grid=(blocks,1))
    hybridsillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
                   numpy.int32(a_np.size),
                   block=(threads,1,1), grid=(blocks,1))
    return(res_np)
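# Hedged sketch of device-side timing with CUDA events (not part of the
# benchmark above); time_till() returns milliseconds and, placed around the
# call, also covers the transfers performed by drv.In/drv.Out:
#
#   start, end = drv.Event(), drv.Event()
#   start.record()
#   hybridsillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
#                  numpy.int32(a_np.size),
#                  block=(threads,1,1), grid=(blocks,1))
#   end.record()
#   end.synchronize()
#   print("GPU time: %.3f ms" % start.time_till(end))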

# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation is enqueued on this command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of the kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Extraction of the "sum" kernel from the built program
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of the kernel defined above; the launch is asynchronous, so
    # wait() is needed for the timing to cover the actual execution.
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return(res_np)
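# Hedged sketch: cl.create_some_context() may prompt interactively when
# several OpenCL platforms are installed. For unattended benchmarking the
# device can be pinned explicitly (indices are machine-specific):
#
#   platform = cl.get_platforms()[0]      # first installed platform
#   device = platform.get_devices()[0]    # first device on that platform
#   ctx = cl.Context(devices=[device])
#
# or non-interactively via the PYOPENCL_CTX environment variable.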

# OpenCL Silly complete operation
def OpenCLSillyAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation is enqueued on this command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of the kernels under OpenCL
    prg = cl.Program(ctx, """
float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(a_g[gid])))))))))))))))) + MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(MySillyFunction(b_g[gid]))))))))))))))));
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Extraction of the "sillysum" kernel from the built program
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of the kernel defined above; wait() makes the timing cover the
    # actual execution, not just the enqueue.
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return(res_np)
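# Hedged sketch: the host-side timings above include queue latency; OpenCL
# event profiling gives device timestamps instead (in nanoseconds), provided
# the queue is created with profiling enabled:
#
#   queue = cl.CommandQueue(ctx,
#       properties=cl.command_queue_properties.PROFILING_ENABLE)
#   evt = knl(queue, a_np.shape, None, a_g, b_g, res_g)
#   evt.wait()
#   print("Kernel: %.3f ms" % ((evt.profile.end - evt.profile.start)*1e-6))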

if __name__=='__main__':

    # Size of the input vectors, taken from the command line
    try:
        SIZE=int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        SIZE=50000
        print("Size of vectors set to default size %i" % SIZE)

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn=time.time()
    # res_np=NativeAddition(a_np,b_np)
    res_np=NativeSillyAddition(a_np,b_np)
    NativeElapsed=time.time()-TimeIn
    NativeRate=int(SIZE/NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    TimeIn=time.time()
    # res_cl=OpenCLAddition(a_np,b_np)
    res_cl=OpenCLSillyAddition(a_np,b_np)
    OpenCLElapsed=time.time()-TimeIn
    OpenCLRate=int(SIZE/OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA Implementation
    TimeIn=time.time()
    # res_cuda=CUDAAddition(a_np,b_np)
    res_cuda=CUDASillyAddition(a_np,b_np)
    CUDAElapsed=time.time()-TimeIn
    CUDARate=int(SIZE/CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))

    # Check the OpenCL result against Numpy on CPU:
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    try:
        assert np.allclose(res_np, res_cl)
    except AssertionError:
        print("Results between Native & OpenCL seem to be too different!")

    # Check the CUDA result against Numpy on CPU:
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    try:
        assert np.allclose(res_np, res_cuda)
    except AssertionError:
        print("Results between Native & CUDA seem to be too different!")
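# Example invocation (vector size as the only argument):
#   python3 MySteps_5c.py 1048576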