Statistiques
| Révision :

root / ETSN / MySteps_3.py @ 288

Historique | Voir | Annoter | Télécharger (6,89 ko)

1
#!/usr/bin/env python3
2

    
3
import numpy as np
4
import pyopencl as cl
5

    
6
# Chain of 16 mutually inverse arithmetic functions: each pair
# (cos/arccos, sin/arcsin, ..., sqrt/pow-2) cancels out, so the whole
# pipeline is the identity wherever every intermediate stays in-domain.
# Used purely to generate arithmetic load for the benchmarks.
def MySillyFunction(x):
    """Return x passed through 16 stacked inverse-function pairs."""
    y = np.cos(x)
    y = np.arccos(y)
    y = np.sin(y)
    y = np.arcsin(y)
    y = np.tan(y)
    y = np.arctan(y)
    y = np.cosh(y)
    y = np.arccosh(y)
    y = np.sinh(y)
    y = np.arcsinh(y)
    y = np.tanh(y)
    y = np.arctanh(y)
    y = np.exp(y)
    y = np.log(y)
    y = np.sqrt(y)
    return np.power(y, 2)
9

    
10
# Reference elementwise addition done directly in Numpy
# (serves as the correctness/throughput baseline for the GPU paths).
def NativeAddition(a_np, b_np):
    """Return the elementwise sum of the two input arrays."""
    result = np.add(a_np, b_np)
    return result
13

    
14
# Same addition as NativeAddition, but each operand is first pushed
# through the (identity-equivalent) MySillyFunction chain in Numpy,
# mirroring what the OpenCL "sillysum" kernel does on the device.
def NativeSillyAddition(a_np, b_np):
    """Return MySillyFunction(a_np) + MySillyFunction(b_np)."""
    left = MySillyFunction(a_np)
    right = MySillyFunction(b_np)
    return left + right
17

    
18
# CUDA complete operation
def CUDAAddition(a_np, b_np):
    """Add two float32 vectors on a CUDA device and return the result.

    Fix: the original launched a single block of ``a_np.size`` threads
    (``block=(a_np.size,1,1)``), which fails for any vector longer than
    the hardware limit of 1024 threads per block — including this
    script's default SIZE of 50000. The kernel now uses a standard
    block/grid decomposition with an explicit bounds check, so any
    vector length is handled.
    """
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void sum(float *dest, float *a, float *b, int n)
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)          // guard: the last block may be only partially filled
    dest[i] = a[i] + b[i];
}
""")

    # Bind the compiled kernel (local name chosen so the builtin sum()
    # is not shadowed, as it was in the original).
    knl = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    threads_per_block = 256
    # Round up so every element is covered by exactly one thread.
    blocks = (a_np.size + threads_per_block - 1) // threads_per_block
    knl(drv.Out(res_np), drv.In(a_np), drv.In(b_np), numpy.int32(a_np.size),
        block=(threads_per_block, 1, 1), grid=(max(blocks, 1), 1))
    return(res_np)
39

    
40
# OpenCL complete operation
def OpenCLAddition(a_np, b_np):
    """Add two float32 vectors on an OpenCL device, printing per-stage timings.

    Relies on the module-level ``time`` import (placed lower in this
    file) having executed before the first call, which holds when the
    function is driven from the ``__main__`` section.

    Fix: the kernel launch is asynchronous, so the original's
    "Execution of kernel" figure measured only the enqueue call; we now
    wait on the returned event (exactly as OpenCLSillyAddition already
    does) so the printed figure covers the actual execution.
    """

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined; wait on the event so the
    # timing below covers execution, not just the (async) enqueue.
    CallCL = knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
105

    
106
# OpenCL complete operation
def OpenCLSillyAddition(a_np, b_np):
    """Elementwise "silly" addition of two float32 vectors on OpenCL.

    Each operand is pushed through a 16-deep chain of mutually inverse
    arithmetic functions inside the device kernel before being summed;
    the wall-clock time of every stage is printed along the way.
    """

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    t0 = time.time()
    # Copy from Host to Device using pointers
    flags = cl.mem_flags
    a_dev = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=a_np)
    b_dev = cl.Buffer(ctx, flags.READ_ONLY | flags.COPY_HOST_PTR, hostbuf=b_np)
    print("Copy from Host 2 Device : %.3f" % (time.time() - t0))

    t0 = time.time()
    # Definition of kernel under OpenCL (the plain "sum" kernel is kept
    # in the program source even though only "sillysum" is launched here)
    program = cl.Program(ctx, """

float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2)); 
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = MySillyFunction(a_g[gid]) + MySillyFunction(b_g[gid]);
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    print("Building kernels : %.3f" % (time.time() - t0))

    t0 = time.time()
    # Memory allocation on Device for result
    res_dev = cl.Buffer(ctx, flags.WRITE_ONLY, a_np.nbytes)
    print("Allocation on Device for results : %.3f" % (time.time() - t0))

    t0 = time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    kernel = program.sillysum  # Use this Kernel object for repeated calls
    print("Synthesis of kernel : %.3f" % (time.time() - t0))

    t0 = time.time()
    # Call of kernel previously defined, then block until it completes
    # so the printed time covers the execution itself
    event = kernel(queue, a_np.shape, None, a_dev, b_dev, res_dev)
    event.wait()
    print("Execution of kernel : %.3f" % (time.time() - t0))

    t0 = time.time()
    # Creation of vector for result with same size as input vectors
    res_host = np.empty_like(a_np)
    print("Allocation on Host for results: %.3f" % (time.time() - t0))

    t0 = time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_host, res_dev)
    print("Copy from Device 2 Host : %.3f" % (time.time() - t0))

    # Liberation of memory
    for buf in (a_dev, b_dev, res_dev):
        buf.release()

    return res_host
186

    
187
import sys
import time

if __name__=='__main__':

    # Size of input vectors taken from the command line (argv[1]);
    # fall back to a default when absent or not an integer.
    # Fix: the original used a bare `except:` (which also swallows
    # SystemExit/KeyboardInterrupt) and re-imported sys redundantly.
    try:
        SIZE = int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        SIZE = 50000
        print("Size of vectors set to default size %i" % SIZE)

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn = time.time()
    # res_np=NativeSillyAddition(a_np,b_np)
    res_np = NativeAddition(a_np, b_np)
    NativeElapsed = time.time() - TimeIn
    NativeRate = int(SIZE / NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    TimeIn = time.time()
    # res_cl=OpenCLSillyAddition(a_np,b_np)
    res_cl = OpenCLAddition(a_np, b_np)
    OpenCLElapsed = time.time() - TimeIn
    OpenCLRate = int(SIZE / OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA Implementation
    TimeIn = time.time()
    res_cuda = CUDAAddition(a_np, b_np)
    CUDAElapsed = time.time() - TimeIn
    CUDARate = int(SIZE / CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    print("OpenCLvsNative ratio: %f" % (OpenCLRate / NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate / NativeRate))

    # Check on CPU with Numpy:
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    assert np.allclose(res_np, res_cl)

    # Check on CPU with Numpy:
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    assert np.allclose(res_np, res_cuda)