MySteps_4.py @ 301

#!/usr/bin/env python3

import numpy as np
import pyopencl as cl

# piling 16 arithmetical functions
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
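
# Note (added comment): for inputs in [0, 1), as generated by np.random.rand
# below, every inverse in the chain undoes its partner, so MySillyFunction is
# the identity up to float32 rounding (accuracy degrades near 0, where
# arccos(cos(x)) loses precision). Its only purpose is to pile per-element
# arithmetic onto the addition. Illustrative sanity check, not in the original
# script:
#
#   x = np.random.rand(1000).astype(np.float32)
#   assert np.allclose(MySillyFunction(x), x, atol=1e-3)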

# Native Operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native Operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np):
    return(MySillyFunction(a_np)+MySillyFunction(b_np))

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;
  const int i = blockIdx.x;
  dest[i] = a[i] + b[i];
}
""")

    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)

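# Added sketch (not in the original file): the launch above uses one block per
# element with a single thread per block (block=(1,1,1), grid=(N,1)), which
# works but leaves most of each multiprocessor idle. A more conventional launch
# assigns many threads per block and guards against out-of-range indices. The
# function name CUDAAdditionBlocked and the 256-thread block size are
# illustrative choices, not part of the original benchmark.
def CUDAAdditionBlocked(a_np,b_np,threads_per_block=256):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b, int size)
{
  // Global index: one thread per element, many threads per block
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size)
    dest[i] = a[i] + b[i];
}
""")

    sum_knl = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    blocks = (a_np.size + threads_per_block - 1) // threads_per_block
    sum_knl(drv.Out(res_np), drv.In(a_np), drv.In(b_np), numpy.int32(a_np.size),
            block=(threads_per_block,1,1), grid=(blocks,1))
    return(res_np)
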
# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation is submitted through a command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Get the "sum" kernel object from the built program
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    knl(queue, a_np.shape, None, a_g, b_g, res_g)
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release device buffers
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)

# OpenCL complete operation with MySillyFunction
def OpenCLSillyAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation is submitted through a command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernels under OpenCL
    prg = cl.Program(ctx, """

float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = MySillyFunction(a_g[gid]) + MySillyFunction(b_g[gid]);
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Get the "sillysum" kernel object from the built program
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    # Wait for the kernel to complete so the timing below covers execution
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release device buffers
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
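
# Added note: OpenCLAddition and OpenCLSillyAddition each recreate the context,
# rebuild the program and reallocate the buffers on every call, so the rates
# printed below include all of this one-off overhead. To time the kernels
# alone, one could hoist ctx/queue/prg out of the functions and reuse them
# across calls (left as-is here to preserve the original structure).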

import sys
import time

if __name__=='__main__':

    # Vector size taken from the first command-line argument
    try:
        SIZE=int(sys.argv[1])
        print("Size of vectors set to %i" % SIZE)
    except (IndexError, ValueError):
        SIZE=50000
        print("Size of vectors set to default size %i" % SIZE)

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn=time.time()
    # res_np=NativeSillyAddition(a_np,b_np)
    res_np=NativeAddition(a_np,b_np)
    NativeElapsed=time.time()-TimeIn
    NativeRate=int(SIZE/NativeElapsed)
    print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    TimeIn=time.time()
    # res_cl=OpenCLSillyAddition(a_np,b_np)
    res_cl=OpenCLAddition(a_np,b_np)
    OpenCLElapsed=time.time()-TimeIn
    OpenCLRate=int(SIZE/OpenCLElapsed)
    print("OpenCLRate: %i" % OpenCLRate)

    # CUDA Implementation
    TimeIn=time.time()
    res_cuda=CUDAAddition(a_np,b_np)
    CUDAElapsed=time.time()-TimeIn
    CUDARate=int(SIZE/CUDAElapsed)
    print("CUDARate: %i" % CUDARate)

    print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))
    print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))

    # Check on OpenCL with Numpy:
    print(res_cl - res_np)
    print(np.linalg.norm(res_cl - res_np))
    try:
        assert np.allclose(res_np, res_cl)
    except AssertionError:
        print("Results between Native & OpenCL seem to be too different!")

    # Check on CUDA with Numpy:
    print(res_cuda - res_np)
    print(np.linalg.norm(res_cuda - res_np))
    try:
        assert np.allclose(res_np, res_cuda)
    except AssertionError:
        print("Results between Native & CUDA seem to be too different!")