#!/usr/bin/env python3

import numpy as np
import pyopencl as cl

# Piling up 16 arithmetical functions: the chain composes to (almost) the
# identity on [0,1), so each call only adds floating point work.
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
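# Optional sanity check (not executed here): for float32 inputs in [0,1) the
# pile should give back its argument up to rounding error, e.g.
#   x = np.random.rand(8).astype(np.float32)
#   assert np.allclose(MySillyFunction(x), x, atol=1e-3)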

# Native operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np,Calls):
    a=a_np.copy()
    b=b_np.copy()
    for i in range(Calls):
        a=MySillyFunction(a)
        b=MySillyFunction(b)

    return(a+b)

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b)
    {
      // const int i = threadIdx.x;
      const int i = blockIdx.x;
      dest[i] = a[i] + b[i];
    }
    """)

    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    # drv.In/drv.Out copy the host arrays to/from the device around the launch
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)

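# Note: CUDAAddition launches one single-thread block per element
# (block=(1,1,1), grid=(N,1)), which works for any N but leaves most of the
# GPU idle; CUDASillyAddition below adds a Threads parameter to pack several
# threads per block when the vector size is a multiple of it.
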
# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np,Calls,Threads):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""
    __device__ float MySillyFunction(float x)
    {
      return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
    }

    __global__ void sillysum(float *dest, float *a, float *b, int Calls)
    {
      const int i = blockDim.x*blockIdx.x+threadIdx.x;
      float ai=a[i];
      float bi=b[i];

      for (int c=0;c<Calls;c++)
      {
        ai=MySillyFunction(ai);
        bi=MySillyFunction(bi);
      }

      dest[i] = ai + bi;
    }
    """)
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    sillysum = mod.get_function("sillysum")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    res_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        print("Warning : %i is not a multiple of %i, falling back to 1 thread per block..." % (Size,Threads))
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    return(res_np)

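# Both OpenCL variants below follow the same sequence: create a context and a
# command queue, copy the operands into device buffers, build the kernel
# source, launch one work-item per element, copy the result back, then release
# the buffers. Each step is timed separately with time.time().
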
# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
    __kernel void sum(
        __global const float *a_g, __global const float *b_g, __global float *res_g)
    {
      int gid = get_global_id(0);
      res_g[gid] = a_g[gid] + b_g[gid];
    }
    """).build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined (one work-item per element)
    # Note: this only enqueues the kernel, so the timing below measures the
    # enqueue rather than the execution itself.
    knl(queue, a_np.shape, None, a_g, b_g, res_g)
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)

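# OpenCLSillyAddition differs from OpenCLAddition in two ways: the device is
# picked explicitly from the flat platform/device enumeration (using the
# module-level Device index set in the __main__ section below) instead of
# cl.create_some_context(), and profiling is enabled on the command queue.
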
# OpenCL Silly complete operation
def OpenCLSillyAddition(a_np,b_np,Calls):

    # Select the OpenCL device whose flat index matches the global Device
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
    # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernels under OpenCL
    prg = cl.Program(ctx, """

    float MySillyFunction(float x)
    {
      return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
    }

    __kernel void sillysum(
        __global const float *a_g, __global const float *b_g, __global float *res_g, int Calls)
    {
      int gid = get_global_id(0);
      float ai=a_g[gid];
      float bi=b_g[gid];

      for (int c=0;c<Calls;c++)
      {
        ai=MySillyFunction(ai);
        bi=MySillyFunction(bi);
      }

      res_g[gid] = ai + bi;
    }

    __kernel void sum(
        __global const float *a_g, __global const float *b_g, __global float *res_g)
    {
      int gid = get_global_id(0);
      res_g[gid] = a_g[gid] + b_g[gid];
    }
    """).build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined, then wait for completion
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g, np.uint32(Calls))
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)

# sys and time are imported at module level here (before the __main__ block),
# so the functions above can use them whether the script is run or imported.
import sys
import time

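# Example invocations (assuming PyOpenCL and/or PyCUDA are installed):
#   python3 MySteps_6.py -h                               (list detected devices)
#   python3 MySteps_6.py -g OpenCL -s 1048576 -d 0 -c 100
#   python3 MySteps_6.py -g CUDA -s 1048576 -c 100 -t 256
# Add -n to skip the (slow) serial Numpy reference computation.
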
if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0
    Calls=1
    Threads=1
    Serial=True

    import getopt

    HowToUse='%s -n -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId> -c <SillyCalls> -t <Threads>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hng:s:d:c:t:",["gpustyle=","size=","device=","calls=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}
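    # Devices stays empty here, so the Alu entries set below are never filled;
    # both are kept as placeholders for multi-device selection.
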
    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-t", "--threads"):
            Threads=int(arg)
        elif opt in ("-c", "--calls"):
            Calls=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt == '-n':
            Serial = False

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of vector : %i" % SIZE)
    print("Number of silly calls : %i" % Calls)
    print("Number of Threads : %i" % Threads)
    print("Serial compute : %i" % Serial)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    if Serial:
        TimeIn=time.time()
        res_np=NativeSillyAddition(a_np,b_np,Calls)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)

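    # Rates are expressed in processed elements per second; each GPU rate is
    # later divided by NativeRate to get a speed-up ratio over the Numpy run.
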
    # OpenCL Implementation
    if GpuStyle=='OpenCL' or GpuStyle=='all':
        TimeIn=time.time()
        # res_cl=OpenCLAddition(a_np,b_np)
        res_cl=OpenCLSillyAddition(a_np,b_np,Calls)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        # Check on OpenCL with Numpy:
        if Serial:
            print(res_cl - res_np)
            print(np.linalg.norm(res_cl - res_np))
            try:
                assert np.allclose(res_np, res_cl)
            except:
                print("Results between Native & OpenCL seem to be too different!")

            print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))

    # CUDA Implementation
    if GpuStyle=='CUDA' or GpuStyle=='all':
        TimeIn=time.time()
        res_cuda=CUDASillyAddition(a_np,b_np,Calls,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        # Check on CUDA with Numpy:
        if Serial:
            print(res_cuda - res_np)
            print(np.linalg.norm(res_cuda - res_np))
            try:
                assert np.allclose(res_np, res_cuda)
            except:
                print("Results between Native & CUDA seem to be too different!")

            print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))