#!/usr/bin/env python3

import numpy as np
import pyopencl as cl

# piling 16 arithmetical functions
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
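
# Note: each operation above is chained with its inverse (cos/arccos, sin/arcsin,
# cosh/arccosh, exp/log, sqrt/power 2...), so for inputs in (0,1) the pile should
# return x up to floating-point rounding; it only exists to generate arithmetic
# load whose result can still be checked against a plain addition.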

# Native Operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native Operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np,Calls):
    a=a_np.copy()
    b=b_np.copy()
    for i in range(Calls):
        a=MySillyFunction(a)
        b=MySillyFunction(b)

    return(a+b)
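
# NativeSillyAddition serves as the reference implementation: its output is later
# compared to the OpenCL/CUDA results with np.allclose, and its element rate
# (SIZE/elapsed) is the baseline for the *vsNative ratios printed in __main__.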

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;
  const int i = blockIdx.x;
  dest[i] = a[i] + b[i];
}
""")

    # sum = mod.get_function("sum")
    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)
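
# Reminder on the call above: drv.In/drv.Out let PyCUDA allocate the device
# buffers and copy data to/from the host around the launch, and the launch uses
# one block of one thread per element (block=(1,1,1), grid=(size,1)), which keeps
# the example minimal rather than fast. A sketch of an equivalent call with an
# explicit thread count (assuming a_np.size is a multiple of 256) could be:
#   sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
#       block=(256,1,1), grid=(a_np.size//256,1))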

# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np,Calls,Threads):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
  return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__global__ void sillysum(float *dest, float *a, float *b, int Calls)
{
  const int i = blockDim.x*blockIdx.x+threadIdx.x;
  float ai=a[i];
  float bi=b[i];

  for (int c=0;c<Calls;c++)
  {
    ai=MySillyFunction(ai);
    bi=MySillyFunction(bi);
  }

  dest[i] = ai + bi;
}
""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    sillysum = mod.get_function("sillysum")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    res_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        # Fall back to one thread per block when Size is not a multiple of Threads
        print("Impossible : %i not multiple of %i..." % (Size,Threads))
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    return(res_np)
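
# The Threads argument must divide the vector size to get the (Threads,1,1) /
# (Blocks,1) launch above; otherwise the code falls back to one thread per block.
# If in doubt, the hardware limit can be queried through PyCUDA (after cuda.init()
# or importing pycuda.autoinit), for instance:
#   import pycuda.driver as drv
#   drv.Device(0).get_attribute(drv.device_attribute.MAX_THREADS_PER_BLOCK)
# (device 0 here is just an example; typical limits are 512 or 1024 threads).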

# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    knl(queue, a_np.shape, None, a_g, b_g, res_g)
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
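
# OpenCLAddition relies on cl.create_some_context(), so PyOpenCL may prompt for
# (or read the PYOPENCL_CTX environment variable to choose) the platform/device;
# the buffers created with COPY_HOST_PTR are filled from the host arrays at
# creation time and the result is brought back with enqueue_copy. The timings
# printed here are host-side wall clock, not OpenCL event profiling.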

# OpenCL Silly complete operation
def OpenCLSillyAddition(a_np,b_np,Calls):

    # Select the device whose index matches the global Device set in __main__
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

float MySillyFunction(float x)
{
  return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g, int Calls)
{
  int gid = get_global_id(0);
  float ai=a_g[gid];
  float bi=b_g[gid];

  for (int c=0;c<Calls;c++)
  {
    ai=MySillyFunction(ai);
    bi=MySillyFunction(bi);
  }

  res_g[gid] = ai + bi;
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined, then wait for its completion
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g, np.uint32(Calls))
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results: %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Liberation of memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
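
# Unlike OpenCLAddition, this version picks the device explicitly from the global
# Device index set in __main__, enables profiling on the queue, and waits on the
# kernel event (CallCL.wait()) so the "Execution of kernel" timing covers the
# actual run and not just the enqueue. A sketch of reading the event's own timer
# instead of the host clock, if wanted, would be:
#   elapsed_s = (CallCL.profile.end - CallCL.profile.start) * 1e-9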

import sys
import time

if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0
    Calls=1
    Threads=1
    Serial=True

    import getopt

    HowToUse='%s -n -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId> -c <SillyCalls> -t <Threads>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hng:s:d:c:t:",["gpustyle=","size=","device=","calls=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-t", "--threads"):
            Threads=int(arg)
        elif opt in ("-c", "--calls"):
            Calls=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt == "-n":
            Serial = False

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)
    print("Number of silly calls : %i" % Calls)
    print("Number of Threads : %i" % Threads)
    print("Serial compute : %i" % Serial)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    if Serial:
        TimeIn=time.time()
        res_np=NativeSillyAddition(a_np,b_np,Calls)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)

    # OpenCL Implementation
    if GpuStyle=='OpenCL' or GpuStyle=='all':
        TimeIn=time.time()
        # res_cl=OpenCLAddition(a_np,b_np)
        res_cl=OpenCLSillyAddition(a_np,b_np,Calls)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        # Check on OpenCL with Numpy:
        if Serial:
            print(res_cl - res_np)
            print(np.linalg.norm(res_cl - res_np))
            try:
                assert np.allclose(res_np, res_cl)
            except:
                print("Results between Native & OpenCL seem to be too different!")

            print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))

    # CUDA Implementation
    if GpuStyle=='CUDA' or GpuStyle=='all':
        TimeIn=time.time()
        res_cuda=CUDASillyAddition(a_np,b_np,Calls,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        # Check on CUDA with Numpy:
        if Serial:
            print(res_cuda - res_np)
            print(np.linalg.norm(res_cuda - res_np))
            try:
                assert np.allclose(res_np, res_cuda)
            except:
                print("Results between Native & CUDA seem to be too different!")

            print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))
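
# Example invocations (device ids and thread counts below are only illustrative
# values and depend on your machine):
#   python3 MySteps_6.py -h                                     # list OpenCL & CUDA devices
#   python3 MySteps_6.py -g OpenCL -d 0 -s 1048576 -c 100
#   python3 MySteps_6.py -g CUDA -s 1048576 -c 100 -t 256
#   python3 MySteps_6.py -n -g CUDA -s 1048576 -c 100 -t 256    # -n skips the Numpy reference run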