root / ETSN / MySteps_6.py @ 277

#!/usr/bin/env python3

import sys
import time

import numpy as np
import pyopencl as cl

# Pile up 16 arithmetical functions: each one is undone by its inverse,
# so the chain is numerically the identity for inputs in [0,1) while
# still costing 16 transcendental evaluations per element.
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
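
# Quick sanity check (a sketch, not part of the benchmark): the pile should
# return its input almost unchanged, e.g. MySillyFunction(np.float32(0.5))
# is ~0.5 up to the float32 rounding accumulated across the 16 calls.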

# Native Operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native Operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np,Calls):
    a=a_np.copy()
    b=b_np.copy()
    for i in range(Calls):
        a=MySillyFunction(a)
        b=MySillyFunction(b)

    return(a+b)
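
# Usage sketch: with Calls=1 the silly version should roughly match plain
# addition, i.e. np.allclose(NativeSillyAddition(a_np, b_np, 1),
# NativeAddition(a_np, b_np)) for float32 vectors drawn from [0,1),
# up to rounding in the function pile.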

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
__global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;
  const int i = blockIdx.x;  // one block of one thread per element
  dest[i] = a[i] + b[i];
}
""")

    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)
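
# Usage sketch (assumes a CUDA-capable GPU and PyCUDA installed):
#   res = CUDAAddition(a_np, b_np)   # elementwise a_np + b_np on the GPU,
#                                    # launched with one block per element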

# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np,Calls,Threads):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__global__ void sillysum(float *dest, float *a, float *b, int Calls)
{
  const int i = blockDim.x*blockIdx.x+threadIdx.x;
  float ai=a[i];
  float bi=b[i];

  for (int c=0;c<Calls;c++)
  {
    ai=MySillyFunction(ai);
    bi=MySillyFunction(bi);
  }

  dest[i] = ai + bi;
}
""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    sillysum = mod.get_function("sillysum")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    res_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        # Fall back to one thread per block when Size is not a multiple of Threads
        print("Impossible : %i not a multiple of %i, falling back to 1 thread per block..." % (Size,Threads))
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn

    print("Execution of kernel : %.3f" % Elapsed)
    return(res_np)
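
# Work decomposition note: when Size is a multiple of Threads, the launch
# uses Blocks=Size/Threads blocks of Threads threads, and the kernel index
# blockDim.x*blockIdx.x+threadIdx.x covers 0..Size-1 exactly once each;
# e.g. Size=1024 with Threads=256 gives Blocks=4 and indices 0..1023.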

# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every operation is enqueued on the device through a command queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined; wait for completion so the
    # timing covers the execution itself, not just the enqueue
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release of device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
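
# Usage sketch: create_some_context() picks the device interactively or via
# the PYOPENCL_CTX environment variable, so OpenCLAddition(a_np, b_np)
# returns a_np + b_np computed on whichever device was selected.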

# OpenCL Silly complete operation
def OpenCLSillyAddition(a_np,b_np,Calls):

    # Select the OpenCL device whose flat index matches the global Device
    # (parsed from the command line in __main__)
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")
        sys.exit(1)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernels under OpenCL
    prg = cl.Program(ctx, """

float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g, int Calls)
{
  int gid = get_global_id(0);
  float ai=a_g[gid];
  float bi=b_g[gid];

  for (int c=0;c<Calls;c++)
  {
    ai=MySillyFunction(ai);
    bi=MySillyFunction(bi);
  }

  res_g[gid] = ai + bi;
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g, np.uint32(Calls))
    # Wait for completion so the timing covers the execution itself
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release of device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
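
# Note: the queue is created with PROFILING_ENABLE, so the kernel event's
# profiling counters (CallCL.profile.start / CallCL.profile.end, in ns)
# could time the kernel device-side; the script keeps host-side time.time().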

if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0
    Calls=1
    Threads=1

    import getopt

    HowToUse='%s -g <CUDA/OpenCL/all> -s <SizeOfVector> -d <DeviceId> -c <SillyCalls> -t <Threads>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:c:t:",["gpustyle=","size=","device=","calls=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-t", "--threads"):
            Threads=int(arg)
        elif opt in ("-c", "--calls"):
            Calls=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
352 276 equemene
    print("GpuStyle used : %s" % GpuStyle)
353 276 equemene
    print("Size of complex vector : %i" % SIZE)
354 276 equemene
    print("Number of silly calls : %i" % Calls)
355 276 equemene
    print("Number of Threads : %i" % Threads)
356 276 equemene
357 276 equemene
    if GpuStyle=='CUDA':
358 276 equemene
        try:
359 276 equemene
            # For PyCUDA import
360 276 equemene
            import pycuda.driver as cuda
361 276 equemene
362 276 equemene
            cuda.init()
363 276 equemene
            for Id in range(cuda.Device.count()):
364 276 equemene
                device=cuda.Device(Id)
365 276 equemene
                print("Device #%i of type GPU : %s" % (Id,device.name()))
366 276 equemene
                if Id in Devices:
367 276 equemene
                    Alu[Id]='GPU'
368 276 equemene
369 276 equemene
        except ImportError:
370 276 equemene
            print("Platform does not seem to support CUDA")
371 276 equemene
372 276 equemene
    if GpuStyle=='OpenCL':
373 276 equemene
        try:
374 276 equemene
            # For PyOpenCL import
375 276 equemene
            import pyopencl as cl
376 276 equemene
            Id=0
377 276 equemene
            for platform in cl.get_platforms():
378 276 equemene
                for device in platform.get_devices():
379 276 equemene
                    #deviceType=cl.device_type.to_string(device.type)
380 276 equemene
                    deviceType="xPU"
381 276 equemene
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))
382 276 equemene
383 276 equemene
                    if Id in Devices:
384 276 equemene
                    # Set the Alu as detected Device Type
385 276 equemene
                        Alu[Id]=deviceType
386 276 equemene
                    Id=Id+1
387 276 equemene
        except ImportError:
388 276 equemene
            print("Platform does not seem to support OpenCL")
389 276 equemene
390 276 equemene
    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn=time.time()
    res_np=NativeSillyAddition(a_np,b_np,Calls)
    NativeElapsed=time.time()-TimeIn
    NativeRate=int(SIZE/NativeElapsed)
    print("NativeRate: %i" % NativeRate)
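
    # The rates below count processed elements per second (SIZE / elapsed),
    # so an XPUvsNative ratio above 1 means the device beats the Numpy run.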

    # OpenCL Implementation
    if GpuStyle=='OpenCL' or GpuStyle=='all':
        TimeIn=time.time()
        # res_cl=OpenCLAddition(a_np,b_np)
        res_cl=OpenCLSillyAddition(a_np,b_np,Calls)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        # Check OpenCL results against Numpy
        print(res_cl - res_np)
        print(np.linalg.norm(res_cl - res_np))
        try:
            assert np.allclose(res_np, res_cl)
        except AssertionError:
            print("Results between Native & OpenCL seem to be too different!")

        print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))

    # CUDA Implementation
    if GpuStyle=='CUDA' or GpuStyle=='all':
        TimeIn=time.time()
        res_cuda=CUDASillyAddition(a_np,b_np,Calls,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        # Check CUDA results against Numpy
        print(res_cuda - res_np)
        print(np.linalg.norm(res_cuda - res_np))
        try:
            assert np.allclose(res_np, res_cuda)
        except AssertionError:
            print("Results between Native & CUDA seem to be too different!")

        print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))
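
# Example invocations (device ids and sizes are illustrative):
#   python3 MySteps_6.py -h
#   python3 MySteps_6.py -g OpenCL -d 0 -s 1048576 -c 100
#   python3 MySteps_6.py -g CUDA -d 0 -s 1048576 -c 100 -t 256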