Revision 276

ETSN/MySteps_6.py (revision 276)

#!/usr/bin/env python3

import numpy as np
import pyopencl as cl

# Pile up 16 arithmetic functions
def MySillyFunction(x):
    return(np.power(np.sqrt(np.log(np.exp(np.arctanh(np.tanh(np.arcsinh(np.sinh(np.arccosh(np.cosh(np.arctan(np.tan(np.arcsin(np.sin(np.arccos(np.cos(x))))))))))))))),2))
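# Each operation above is immediately undone by the next one (cos then arccos, ...,
# exp then log, sqrt then squaring), so the result stays numerically close to x:
# the chain only serves as a fixed arithmetic workload per element.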

# Native Operation under Numpy (for prototyping & tests)
def NativeAddition(a_np,b_np):
    return(a_np+b_np)

# Native Operation with MySillyFunction under Numpy (for prototyping & tests)
def NativeSillyAddition(a_np,b_np,Calls):
    a=a_np.copy()
    b=b_np.copy()
    for i in range(Calls):
        a=MySillyFunction(a)
        b=MySillyFunction(b)

    return(a+b)
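# NativeSillyAddition is the CPU reference implementation: the __main__ block below
# checks the OpenCL and CUDA results against it with np.allclose.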

# CUDA complete operation
def CUDAAddition(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    mod = SourceModule("""
    __global__ void sum(float *dest, float *a, float *b)
{
  // const int i = threadIdx.x;
  const int i = blockIdx.x;
  dest[i] = a[i] + b[i];
}
""")

    # sum = mod.get_function("sum")
    sum = mod.get_function("sum")

    res_np = numpy.zeros_like(a_np)
    sum(drv.Out(res_np), drv.In(a_np), drv.In(b_np),
        block=(1,1,1), grid=(a_np.size,1))
    return(res_np)
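# Note on the launch configuration above: block=(1,1,1) with grid=(a_np.size,1) runs
# a single thread per block, so blockIdx.x directly indexes the element; it is simple
# but leaves most of each multiprocessor idle.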

# CUDA Silly complete operation
def CUDASillyAddition(a_np,b_np,Calls,Threads):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""
__device__ float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__global__ void sillysum(float *dest, float *a, float *b, int Calls)
{
  const int i = blockDim.x*blockIdx.x+threadIdx.x;
  float ai=a[i];
  float bi=b[i];

  for (int c=0;c<Calls;c++)
  {
    ai=MySillyFunction(ai);
    bi=MySillyFunction(bi);
  }

  dest[i] = ai + bi;
}
""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # sum = mod.get_function("sum")
    sillysum = mod.get_function("sillysum")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    res_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        print("Impossible : %i is not a multiple of %i, falling back to 1 thread per block..." % (Size,Threads))
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        sillysum(drv.Out(res_np), drv.In(a_np), drv.In(b_np), np.uint32(Calls),
                 block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    return(res_np)
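# In sillysum the global element index is blockDim.x*blockIdx.x + threadIdx.x, so the
# launch uses Threads threads per block and Size/Threads blocks; Size must be a
# multiple of Threads, otherwise the 1-thread-per-block fallback above is used.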

# OpenCL complete operation
def OpenCLAddition(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """
__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sum" inside Kernel Sources
    knl = prg.sum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    knl(queue, a_np.shape, None, a_g, b_g, res_g)
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
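# In OpenCLAddition the kernel is enqueued with the global work size a_np.shape and
# a local work size of None, which lets the OpenCL runtime choose the work-group size.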

  
# OpenCL Silly complete operation
def OpenCLSillyAddition(a_np,b_np,Calls):

    # Select the OpenCL device whose flat index (counted across all platforms)
    # matches the module-level Device variable set in the __main__ block
    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

float MySillyFunction(float x)
{
    return(pow(sqrt(log(exp(atanh(tanh(asinh(sinh(acosh(cosh(atan(tan(asin(sin(acos(cos(x))))))))))))))),2));
}

__kernel void sillysum(
    __global const float *a_g, __global const float *b_g, __global float *res_g, int Calls)
{
  int gid = get_global_id(0);
  float ai=a_g[gid];
  float bi=b_g[gid];

  for (int c=0;c<Calls;c++)
  {
    ai=MySillyFunction(ai);
    bi=MySillyFunction(bi);
  }

  res_g[gid] = ai + bi;
}

__kernel void sum(
    __global const float *a_g, __global const float *b_g, __global float *res_g)
{
  int gid = get_global_id(0);
  res_g[gid] = a_g[gid] + b_g[gid];
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Device for result
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "sillysum" inside Kernel Sources
    knl = prg.sillysum  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, res_g, np.uint32(Calls))
    # Wait for the kernel to finish so the timing covers the real execution
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Creation of vector for result with same size as input vectors
    res_np = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, res_np, res_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    # Release device memory
    a_g.release()
    b_g.release()
    res_g.release()

    return(res_np)
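# Unlike OpenCLAddition above, this variant selects its device explicitly from the
# module-level Device index, creates its queue with PROFILING_ENABLE, and waits on
# the kernel event, so the reported execution time covers the actual device work
# rather than only the (asynchronous) enqueue.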

# These module-level imports are used by the functions defined above; they are bound
# when the module is executed, before the __main__ block calls those functions.
import sys
import time

if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0
    Calls=1
    Threads=1

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId> -c <SillyCalls> -t <Threads>'
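    # Example invocations (illustrative, assuming the script is saved as MySteps_6.py):
    #   python3 MySteps_6.py -h                                 # list OpenCL/CUDA devices
    #   python3 MySteps_6.py -g OpenCL -d 0 -s 1048576 -c 100
    #   python3 MySteps_6.py -g CUDA -d 0 -s 1048576 -c 100 -t 256
    #   python3 MySteps_6.py -g all -s 1048576 -c 100 -t 256    # run both back ends and compare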

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:c:t:",["gpustyle=","size=","device=","calls=","threads="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-t", "--threads"):
            Threads=int(arg)
        elif opt in ("-c", "--calls"):
            Calls=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of vector : %i" % SIZE)
    print("Number of silly calls : %i" % Calls)
    print("Number of Threads : %i" % Threads)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.random.rand(SIZE).astype(np.float32)
    b_np = np.random.rand(SIZE).astype(np.float32)

    # Native Implementation
    TimeIn=time.time()
    res_np=NativeSillyAddition(a_np,b_np,Calls)
    NativeElapsed=time.time()-TimeIn
    NativeRate=int(SIZE/NativeElapsed)
    print("NativeRate: %i" % NativeRate)
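    # NativeRate only times the Numpy computation, whereas the OpenCL and CUDA rates
    # below time the whole helper call, including kernel build and host/device
    # transfers, so small vector sizes understate the GPU throughput.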

  
    # OpenCL Implementation
    if GpuStyle=='OpenCL' or GpuStyle=='all':
        TimeIn=time.time()
        # res_cl=OpenCLAddition(a_np,b_np)
        res_cl=OpenCLSillyAddition(a_np,b_np,Calls)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        # Check on OpenCL with Numpy:
        print(res_cl - res_np)
        print(np.linalg.norm(res_cl - res_np))
        try:
            assert np.allclose(res_np, res_cl)
        except:
            print("Results between Native & OpenCL seem to be too different!")

        print("OpenCLvsNative ratio: %f" % (OpenCLRate/NativeRate))

    # CUDA Implementation
    if GpuStyle=='CUDA' or GpuStyle=='all':
        TimeIn=time.time()
        res_cuda=CUDASillyAddition(a_np,b_np,Calls,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        # Check on CUDA with Numpy:
        print(res_cuda - res_np)
        print(np.linalg.norm(res_cuda - res_np))
        try:
            assert np.allclose(res_np, res_cuda)
        except:
            print("Results between Native & CUDA seem to be too different!")

        print("CUDAvsNative ratio: %f" % (CUDARate/NativeRate))