Revision 274

ETSN/MyDFT_6.py (revision 274)
#!/usr/bin/env python3

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# OpenCL complete operation
def OpenCLDFT(a_np,b_np):

    # Context creation
    ctx = cl.create_some_context()
    # Every process is stored in a queue
    queue = cl.CommandQueue(ctx)

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    return(A_ocl,B_ocl)

# CUDA complete operation
def CUDADFT(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv
    import numpy

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = numpy.zeros_like(a_np)
    B_np = numpy.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)
    return(A_np,B_np)

import sys
import time

if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:",["gpustyle=","size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # # Native & Naive Implementation
    # print("Performing naive implementation")
    # TimeIn=time.time()
    # c_np,d_np=MyDFT(a_np,b_np)
    # NativeElapsed=time.time()-TimeIn
    # NativeRate=int(SIZE/NativeElapsed)
    # print("NativeRate: %i" % NativeRate)
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))

    # # Native & Numpy Implementation
    # print("Performing Numpy implementation")
    # TimeIn=time.time()
    # e_np,f_np=NumpyDFT(a_np,b_np)
    # NumpyElapsed=time.time()-TimeIn
    # NumpyRate=int(SIZE/NumpyElapsed)
    # print("NumpyRate: %i" % NumpyRate)
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))

    # # Native & Numba Implementation
    # print("Performing Numba implementation")
    # TimeIn=time.time()
    # g_np,h_np=NumbaDFT(a_np,b_np)
    # NumbaElapsed=time.time()-TimeIn
    # NumbaRate=int(SIZE/NumbaElapsed)
    # print("NumbaRate: %i" % NumbaRate)
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))

    # # OpenCL Implementation
    # print("Performing OpenCL implementation")
    # TimeIn=time.time()
    # i_np,j_np=OpenCLDFT(a_np,b_np)
    # OpenCLElapsed=time.time()-TimeIn
    # OpenCLRate=int(SIZE/OpenCLElapsed)
    # print("OpenCLRate: %i" % OpenCLRate)
    # print("Precision: ",np.linalg.norm(i_np-C_np),np.linalg.norm(j_np-D_np))

    # # CUDA Implementation
    # print("Performing CUDA implementation")
    # TimeIn=time.time()
    # k_np,l_np=CUDADFT(a_np,b_np)
    # CUDAElapsed=time.time()-TimeIn
    # CUDARate=int(SIZE/CUDAElapsed)
    # print("CUDARate: %i" % CUDARate)
    # print("Precision: ",np.linalg.norm(k_np-C_np),np.linalg.norm(l_np-D_np))
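
All three host implementations above compute X[k] + i*Y[k] = sum_j (x[j] + i*y[j]) * exp(+2*pi*i*j*k/N), i.e. N times numpy's inverse FFT, which is why the expected result for the all-ones test vectors is SIZE in bin 0 and zero elsewhere. A minimal cross-check sketch, assuming the file is importable as MyDFT_6 (pyopencl and numba must be installed since they are imported at module level):

    import numpy as np
    from MyDFT_6 import MyDFT
    x = np.ones(16).astype(np.float32)
    y = np.ones(16).astype(np.float32)
    X, Y = MyDFT(x, y)
    ref = x.size * np.fft.ifft(x + 1j*y)   # same exponent sign as the loops above
    print(np.allclose(X, ref.real, atol=1e-3), np.allclose(Y, ref.imag, atol=1e-3))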

  
ETSN/MyDFT_7.py (revision 274)
#!/usr/bin/env python3

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)

# CUDA complete operation
def CUDADFT(a_np,b_np):
    import pycuda.autoinit
    import pycuda.driver as drv

    from pycuda.compiler import SourceModule
    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)
    return(A_np,B_np)

import sys
import time

if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:",["gpustyle=","size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # # Native & Naive Implementation
    # print("Performing naive implementation")
    # TimeIn=time.time()
    # c_np,d_np=MyDFT(a_np,b_np)
    # NativeElapsed=time.time()-TimeIn
    # NativeRate=int(SIZE/NativeElapsed)
    # print("NativeRate: %i" % NativeRate)
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))

    # # Native & Numpy Implementation
    # print("Performing Numpy implementation")
    # TimeIn=time.time()
    # e_np,f_np=NumpyDFT(a_np,b_np)
    # NumpyElapsed=time.time()-TimeIn
    # NumpyRate=int(SIZE/NumpyElapsed)
    # print("NumpyRate: %i" % NumpyRate)
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))

    # # Native & Numba Implementation
    # print("Performing Numba implementation")
    # TimeIn=time.time()
    # g_np,h_np=NumbaDFT(a_np,b_np)
    # NumbaElapsed=time.time()-TimeIn
    # NumbaRate=int(SIZE/NumbaElapsed)
    # print("NumbaRate: %i" % NumbaRate)
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if GpuStyle=='OpenCL':
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # # CUDA Implementation
    # print("Performing CUDA implementation")
    # TimeIn=time.time()
    # k_np,l_np=CUDADFT(a_np,b_np)
    # CUDAElapsed=time.time()-TimeIn
    # CUDARate=int(SIZE/CUDAElapsed)
    # print("CUDARate: %i" % CUDARate)
    # print("Precision: ",np.linalg.norm(k_np-C_np),np.linalg.norm(l_np-D_np))
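
The -d value consumed by OpenCLDFT above is a flat index over every device of every OpenCL platform, in the same order as the listing printed by -h. A minimal sketch of that enumeration, assuming only that pyopencl is installed:

    import pyopencl as cl
    flat = []
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            flat.append(device)            # position in this list == the -d index
    print(["%s / %s" % (d.platform.name, d.name) for d in flat])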

  
ETSN/MyDFT_8.py (revision 274)
#!/usr/bin/env python3

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)

# CUDA complete operation
def CUDADFT(a_np,b_np,Device):
    # import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        print()

    except ImportError:
        print("Platform does not seem to support CUDA")

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
          block=(1,1,1), grid=(a_np.size,1))
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)

import sys
import time

if __name__=='__main__':

    GpuStyle='OpenCL'
    SIZE=1024
    Device=0

    import getopt

    HowToUse='%s -g <CUDA/OpenCL> -s <SizeOfVector> -d <DeviceId>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hg:s:d:",["gpustyle=","size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-g", "--gpustyle"):
            GpuStyle = arg
        elif opt in ("-s", "--size"):
            SIZE = int(arg)

    print("Device Selection : %i" % Device)
    print("GpuStyle used : %s" % GpuStyle)
    print("Size of complex vector : %i" % SIZE)

    if GpuStyle=='CUDA':
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if GpuStyle=='OpenCL':
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # # Native & Naive Implementation
    # print("Performing naive implementation")
    # TimeIn=time.time()
    # c_np,d_np=MyDFT(a_np,b_np)
    # NativeElapsed=time.time()-TimeIn
    # NativeRate=int(SIZE/NativeElapsed)
    # print("NativeRate: %i" % NativeRate)
    # print("Precision: ",np.linalg.norm(c_np-C_np),np.linalg.norm(d_np-D_np))

    # # Native & Numpy Implementation
    # print("Performing Numpy implementation")
    # TimeIn=time.time()
    # e_np,f_np=NumpyDFT(a_np,b_np)
    # NumpyElapsed=time.time()-TimeIn
    # NumpyRate=int(SIZE/NumpyElapsed)
    # print("NumpyRate: %i" % NumpyRate)
    # print("Precision: ",np.linalg.norm(e_np-C_np),np.linalg.norm(f_np-D_np))

    # # Native & Numba Implementation
    # print("Performing Numba implementation")
    # TimeIn=time.time()
    # g_np,h_np=NumbaDFT(a_np,b_np)
    # NumbaElapsed=time.time()-TimeIn
    # NumbaRate=int(SIZE/NumbaElapsed)
    # print("NumbaRate: %i" % NumbaRate)
    # print("Precision: ",np.linalg.norm(g_np-C_np),np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if GpuStyle=='OpenCL':
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if GpuStyle=='CUDA':
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))
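
Unlike the previous versions, the CUDA path above does not rely on pycuda.autoinit: it picks the device selected by -d, pushes its own context with make_context(), and pops/detaches it once the kernel has run. A minimal sketch of that lifecycle, assuming pycuda and a CUDA device 0 are available:

    import pycuda.driver as cuda
    cuda.init()
    dev = cuda.Device(0)          # device id, as chosen by -d
    ctx = dev.make_context()      # push a context on the current thread
    try:
        pass                      # compile SourceModule / launch kernels here
    finally:
        ctx.pop()                 # release the context, as CUDADFT does
        ctx.detach()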

  
ETSN/MyDFT_9.py (revision 274)
#!/usr/bin/env python3

import numpy as np
import pyopencl as cl
from numpy import pi,cos,sin

# Naive Discrete Fourier Transform
def MyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    for i in range(size):
        for j in range(size):
            X[i]=X[i]+x[j]*cos(2.*pi*i*j/size)-y[j]*sin(2.*pi*i*j/size)
            Y[i]=Y[i]+x[j]*sin(2.*pi*i*j/size)+y[j]*cos(2.*pi*i*j/size)
    return(X,Y)

# Numpy Discrete Fourier Transform
def NumpyDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in range(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# Numba Discrete Fourier Transform
import numba
@numba.njit(parallel=True)
def NumbaDFT(x,y):
    size=x.shape[0]
    X=np.zeros(size).astype(np.float32)
    Y=np.zeros(size).astype(np.float32)
    nj=np.multiply(2.0*np.pi/size,np.arange(size)).astype(np.float32)
    for i in numba.prange(size):
        X[i]=np.sum(np.subtract(np.multiply(np.cos(i*nj),x),np.multiply(np.sin(i*nj),y)))
        Y[i]=np.sum(np.add(np.multiply(np.sin(i*nj),x),np.multiply(np.cos(i*nj),y)))
    return(X,Y)

# OpenCL complete operation
def OpenCLDFT(a_np,b_np,Device):

    Id=0
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print("CPU/GPU selected: ",device.name.lstrip())
                HasXPU=True
            Id+=1
            # print(Id)

    if HasXPU==False:
        print("No XPU #%i found in all of %i devices, sorry..." % (Device,Id))
        sys.exit()

    try:
        ctx = cl.Context(devices=[XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print("Crash during context creation")

    TimeIn=time.time()
    # Copy from Host to Device using pointers
    mf = cl.mem_flags
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_np)
    b_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b_np)
    Elapsed=time.time()-TimeIn
    print("Copy from Host 2 Device : %.3f" % Elapsed)

    TimeIn=time.time()
    # Definition of kernel under OpenCL
    prg = cl.Program(ctx, """

#define PI 3.141592653589793

__kernel void MyDFT(
    __global const float *a_g, __global const float *b_g, __global float *A_g, __global float *B_g)
{
  int gid = get_global_id(0);
  uint size = get_global_size(0);
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}
""").build()
    Elapsed=time.time()-TimeIn
    print("Building kernels : %.3f" % Elapsed)

    TimeIn=time.time()
    # Memory allocation on Host for results
    A_ocl = np.empty_like(a_np)
    B_ocl = np.empty_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    A_g = cl.Buffer(ctx, mf.WRITE_ONLY, A_ocl.nbytes)
    B_g = cl.Buffer(ctx, mf.WRITE_ONLY, B_ocl.nbytes)
    Elapsed=time.time()-TimeIn
    print("Allocation on Device for results : %.3f" % Elapsed)

    TimeIn=time.time()
    # Synthesis of function "MyDFT" inside Kernel Sources
    knl = prg.MyDFT  # Use this Kernel object for repeated calls
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Call of kernel previously defined
    CallCL=knl(queue, a_np.shape, None, a_g, b_g, A_g, B_g)
    CallCL.wait()
    Elapsed=time.time()-TimeIn
    print("Execution of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    # Copy from Device to Host
    cl.enqueue_copy(queue, A_ocl, A_g)
    cl.enqueue_copy(queue, B_ocl, B_g)
    Elapsed=time.time()-TimeIn
    print("Copy from Device 2 Host : %.3f" % Elapsed)

    a_g.release()
    b_g.release()
    A_g.release()
    B_g.release()

    return(A_ocl,B_ocl)

# CUDA complete operation
def CUDADFT(a_np,b_np,Device,Threads):
    # import pycuda.autoinit
    import pycuda.driver as drv
    from pycuda.compiler import SourceModule

    try:
        # For PyCUDA import
        import pycuda.driver as cuda
        from pycuda.compiler import SourceModule

        cuda.init()
        for Id in range(cuda.Device.count()):
            if Id==Device:
                XPU=cuda.Device(Id)
                print("GPU selected %s" % XPU.name())
        print()

    except ImportError:
        print("Platform does not seem to support CUDA")

    Context=XPU.make_context()

    TimeIn=time.time()
    mod = SourceModule("""

#define PI 3.141592653589793

__global__ void MyDFT(float *A_g, float *B_g, const float *a_g,const float *b_g)
{
  const int gid = blockIdx.x;
  uint size = gridDim.x;
  float A=0.,B=0.;
  for (uint i=0; i<size;i++)
  {
     A+=a_g[i]*cos(2.*PI*(float)(gid*i)/(float)size)-b_g[i]*sin(2.*PI*(float)(gid*i)/(float)size);
     B+=a_g[i]*sin(2.*PI*(float)(gid*i)/(float)size)+b_g[i]*cos(2.*PI*(float)(gid*i)/(float)size);
  }
  A_g[gid]=A;
  B_g[gid]=B;
}

""")
    Elapsed=time.time()-TimeIn
    print("Definition of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    MyDFT = mod.get_function("MyDFT")
    Elapsed=time.time()-TimeIn
    print("Synthesis of kernel : %.3f" % Elapsed)

    TimeIn=time.time()
    A_np = np.zeros_like(a_np)
    B_np = np.zeros_like(a_np)
    Elapsed=time.time()-TimeIn
    print("Allocation on Host for results : %.3f" % Elapsed)

    Size=a_np.size
    if (Size % Threads != 0):
        print("Impossible : %i not multiple of %i..." % (Size,Threads) )
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(1,1,1), grid=(a_np.size,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)
    else:
        Blocks=int(Size/Threads)
        TimeIn=time.time()
        MyDFT(drv.Out(A_np), drv.Out(B_np), drv.In(a_np), drv.In(b_np),
              block=(Threads,1,1), grid=(Blocks,1))
        Elapsed=time.time()-TimeIn
        print("Execution of kernel : %.3f" % Elapsed)

    Context.pop()
    Context.detach()

    return(A_np,B_np)

import sys
import time

if __name__=='__main__':

    SIZE=1024
    Device=0
    NaiveMethod=False
    NumpyMethod=True
    NumbaMethod=False
    OpenCLMethod=True
    CUDAMethod=False
    Threads=1

    import getopt

    HowToUse='%s -n [Naive] -y [numpY] -a [numbA] -o [OpenCL] -c [CUDA] -s <SizeOfVector> -d <DeviceId> -t <threads>'

    try:
        opts, args = getopt.getopt(sys.argv[1:],"nyaochs:d:t:",["size=","device="])
    except getopt.GetoptError:
        print(HowToUse % sys.argv[0])
        sys.exit(2)

    # List of Devices
    Devices=[]
    Alu={}

    for opt, arg in opts:
        if opt == '-h':
            print(HowToUse % sys.argv[0])

            print("\nInformation about devices detected under OpenCL API:")
            # For PyOpenCL import
            try:
                import pyopencl as cl
                Id=0
                for platform in cl.get_platforms():
                    for device in platform.get_devices():
                        #deviceType=cl.device_type.to_string(device.type)
                        deviceType="xPU"
                        print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip(),deviceType,device.name.lstrip()))
                        Id=Id+1

            except:
                print("Your platform does not seem to support OpenCL")

            print("\nInformation about devices detected under CUDA API:")
            # For PyCUDA import
            try:
                import pycuda.driver as cuda
                cuda.init()
                for Id in range(cuda.Device.count()):
                    device=cuda.Device(Id)
                    print("Device #%i of type GPU : %s" % (Id,device.name()))
                print()
            except:
                print("Your platform does not seem to support CUDA")

            sys.exit()

        elif opt in ("-d", "--device"):
            Device=int(arg)
        elif opt in ("-s", "--size"):
            SIZE = int(arg)
        elif opt in ("-t", "--threads"):
            Threads = int(arg)
        elif opt in ("-n"):
            NaiveMethod=True
        elif opt in ("-y"):
            NumpyMethod=True
        elif opt in ("-a"):
            NumbaMethod=True
        elif opt in ("-o"):
            OpenCLMethod=True
        elif opt in ("-c"):
            CUDAMethod=True

    print("Device Selection : %i" % Device)
    print("Size of complex vector : %i" % SIZE)
    print("DFT Naive computation %s " % NaiveMethod )
    print("DFT Numpy computation %s " % NumpyMethod )
    print("DFT Numba computation %s " % NumbaMethod )
    print("DFT OpenCL computation %s " % OpenCLMethod )
    print("DFT CUDA computation %s " % CUDAMethod )

    if CUDAMethod:
        try:
            # For PyCUDA import
            import pycuda.driver as cuda

            cuda.init()
            for Id in range(cuda.Device.count()):
                device=cuda.Device(Id)
                print("Device #%i of type GPU : %s" % (Id,device.name()))
                if Id in Devices:
                    Alu[Id]='GPU'

        except ImportError:
            print("Platform does not seem to support CUDA")

    if OpenCLMethod:
        try:
            # For PyOpenCL import
            import pyopencl as cl
            Id=0
            for platform in cl.get_platforms():
                for device in platform.get_devices():
                    #deviceType=cl.device_type.to_string(device.type)
                    deviceType="xPU"
                    print("Device #%i from %s of type %s : %s" % (Id,platform.vendor.lstrip().rstrip(),deviceType,device.name.lstrip().rstrip()))

                    if Id in Devices:
                        # Set the Alu as detected Device Type
                        Alu[Id]=deviceType
                    Id=Id+1
        except ImportError:
            print("Platform does not seem to support OpenCL")

    a_np = np.ones(SIZE).astype(np.float32)
    b_np = np.ones(SIZE).astype(np.float32)

    C_np = np.zeros(SIZE).astype(np.float32)
    D_np = np.zeros(SIZE).astype(np.float32)
    C_np[0] = np.float32(SIZE)
    D_np[0] = np.float32(SIZE)

    # Native & Naive Implementation
    if NaiveMethod:
        print("Performing naive implementation")
        TimeIn=time.time()
        c_np,d_np=MyDFT(a_np,b_np)
        NativeElapsed=time.time()-TimeIn
        NativeRate=int(SIZE/NativeElapsed)
        print("NativeRate: %i" % NativeRate)
        print("Precision: ",np.linalg.norm(c_np-C_np),
              np.linalg.norm(d_np-D_np))

    # Native & Numpy Implementation
    if NumpyMethod:
        print("Performing Numpy implementation")
        TimeIn=time.time()
        e_np,f_np=NumpyDFT(a_np,b_np)
        NumpyElapsed=time.time()-TimeIn
        NumpyRate=int(SIZE/NumpyElapsed)
        print("NumpyRate: %i" % NumpyRate)
        print("Precision: ",np.linalg.norm(e_np-C_np),
              np.linalg.norm(f_np-D_np))

    # Native & Numba Implementation
    if NumbaMethod:
        print("Performing Numba implementation")
        TimeIn=time.time()
        g_np,h_np=NumbaDFT(a_np,b_np)
        NumbaElapsed=time.time()-TimeIn
        NumbaRate=int(SIZE/NumbaElapsed)
        print("NumbaRate: %i" % NumbaRate)
        print("Precision: ",np.linalg.norm(g_np-C_np),
              np.linalg.norm(h_np-D_np))

    # OpenCL Implementation
    if OpenCLMethod:
        print("Performing OpenCL implementation")
        TimeIn=time.time()
        i_np,j_np=OpenCLDFT(a_np,b_np,Device)
        OpenCLElapsed=time.time()-TimeIn
        OpenCLRate=int(SIZE/OpenCLElapsed)
        print("OpenCLRate: %i" % OpenCLRate)
        print("Precision: ",np.linalg.norm(i_np-C_np),
              np.linalg.norm(j_np-D_np))

    # CUDA Implementation
    if CUDAMethod:
        print("Performing CUDA implementation")
        TimeIn=time.time()
        k_np,l_np=CUDADFT(a_np,b_np,Device,Threads)
        CUDAElapsed=time.time()-TimeIn
        CUDARate=int(SIZE/CUDAElapsed)
        print("CUDARate: %i" % CUDARate)
        print("Precision: ",np.linalg.norm(k_np-C_np),
              np.linalg.norm(l_np-D_np))
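
In this version the -t option only changes the CUDA launch geometry computed on the host: when SIZE is a multiple of Threads the kernel is launched with grid=(SIZE//Threads,1) and block=(Threads,1,1), otherwise it falls back to one thread per block. A minimal sketch of that decomposition with illustrative numbers:

    SIZE, Threads = 4096, 32
    if SIZE % Threads == 0:
        grid, block = (SIZE // Threads, 1), (Threads, 1, 1)   # (128, 1) and (32, 1, 1)
    else:
        grid, block = (SIZE, 1), (1, 1, 1)                    # fallback used by CUDADFT
    print(grid, block)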

  
