root / BLAS / xGEMM / xGEMM.c @ 177
Historique | Voir | Annoter | Télécharger (21,4 ko)
1 | 1 | equemene | /*
|
---|---|---|---|
2 | 51 | equemene | Performs matrix multiply on several BLAS implementations
|
3 | 51 | equemene | Copyleft Emmanuel QUEMENER <emmanuel.quemener@gmail.com> under GPLv3
|
4 | 1 | equemene | |
5 | 51 | equemene | 2014-03-14 : Add clBLAS implementation
|
6 | 51 | equemene | |
7 | 1 | equemene | Thanks for help from aurel32@debian.org
|
8 | 1 | equemene | */
|
9 | 1 | equemene | |
10 | 1 | equemene | #include <stdio.h> |
11 | 1 | equemene | #include <math.h> |
12 | 1 | equemene | #include <stdlib.h> |
13 | 1 | equemene | #include <sys/time.h> |
14 | 1 | equemene | #include <string.h> |
15 | 1 | equemene | |
16 | 51 | equemene | #ifdef CLBLAS
|
17 | 51 | equemene | #include <clBLAS.h> |
18 | 53 | equemene | /* Declared at file scope to avoid a separate clBLAS-specific bench function */
|
19 | 53 | equemene | int MyPlatform;
|
20 | 53 | equemene | int MyDevice;
|
21 | 51 | equemene | #elif CUBLAS
|
22 | 1 | equemene | #include <cublas.h> |
23 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_NOERR 0 |
24 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_ALLOC 1 |
25 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_SET 2 |
26 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_GET 3 |
27 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_STUB 4 |
28 | 1 | equemene | #elif THUNKING
|
29 | 1 | equemene | #include <cublas.h> |
30 | 7 | equemene | #include "fortran_common.h" |
31 | 7 | equemene | #include "fortran_thunking.h" |
32 | 1 | equemene | #elif FBLAS
|
33 | 147 | equemene | #include <cblas.h> |
34 | 1 | equemene | #include <cblas_f77.h> |
35 | 1 | equemene | #elif GSL
|
36 | 1 | equemene | #include <gsl_cblas.h> |
37 | 1 | equemene | #elif ACML
|
38 | 1 | equemene | #include <acml.h> |
39 | 1 | equemene | #else
|
40 | 1 | equemene | #include <cblas.h> |
41 | 146 | equemene | #include <blaswrap.h> |
42 | 1 | equemene | #endif
|
43 | 1 | equemene | |
/*
 * LENGTH is the scalar type used for every matrix element and scaling factor.
 * DOUBLE selects double precision; the clBLAS build uses the OpenCL host
 * typedefs instead of the native C types.
 */
#if defined(CLBLAS) && defined(DOUBLE)
#define LENGTH cl_double
#elif defined(CLBLAS)
#define LENGTH cl_float
#elif defined(DOUBLE)
#define LENGTH double
#else
#define LENGTH float
#endif
61 | 51 | equemene | |
/*
 * Prototype of the Fortran GEMM entry point used by the FBLAS build of
 * bench(); only the routine matching the selected precision is declared.
 * Every argument is passed by address (FCHAR / FINT come from cblas_f77.h).
 */
#if defined(FBLAS) && defined(DOUBLE)

void F77_dgemm(FCHAR transa, FCHAR transb, FINT m, FINT n, FINT k,
               const double *alpha, const double *a, FINT lda,
               const double *b, FINT ldb, const double *beta,
               double *c, FINT ldc);

#elif defined(FBLAS)

void F77_sgemm(FCHAR transa, FCHAR transb, FINT m, FINT n, FINT k,
               const float *alpha, const float *a, FINT lda,
               const float *b, FINT ldb, const float *beta,
               float *c, FINT ldc);

#endif
76 | 1 | equemene | |
77 | 1 | equemene | /* Matrix with only defined triangular terms */
|
78 | 1 | equemene | /* Even where the matrix holds zeros, every element must be defined! */
|
79 | 1 | equemene | |
80 | 1 | equemene | /* Get from fortran.c */
|
81 | 1 | equemene | |
#ifdef CUBLAS
/* Message for each CUBLAS_WRAPPER_ERROR_* code, indexed by the code itself */
static const char *errMsg[5] =
  {
    "no error",
    "allocation error",
    "setVector/setMatrix error",
    "getVector/getMatrix error",
    "not implemented"
  };

/*
 * Print a one-line diagnostic for a failed CuBLAS helper call on stdout.
 *
 * funcName : suffix appended to "cublas" in the message (NULL prints nothing)
 * error    : one of the CUBLAS_WRAPPER_ERROR_* codes; any value outside the
 *            table is reported as "unknown error" instead of reading past it
 */
static void wrapperError (const char *funcName, int error)
{
  const int known = (int)(sizeof(errMsg) / sizeof(errMsg[0]));
  const char *text = "unknown error";

  if (error >= 0 && error < known) {
    text = errMsg[error];
  }

  printf ("cublas%s wrapper: %s\n", funcName != NULL ? funcName : "", text);
  /* Flush immediately so the message is not lost if the run aborts */
  fflush (stdout);
}
#endif
98 | 1 | equemene | |
99 | 1 | equemene | int printVector(const int dimVector,const LENGTH *dataVector, |
100 | 1 | equemene | char *nameVector,char *mesgVector) |
101 | 1 | equemene | { |
102 | 1 | equemene | #ifndef QUIET
|
103 | 1 | equemene | |
104 | 1 | equemene | int i;
|
105 | 1 | equemene | printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
106 | 1 | equemene | for (i=0;i<dimVector;i++) |
107 | 1 | equemene | { |
108 | 1 | equemene | printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
109 | 1 | equemene | } |
110 | 1 | equemene | #endif
|
111 | 1 | equemene | |
112 | 1 | equemene | return 0; |
113 | 1 | equemene | } |
114 | 1 | equemene | |
115 | 1 | equemene | int printResults(const int dimVector,const LENGTH *dataVector, |
116 | 1 | equemene | char *nameVector,char *mesgVector) |
117 | 1 | equemene | { |
118 | 1 | equemene | #ifdef RESULTS
|
119 | 1 | equemene | int i;
|
120 | 1 | equemene | |
121 | 1 | equemene | printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
122 | 1 | equemene | for (i=0;i<dimVector;i++) |
123 | 1 | equemene | { |
124 | 1 | equemene | printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
125 | 1 | equemene | } |
126 | 1 | equemene | #endif
|
127 | 1 | equemene | return 0; |
128 | 1 | equemene | } |
129 | 1 | equemene | |
#ifdef CUBLAS
/*
 * Copy a vector back from CuBLAS device memory and dump it on stdout, one
 * element per line. Nothing is done when the build defines QUIET.
 *
 * dimVector  : number of elements to fetch and print
 * dataVector : device pointer handed to cublasGetVector()
 * nameVector : label printed in front of each index
 * mesgVector : free text printed in the header line
 *
 * Returns 0 on success and 1 when the staging buffer cannot be allocated or
 * the device copy fails; in both failure cases no element is printed, so
 * uninitialised host memory is never shown.
 */
int printVectorGPU(const int dimVector,const LENGTH *dataVector,
                   char *nameVector,char *mesgVector)
{
#ifndef QUIET
  const int incx=1;
  cublasStatus stat;
  LENGTH *P=NULL;
  int i;

  /* Nothing to fetch: print the header only */
  if (dimVector<=0) {
    printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
    return 0;
  }

  /* Host staging buffer for the device data */
  P=malloc((size_t)dimVector*sizeof(*P));
  if (P==NULL) {
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_ALLOC);
    return 1;
  }

  stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx);

  if (stat != CUBLAS_STATUS_SUCCESS) {
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
    free(P);
    return 1;
  }

  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
  for (i=0;i<dimVector;i++) {
    printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
  }

  free(P);
#endif

  return 0;
}
#endif
160 | 1 | equemene | |
161 | 1 | equemene | int bench(int dim,int RUNS) |
162 | 1 | equemene | { |
163 | 1 | equemene | /*
|
164 | 1 | equemene | int dim=1000;
|
165 | 1 | equemene | int RUNS=100;
|
166 | 1 | equemene | int incx=1;
|
167 | 1 | equemene | */
|
168 | 1 | equemene | #ifdef PRINT
|
169 | 1 | equemene | LENGTH factor=1.;
|
170 | 1 | equemene | #endif
|
171 | 1 | equemene | |
172 | 1 | equemene | LENGTH alpha=1.,beta=0.; |
173 | 1 | equemene | LENGTH *A,*B,*C,*D; |
174 | 1 | equemene | |
175 | 1 | equemene | /* checkBefore checkAfter checks */
|
176 | 1 | equemene | LENGTH *checksA,*checksB; |
177 | 1 | equemene | |
178 | 1 | equemene | int i=0, j=0; |
179 | 1 | equemene | |
180 | 1 | equemene | double duration;
|
181 | 1 | equemene | |
182 | 1 | equemene | struct timeval tv1,tv2;
|
183 | 1 | equemene | struct timezone tz;
|
184 | 1 | equemene | |
185 | 1 | equemene | /* Create 4 Matrix of dimension dim by dim */
|
186 | 1 | equemene | |
187 | 1 | equemene | A=malloc(dim*dim*sizeof(LENGTH));
|
188 | 1 | equemene | B=malloc(dim*dim*sizeof(LENGTH));
|
189 | 1 | equemene | C=malloc(dim*dim*sizeof(LENGTH));
|
190 | 1 | equemene | D=malloc(dim*dim*sizeof(LENGTH));
|
191 | 1 | equemene | |
192 | 1 | equemene | /* Create 2 vectors for checker Before and After */
|
193 | 1 | equemene | |
194 | 1 | equemene | checksA=malloc(RUNS*sizeof(LENGTH));
|
195 | 1 | equemene | checksB=malloc(RUNS*sizeof(LENGTH));
|
196 | 1 | equemene | |
197 | 1 | equemene | /* Initialize elements with random numbers */
|
198 | 1 | equemene | /* Initialize the seed for rand() */
|
199 | 1 | equemene | /* srand(time()); */
|
200 | 1 | equemene | |
201 | 1 | equemene | for (i=0; i<dim; i++) { |
202 | 1 | equemene | for (j=0; j<dim; j++) { |
203 | 1 | equemene | A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
204 | 1 | equemene | *(LENGTH)(i+1.)/(LENGTH)(j+1.); |
205 | 1 | equemene | B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
206 | 1 | equemene | *(LENGTH)(i+1.)/(LENGTH)(j+1.); |
207 | 1 | equemene | C[i*dim+j]=0.;
|
208 | 1 | equemene | D[i*dim+j]=0.;
|
209 | 1 | equemene | } |
210 | 1 | equemene | } |
211 | 1 | equemene | /*
|
212 | 1 | equemene | A[0]=1;
|
213 | 1 | equemene | A[1]=2;
|
214 | 1 | equemene | A[2]=3;
|
215 | 1 | equemene | A[3]=4;
|
216 | 1 | equemene | |
217 | 1 | equemene | B[0]=5;
|
218 | 1 | equemene | B[1]=6;
|
219 | 1 | equemene | B[2]=7;
|
220 | 1 | equemene | B[3]=8;
|
221 | 1 | equemene | */
|
222 | 1 | equemene | |
223 | 1 | equemene | /* Print the matrix */
|
224 | 1 | equemene | |
225 | 1 | equemene | #ifdef QUIET
|
226 | 1 | equemene | #else
|
227 | 1 | equemene | for (i=0; i<dim; i++) { |
228 | 1 | equemene | for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]); |
229 | 1 | equemene | putchar('\n');
|
230 | 1 | equemene | } |
231 | 1 | equemene | putchar('\n');
|
232 | 1 | equemene | for (i=0; i<dim; i++) { |
233 | 1 | equemene | for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]); |
234 | 1 | equemene | putchar('\n');
|
235 | 1 | equemene | } |
236 | 1 | equemene | putchar('\n');
|
237 | 1 | equemene | #endif
|
238 | 1 | equemene | |
239 | 1 | equemene | /* Get first timer before launching */
|
240 | 1 | equemene | gettimeofday(&tv1, &tz); |
241 | 1 | equemene | |
242 | 51 | equemene | /* Compute with CLBLAS library */
|
243 | 51 | equemene | #ifdef CLBLAS
|
244 | 51 | equemene | |
245 | 53 | equemene | cl_uint platformCount; |
246 | 53 | equemene | cl_platform_id* platforms; |
247 | 53 | equemene | cl_uint deviceCount; |
248 | 53 | equemene | cl_device_id* devices; |
249 | 53 | equemene | |
250 | 51 | equemene | cl_int err,errA,errB,errC,errD; |
251 | 51 | equemene | cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
252 | 51 | equemene | cl_context ctx = 0;
|
253 | 51 | equemene | cl_command_queue queue = 0;
|
254 | 51 | equemene | cl_mem bufA, bufB, bufC, bufD; |
255 | 51 | equemene | cl_event event = NULL;
|
256 | 51 | equemene | |
257 | 53 | equemene | char* value;
|
258 | 53 | equemene | size_t valueSize; |
259 | 53 | equemene | |
260 | 53 | equemene | // tv3 Put on Device: Allocate & Write buffer
|
261 | 53 | equemene | // tv4 Compute
|
262 | 51 | equemene | struct timeval tv3,tv4;
|
263 | 51 | equemene | |
264 | 53 | equemene | printf("Using CLBLAS: %i iterations for %ix%i matrix on (%d,%d)\n",
|
265 | 53 | equemene | RUNS,dim,dim,MyPlatform,MyDevice); |
266 | 51 | equemene | |
267 | 51 | equemene | /* Setup OpenCL environment. */
|
268 | 53 | equemene | /* - get all platforms and select MyPlatform */
|
269 | 53 | equemene | /* - get all devices from MyPlatform and select MyDevice */
|
270 | 51 | equemene | |
271 | 53 | equemene | // Get all platforms
|
272 | 53 | equemene | err = clGetPlatformIDs(0, NULL, &platformCount); |
273 | 53 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
274 | 53 | equemene | err = clGetPlatformIDs(platformCount, platforms, NULL);
|
275 | 53 | equemene | |
276 | 53 | equemene | // Get Device defined
|
277 | 53 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
278 | 53 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
279 | 53 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
280 | 53 | equemene | |
281 | 53 | equemene | // print device name
|
282 | 53 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
283 | 53 | equemene | value = (char*) malloc(valueSize);
|
284 | 53 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, value, NULL);
|
285 | 53 | equemene | printf("Device (%d,%d): %s\n",MyPlatform,MyDevice, value);
|
286 | 53 | equemene | free(value); |
287 | 53 | equemene | |
288 | 53 | equemene | props[1] = (cl_context_properties)platforms[MyPlatform];
|
289 | 53 | equemene | |
290 | 53 | equemene | /* Initialize Context */
|
291 | 53 | equemene | ctx = clCreateContext( props, 1, &devices[MyDevice], NULL, NULL, &err ); |
292 | 53 | equemene | queue = clCreateCommandQueue( ctx, devices[MyDevice], 0, &err );
|
293 | 53 | equemene | |
294 | 51 | equemene | /* Setup clBLAS */
|
295 | 51 | equemene | err = clblasSetup( ); |
296 | 51 | equemene | |
297 | 51 | equemene | /* Prepare OpenCL memory objects and place matrices inside them. */
|
298 | 52 | equemene | bufA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*A),NULL,&errA ); |
299 | 52 | equemene | bufB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*B),NULL,&errB ); |
300 | 52 | equemene | bufC = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*C),NULL,&errC ); |
301 | 52 | equemene | bufD = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*D),NULL,&errD ); |
302 | 51 | equemene | |
303 | 52 | equemene | errA = clEnqueueWriteBuffer( queue,bufA,CL_TRUE,0,
|
304 | 52 | equemene | dim*dim*sizeof(*A),A,0,NULL,NULL ); |
305 | 52 | equemene | errB = clEnqueueWriteBuffer( queue, bufB, CL_TRUE,0,
|
306 | 52 | equemene | dim*dim*sizeof(*B),B,0,NULL,NULL ); |
307 | 52 | equemene | errC = clEnqueueWriteBuffer( queue, bufC, CL_TRUE,0,
|
308 | 54 | equemene | dim*dim*sizeof(*C),C,0,NULL,NULL ); |
309 | 52 | equemene | errD = clEnqueueWriteBuffer( queue, bufD, CL_TRUE,0,
|
310 | 54 | equemene | dim*dim*sizeof(*D),D,0,NULL,NULL ); |
311 | 51 | equemene | |
312 | 51 | equemene | /* Get third timer after memory operation */
|
313 | 51 | equemene | gettimeofday(&tv3, &tz); |
314 | 51 | equemene | |
315 | 51 | equemene | #ifdef DOUBLE
|
316 | 51 | equemene | |
317 | 51 | equemene | for (i=0;i<RUNS;i++) |
318 | 51 | equemene | { |
319 | 52 | equemene | err = clblasDgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
320 | 52 | equemene | dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
321 | 52 | equemene | bufC,0,dim,1,&queue,0,NULL,&event ); |
322 | 51 | equemene | |
323 | 52 | equemene | err = clblasDgemm( clblasRowMajor,clblasTrans,clblasTrans, |
324 | 52 | equemene | dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
325 | 52 | equemene | bufD,0,dim,1,&queue,0,NULL,&event ); |
326 | 51 | equemene | |
327 | 51 | equemene | } |
328 | 51 | equemene | |
329 | 53 | equemene | if (err != CL_SUCCESS) {
|
330 | 53 | equemene | printf("clblasDgemm() failed with %d\n", err);
|
331 | 53 | equemene | } |
332 | 53 | equemene | |
333 | 51 | equemene | #else
|
334 | 51 | equemene | |
335 | 51 | equemene | for (i=0;i<RUNS;i++) |
336 | 51 | equemene | { |
337 | 51 | equemene | |
338 | 52 | equemene | err = clblasSgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
339 | 52 | equemene | dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
340 | 52 | equemene | bufC,0,dim,1,&queue,0,NULL,&event ); |
341 | 51 | equemene | |
342 | 52 | equemene | err = clblasSgemm( clblasRowMajor,clblasTrans,clblasTrans, |
343 | 52 | equemene | dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
344 | 52 | equemene | bufD,0,dim,1,&queue,0,NULL,&event ); |
345 | 51 | equemene | } |
346 | 51 | equemene | |
347 | 53 | equemene | if (err != CL_SUCCESS) {
|
348 | 53 | equemene | printf("clblasSgemm() failed with %d\n", err);
|
349 | 53 | equemene | } |
350 | 53 | equemene | |
351 | 51 | equemene | #endif
|
352 | 51 | equemene | |
353 | 51 | equemene | /* Wait for calculations to be finished. */
|
354 | 51 | equemene | err = clWaitForEvents( 1, &event );
|
355 | 51 | equemene | |
356 | 53 | equemene | /* Get fourth timer after memory free */
|
357 | 53 | equemene | gettimeofday(&tv4, &tz); |
358 | 53 | equemene | |
359 | 51 | equemene | /* Fetch results of calculations from GPU memory. */
|
360 | 53 | equemene | errC = clEnqueueReadBuffer( queue,bufC,CL_TRUE,0,dim*dim * sizeof(*C), |
361 | 52 | equemene | C,0,NULL,NULL ); |
362 | 51 | equemene | |
363 | 51 | equemene | /* Fetch results of calculations from GPU memory. */
|
364 | 53 | equemene | errD = clEnqueueReadBuffer( queue,bufD,CL_TRUE,0,dim*dim*sizeof(*D), |
365 | 52 | equemene | D,0,NULL,NULL ); |
366 | 51 | equemene | |
367 | 51 | equemene | /* Release OpenCL memory objects. */
|
368 | 51 | equemene | clReleaseMemObject( bufD ); |
369 | 51 | equemene | clReleaseMemObject( bufC ); |
370 | 51 | equemene | clReleaseMemObject( bufB ); |
371 | 51 | equemene | clReleaseMemObject( bufA ); |
372 | 51 | equemene | |
373 | 51 | equemene | /* Finalize work with clBLAS */
|
374 | 51 | equemene | clblasTeardown( ); |
375 | 51 | equemene | |
376 | 51 | equemene | /* Release OpenCL working objects. */
|
377 | 51 | equemene | clReleaseCommandQueue( queue ); |
378 | 51 | equemene | clReleaseContext( ctx ); |
379 | 51 | equemene | |
380 | 51 | equemene | |
381 | 1 | equemene | /* Compute with CuBLAS library */
|
382 | 51 | equemene | #elif CUBLAS
|
383 | 1 | equemene | LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0; |
384 | 1 | equemene | cublasStatus stat1, stat2, stat3, stat4; |
385 | 1 | equemene | struct timeval tv3,tv4;
|
386 | 1 | equemene | |
387 | 1 | equemene | /* Order is Row */
|
388 | 1 | equemene | /* Have to swap uplo and trans */
|
389 | 1 | equemene | char transa='N',transb='T'; |
390 | 1 | equemene | |
391 | 1 | equemene | printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
|
392 | 1 | equemene | RUNS,dim,dim); |
393 | 1 | equemene | |
394 | 1 | equemene | stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA); |
395 | 1 | equemene | stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB); |
396 | 1 | equemene | stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC); |
397 | 1 | equemene | stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD); |
398 | 1 | equemene | |
399 | 1 | equemene | if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
400 | 1 | equemene | (stat2 != CUBLAS_STATUS_SUCCESS) || |
401 | 1 | equemene | (stat3 != CUBLAS_STATUS_SUCCESS) || |
402 | 1 | equemene | (stat4 != CUBLAS_STATUS_SUCCESS) ) { |
403 | 5 | equemene | wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
|
404 | 1 | equemene | cublasFree (devPtrA); |
405 | 1 | equemene | cublasFree (devPtrB); |
406 | 1 | equemene | cublasFree (devPtrC); |
407 | 1 | equemene | cublasFree (devPtrD); |
408 | 1 | equemene | return 1; |
409 | 1 | equemene | } |
410 | 1 | equemene | |
411 | 1 | equemene | stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim); |
412 | 1 | equemene | stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim); |
413 | 1 | equemene | stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim); |
414 | 1 | equemene | stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim); |
415 | 1 | equemene | |
416 | 1 | equemene | if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
417 | 1 | equemene | (stat2 != CUBLAS_STATUS_SUCCESS) || |
418 | 1 | equemene | (stat3 != CUBLAS_STATUS_SUCCESS) || |
419 | 1 | equemene | (stat4 != CUBLAS_STATUS_SUCCESS) ) { |
420 | 5 | equemene | wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
|
421 | 1 | equemene | cublasFree (devPtrA); |
422 | 1 | equemene | cublasFree (devPtrB); |
423 | 1 | equemene | cublasFree (devPtrC); |
424 | 1 | equemene | cublasFree (devPtrD); |
425 | 1 | equemene | return 1; |
426 | 1 | equemene | } |
427 | 1 | equemene | |
428 | 1 | equemene | /* Get third timer after memory operation */
|
429 | 1 | equemene | gettimeofday(&tv3, &tz); |
430 | 1 | equemene | |
431 | 1 | equemene | #ifdef DOUBLE
|
432 | 1 | equemene | |
433 | 1 | equemene | for (i=0;i<RUNS;i++) |
434 | 1 | equemene | { |
435 | 1 | equemene | cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
436 | 1 | equemene | devPtrA,dim,beta,devPtrC,dim); |
437 | 1 | equemene | cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
438 | 1 | equemene | devPtrB,dim,beta,devPtrD,dim); |
439 | 1 | equemene | } |
440 | 1 | equemene | |
441 | 1 | equemene | #else
|
442 | 1 | equemene | |
443 | 1 | equemene | for (i=0;i<RUNS;i++) |
444 | 1 | equemene | { |
445 | 1 | equemene | cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
446 | 1 | equemene | devPtrA,dim,beta,devPtrC,dim); |
447 | 1 | equemene | cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
448 | 1 | equemene | devPtrB,dim,beta,devPtrD,dim); |
449 | 1 | equemene | } |
450 | 1 | equemene | |
451 | 1 | equemene | #endif
|
452 | 1 | equemene | |
453 | 1 | equemene | stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim); |
454 | 1 | equemene | stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim); |
455 | 1 | equemene | |
456 | 53 | equemene | /* Get fourth timer before memory free */
|
457 | 53 | equemene | gettimeofday(&tv4, &tz); |
458 | 53 | equemene | |
459 | 1 | equemene | cublasFree (devPtrA); |
460 | 1 | equemene | cublasFree (devPtrB); |
461 | 1 | equemene | cublasFree (devPtrC); |
462 | 1 | equemene | cublasFree (devPtrD); |
463 | 1 | equemene | |
464 | 1 | equemene | if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
|
465 | 5 | equemene | wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
|
466 | 1 | equemene | } |
467 | 1 | equemene | |
468 | 1 | equemene | |
469 | 1 | equemene | #elif THUNKING
|
470 | 1 | equemene | |
471 | 1 | equemene | /* Order is Row : Have to swap uplo='U' and trans='N' */
|
472 | 1 | equemene | char transa='N',transb='T'; |
473 | 1 | equemene | printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
|
474 | 1 | equemene | RUNS,dim,dim); |
475 | 1 | equemene | |
476 | 1 | equemene | #ifdef DOUBLE
|
477 | 1 | equemene | |
478 | 1 | equemene | for (i=0;i<RUNS;i++) |
479 | 1 | equemene | { |
480 | 1 | equemene | CUBLAS_DGEMM(&transa,&transa, |
481 | 7 | equemene | &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
482 | 1 | equemene | CUBLAS_DGEMM(&transb,&transb, |
483 | 7 | equemene | &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
484 | 1 | equemene | } |
485 | 1 | equemene | |
486 | 1 | equemene | #else
|
487 | 1 | equemene | |
488 | 1 | equemene | for (i=0;i<RUNS;i++) |
489 | 1 | equemene | { |
490 | 1 | equemene | CUBLAS_SGEMM(&transa,&transa, |
491 | 7 | equemene | &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
492 | 1 | equemene | CUBLAS_SGEMM(&transb,&transb, |
493 | 7 | equemene | &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
494 | 1 | equemene | } |
495 | 1 | equemene | |
496 | 1 | equemene | #endif
|
497 | 1 | equemene | |
498 | 1 | equemene | #elif FBLAS
|
499 | 1 | equemene | |
500 | 1 | equemene | /* Order is Row : Have to swap uplo='U' and trans='N' */
|
501 | 1 | equemene | char transa='N',transb='T'; |
502 | 1 | equemene | |
503 | 1 | equemene | printf("Using FBLAS: %i iterations for %ix%i matrix\n",
|
504 | 1 | equemene | RUNS,dim,dim); |
505 | 1 | equemene | |
506 | 1 | equemene | #ifdef DOUBLE
|
507 | 1 | equemene | |
508 | 1 | equemene | for (i=0;i<RUNS;i++) |
509 | 1 | equemene | { |
510 | 147 | equemene | F77_dgemm(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
511 | 147 | equemene | F77_dgemm(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
512 | 1 | equemene | } |
513 | 1 | equemene | |
514 | 1 | equemene | #else
|
515 | 1 | equemene | |
516 | 1 | equemene | for (i=0;i<RUNS;i++) |
517 | 1 | equemene | { |
518 | 147 | equemene | F77_sgemm(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
519 | 147 | equemene | F77_sgemm(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
520 | 1 | equemene | } |
521 | 1 | equemene | |
522 | 1 | equemene | #endif
|
523 | 1 | equemene | |
524 | 1 | equemene | #elif ACML
|
525 | 1 | equemene | |
526 | 1 | equemene | /* Order is Row : Have to swap uplo='U' and trans='N' */
|
527 | 1 | equemene | char transa='N',transb='T'; |
528 | 1 | equemene | |
529 | 1 | equemene | printf("Using ACML: %i iterations for %ix%i matrix\n",
|
530 | 1 | equemene | RUNS,dim,dim); |
531 | 1 | equemene | |
532 | 1 | equemene | #ifdef DOUBLE
|
533 | 1 | equemene | |
534 | 1 | equemene | for (i=0;i<RUNS;i++) |
535 | 1 | equemene | { |
536 | 1 | equemene | dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
537 | 1 | equemene | dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
538 | 1 | equemene | } |
539 | 1 | equemene | |
540 | 1 | equemene | #else
|
541 | 1 | equemene | |
542 | 1 | equemene | for (i=0;i<RUNS;i++) |
543 | 1 | equemene | { |
544 | 1 | equemene | sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
545 | 1 | equemene | sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
546 | 1 | equemene | } |
547 | 1 | equemene | |
548 | 1 | equemene | #endif
|
549 | 1 | equemene | |
550 | 1 | equemene | #elif GSL
|
551 | 1 | equemene | |
552 | 1 | equemene | printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
553 | 1 | equemene | |
554 | 1 | equemene | /*
|
555 | 1 | equemene | RowMajor : Matrix is read row by row
|
556 | 1 | equemene | Upper : the no null elements are on top
|
557 | 1 | equemene | NoTrans : no transposition before estimation
|
558 | 1 | equemene | NonUnit : Matrix is not unit
|
559 | 1 | equemene | */
|
560 | 1 | equemene | |
561 | 1 | equemene | #ifdef DOUBLE
|
562 | 1 | equemene | |
563 | 1 | equemene | for (i=0;i<RUNS;i++) |
564 | 1 | equemene | { |
565 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
566 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
567 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
568 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
569 | 1 | equemene | } |
570 | 1 | equemene | |
571 | 1 | equemene | #else
|
572 | 1 | equemene | |
573 | 1 | equemene | for (i=0;i<RUNS;i++) |
574 | 1 | equemene | { |
575 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
576 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
577 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
578 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
579 | 1 | equemene | } |
580 | 1 | equemene | |
581 | 1 | equemene | #endif
|
582 | 1 | equemene | |
583 | 1 | equemene | #else
|
584 | 1 | equemene | |
585 | 1 | equemene | printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
586 | 1 | equemene | |
587 | 1 | equemene | /*
|
588 | 1 | equemene | RowMajor : Matrix is read row bu row
|
589 | 1 | equemene | Upper : the no null elements are on top
|
590 | 1 | equemene | NoTrans : no transposition before estimation
|
591 | 1 | equemene | NonUnit : Matrix is not unit
|
592 | 1 | equemene | */
|
593 | 1 | equemene | |
594 | 1 | equemene | #ifdef DOUBLE
|
595 | 1 | equemene | |
596 | 1 | equemene | for (i=0;i<RUNS;i++) |
597 | 1 | equemene | { |
598 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
599 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
600 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
601 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
602 | 1 | equemene | } |
603 | 1 | equemene | |
604 | 1 | equemene | #else
|
605 | 1 | equemene | |
606 | 1 | equemene | for (i=0;i<RUNS;i++) |
607 | 1 | equemene | { |
608 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
609 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
610 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
611 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
612 | 1 | equemene | } |
613 | 1 | equemene | |
614 | 1 | equemene | #endif
|
615 | 1 | equemene | |
616 | 1 | equemene | #endif
|
617 | 1 | equemene | |
618 | 1 | equemene | /* Get second timer after launching */
|
619 | 1 | equemene | gettimeofday(&tv2, &tz); |
620 | 1 | equemene | |
621 | 1 | equemene | /* Store the checker of errors */
|
622 | 1 | equemene | checksA[0]=0.; |
623 | 1 | equemene | |
624 | 1 | equemene | for (i=0; i<dim; i++) { |
625 | 1 | equemene | for (j=0; j<dim; j++) { |
626 | 1 | equemene | checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]); |
627 | 1 | equemene | } |
628 | 1 | equemene | } |
629 | 1 | equemene | |
630 | 1 | equemene | /* Print the matrix */
|
631 | 1 | equemene | |
632 | 1 | equemene | #ifdef QUIET
|
633 | 1 | equemene | #else
|
634 | 1 | equemene | for (i=0; i<dim; i++) { |
635 | 1 | equemene | for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]); |
636 | 1 | equemene | putchar('\n');
|
637 | 1 | equemene | } |
638 | 1 | equemene | putchar('\n');
|
639 | 1 | equemene | for (i=0; i<dim; i++) { |
640 | 1 | equemene | for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]); |
641 | 1 | equemene | putchar('\n');
|
642 | 1 | equemene | } |
643 | 1 | equemene | putchar('\n');
|
644 | 1 | equemene | #endif
|
645 | 1 | equemene | |
646 | 1 | equemene | /* Free 1 Matrix and 2 Vectors of dimension dim */
|
647 | 1 | equemene | |
648 | 1 | equemene | free(A); |
649 | 1 | equemene | free(B); |
650 | 1 | equemene | free(C); |
651 | 1 | equemene | free(D); |
652 | 1 | equemene | |
653 | 1 | equemene | putchar('\n');
|
654 | 1 | equemene | |
655 | 53 | equemene | #ifdef CLBLAS
|
656 | 1 | equemene | double memoryIn,memoryOut;
|
657 | 1 | equemene | |
658 | 1 | equemene | memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
659 | 1 | equemene | (tv3.tv_usec-tv1.tv_usec))/1000000.; |
660 | 1 | equemene | |
661 | 1 | equemene | memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
662 | 1 | equemene | (tv2.tv_usec-tv4.tv_usec))/1000000.; |
663 | 1 | equemene | |
664 | 1 | equemene | duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
665 | 1 | equemene | (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
666 | 1 | equemene | |
667 | 1 | equemene | printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
668 | 1 | equemene | printf("Duration of memory free : %2.10f s\n",memoryOut);
|
669 | 53 | equemene | #elif CUBLAS
|
670 | 53 | equemene | double memoryIn,memoryOut;
|
671 | 53 | equemene | |
672 | 53 | equemene | memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
673 | 53 | equemene | (tv3.tv_usec-tv1.tv_usec))/1000000.; |
674 | 53 | equemene | |
675 | 53 | equemene | memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
676 | 53 | equemene | (tv2.tv_usec-tv4.tv_usec))/1000000.; |
677 | 53 | equemene | |
678 | 53 | equemene | duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
679 | 53 | equemene | (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
680 | 53 | equemene | |
681 | 53 | equemene | printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
682 | 53 | equemene | printf("Duration of memory free : %2.10f s\n",memoryOut);
|
683 | 1 | equemene | #else
|
684 | 1 | equemene | duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + \ |
685 | 1 | equemene | (tv2.tv_usec-tv1.tv_usec))/1000000./RUNS; |
686 | 1 | equemene | |
687 | 1 | equemene | #endif
|
688 | 1 | equemene | |
689 | 1 | equemene | printf("Duration of each cycle : %2.10f s\n",duration);
|
690 | 1 | equemene | |
691 | 1 | equemene | printf("Number of GFlops : %2.3f \n",
|
692 | 1 | equemene | dim*dim*2.*(2.*dim-1)/duration/1000000000.); |
693 | 1 | equemene | |
694 | 1 | equemene | printf("Error %1.10f\n",checksA[0]); |
695 | 1 | equemene | printResults(RUNS,checksA,"C","Errors cumulated"); |
696 | 1 | equemene | |
697 | 1 | equemene | putchar('\n');
|
698 | 1 | equemene | |
699 | 1 | equemene | /* Free 2 vectors for checker Before and After */
|
700 | 1 | equemene | |
701 | 1 | equemene | free(checksA); |
702 | 1 | equemene | free(checksB); |
703 | 1 | equemene | |
704 | 1 | equemene | return 0; |
705 | 1 | equemene | } |
706 | 1 | equemene | |
707 | 53 | equemene | #ifdef CLBLAS
|
708 | 53 | equemene | |
709 | 53 | equemene | int DelectOpenCLDevices()
|
710 | 53 | equemene | { |
711 | 53 | equemene | /* */
|
712 | 53 | equemene | /* Not needed to import CL.h, already done in CLBLAS.h */
|
713 | 53 | equemene | |
714 | 53 | equemene | int i, j;
|
715 | 53 | equemene | char* value;
|
716 | 53 | equemene | size_t valueSize; |
717 | 53 | equemene | cl_uint platformCount; |
718 | 53 | equemene | cl_platform_id* platforms; |
719 | 53 | equemene | cl_uint deviceCount; |
720 | 53 | equemene | cl_device_id* devices; |
721 | 53 | equemene | cl_uint maxComputeUnits; |
722 | 53 | equemene | cl_int maxWorkGroupSize; |
723 | 53 | equemene | cl_int maxWorkItemSizes; |
724 | 53 | equemene | cl_device_type dev_type; |
725 | 53 | equemene | |
726 | 53 | equemene | // get all platforms
|
727 | 53 | equemene | clGetPlatformIDs(0, NULL, &platformCount); |
728 | 53 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
729 | 53 | equemene | clGetPlatformIDs(platformCount, platforms, NULL);
|
730 | 53 | equemene | |
731 | 53 | equemene | |
732 | 53 | equemene | printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
733 | 53 | equemene | |
734 | 53 | equemene | for (i = 0; i < platformCount; i++) { |
735 | 53 | equemene | |
736 | 53 | equemene | // get all devices
|
737 | 53 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
738 | 53 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
739 | 53 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
740 | 53 | equemene | |
741 | 53 | equemene | // for each device print critical attributes
|
742 | 53 | equemene | for (j = 0; j < deviceCount; j++) { |
743 | 53 | equemene | |
744 | 53 | equemene | // print device name
|
745 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
746 | 53 | equemene | value = (char*) malloc(valueSize);
|
747 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
748 | 53 | equemene | printf("Device (%d,%d): %s\n",i, j, value);
|
749 | 53 | equemene | free(value); |
750 | 53 | equemene | |
751 | 53 | equemene | // print type device CPU/GPU/ACCELERATOR
|
752 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
753 | 53 | equemene | printf("\tDevice Type: ");
|
754 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_GPU)
|
755 | 53 | equemene | printf("CL_DEVICE_TYPE_GPU ");
|
756 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_CPU)
|
757 | 53 | equemene | printf("CL_DEVICE_TYPE_CPU ");
|
758 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
759 | 53 | equemene | printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
760 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
761 | 53 | equemene | printf("CL_DEVICE_TYPE_DEFAULT ");
|
762 | 53 | equemene | printf("\n");
|
763 | 53 | equemene | |
764 | 53 | equemene | // print device vendor
|
765 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
766 | 53 | equemene | value = (char*) malloc(valueSize);
|
767 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
768 | 53 | equemene | printf("\tDevice vendor: %s\n", value);
|
769 | 53 | equemene | free(value); |
770 | 53 | equemene | |
771 | 53 | equemene | // print hardware device version
|
772 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
773 | 53 | equemene | value = (char*) malloc(valueSize);
|
774 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
775 | 53 | equemene | printf("\tHardware version: %s\n", value);
|
776 | 53 | equemene | free(value); |
777 | 53 | equemene | |
778 | 53 | equemene | // print software driver version
|
779 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
780 | 53 | equemene | value = (char*) malloc(valueSize);
|
781 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
782 | 53 | equemene | printf("\tSoftware version: %s\n", value);
|
783 | 53 | equemene | free(value); |
784 | 53 | equemene | |
785 | 53 | equemene | // print c version supported by compiler for device
|
786 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
787 | 53 | equemene | value = (char*) malloc(valueSize);
|
788 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
789 | 53 | equemene | printf("\tOpenCL C version: %s\n", value);
|
790 | 53 | equemene | free(value); |
791 | 53 | equemene | |
792 | 53 | equemene | // print parallel compute units
|
793 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
794 | 53 | equemene | sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
795 | 53 | equemene | printf("\tParallel compute units: %d\n", maxComputeUnits);
|
796 | 53 | equemene | |
797 | 53 | equemene | // print max work group size
|
798 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
799 | 53 | equemene | sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
800 | 53 | equemene | printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
801 | 53 | equemene | |
802 | 53 | equemene | // print max work items size
|
803 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
804 | 53 | equemene | sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
805 | 53 | equemene | printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
806 | 53 | equemene | |
807 | 53 | equemene | } |
808 | 53 | equemene | printf("\n");
|
809 | 53 | equemene | free(devices); |
810 | 53 | equemene | } |
811 | 53 | equemene | |
812 | 53 | equemene | free(platforms); |
813 | 53 | equemene | return 0; |
814 | 53 | equemene | |
815 | 53 | equemene | } |
816 | 53 | equemene | #endif
|
817 | 53 | equemene | |
/*
  Program entry point: read the command line and launch the benchmark.

  Arguments:
    #1 size of the square matrices (at least 2)
    #2 number of iterations (at least 1)
    #3 OpenCL platform ID (CLBLAS build only)
    #4 OpenCL device ID   (CLBLAS build only)

  Without argument, or with -h / --help as first argument, the usage is
  printed (followed by the OpenCL device listing in a CLBLAS build) and
  0 is returned.

  Returns 0 after the help or after a benchmark run, EXIT_FAILURE when
  arguments are missing or out of range.
*/
int main(int argc,char **argv)
{
#ifdef CLBLAS
  /* program name + size + iterations + platform ID + device ID */
  const int expected=5;
#else
  /* program name + size + iterations */
  const int expected=3;
#endif
  int dim,runs;

  if ((argc==1)||
      (strcmp(argv[1],"-h")==0)||
      (strcmp(argv[1],"--help")==0))
    {
#ifdef CLBLAS
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
             "\t#1 Size of square matrices \n"
             "\t#2 Number of iterations \n"
             "\t#3 OpenCL Plateform ID\n"
             "\t#4 OpenCL Device ID\n\n");
      DelectOpenCLDevices();
#else
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
             "\t#1 Size of square matrices \n"
             "\t#2 Number of iterations\n\n");
#endif
      return 0;
    }

  /* Every argv[] read below must exist: argv[argc] is a null pointer and
     atoi() must never receive it */
  if (argc<expected)
    {
      fprintf(stderr,"%s: %d argument(s) expected, %d given (try --help)\n",
              argv[0],expected-1,argc-1);
      return EXIT_FAILURE;
    }

  dim=atoi(argv[1]);
  runs=atoi(argv[2]);

  if ((dim<2)||(runs<1))
    {
      fprintf(stderr,"%s: size must be >= 2 and iterations >= 1 (try --help)\n",
              argv[0]);
      return EXIT_FAILURE;
    }

#ifdef CLBLAS
  /* Globals read by the clBLAS variant of the benchmark */
  MyPlatform=atoi(argv[3]);
  MyDevice=atoi(argv[4]);
#endif

  bench(dim,runs);

  return 0;
}