root / BLAS / xGEMM / xGEMM.c @ 245
Historique | Voir | Annoter | Télécharger (21,4 ko)
1 |
/*
|
---|---|
2 |
Performs matrix multiply on several BLAS implementations
|
3 |
Copyleft Emmanuel QUEMENER <emmanuel.quemener@gmail.com> under GPLv3
|
4 |
|
5 |
2014-03-14 : Add clBLAS implementation
|
6 |
|
7 |
Thanks for help from aurel32@debian.org
|
8 |
*/
|
9 |
|
10 |
#include <stdio.h> |
11 |
#include <math.h> |
12 |
#include <stdlib.h> |
13 |
#include <sys/time.h> |
14 |
#include <string.h> |
15 |
|
16 |
#ifdef CLBLAS
|
17 |
#include <clBLAS.h> |
18 |
/* Precise here to avoid new specific bench function */
|
19 |
int MyPlatform;
|
20 |
int MyDevice;
|
21 |
#elif CUBLAS
|
22 |
#include <cublas.h> |
23 |
#define CUBLAS_WRAPPER_ERROR_NOERR 0 |
24 |
#define CUBLAS_WRAPPER_ERROR_ALLOC 1 |
25 |
#define CUBLAS_WRAPPER_ERROR_SET 2 |
26 |
#define CUBLAS_WRAPPER_ERROR_GET 3 |
27 |
#define CUBLAS_WRAPPER_ERROR_STUB 4 |
28 |
#elif THUNKING
|
29 |
#include <cublas.h> |
30 |
#include "fortran_common.h" |
31 |
#include "fortran_thunking.h" |
32 |
#elif FBLAS
|
33 |
#include <cblas.h> |
34 |
#include <cblas_f77.h> |
35 |
#elif GSL
|
36 |
#include <gsl_cblas.h> |
37 |
#elif ACML
|
38 |
#include <acml.h> |
39 |
#else
|
40 |
#include <cblas.h> |
41 |
// #include <blaswrap.h>
|
42 |
#endif
|
43 |
|
44 |
#ifdef CLBLAS
|
45 |
|
46 |
#ifdef DOUBLE
|
47 |
#define LENGTH cl_double
|
48 |
#else
|
49 |
#define LENGTH cl_float
|
50 |
#endif
|
51 |
|
52 |
#else
|
53 |
|
54 |
#ifdef DOUBLE
|
55 |
#define LENGTH double |
56 |
#else
|
57 |
#define LENGTH float |
58 |
#endif
|
59 |
|
60 |
#endif
|
61 |
|
62 |
#ifdef FBLAS
|
63 |
|
64 |
#ifdef DOUBLE
|
65 |
|
66 |
void F77_dgemm(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, |
67 |
const double *, FINT, const double *, double *, FINT); |
68 |
|
69 |
#else
|
70 |
|
71 |
void F77_sgemm(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, |
72 |
const float *, FINT, const float *, float *, FINT); |
73 |
|
74 |
#endif
|
75 |
#endif
|
76 |
|
77 |
/* Matrix with only defined triangular terms */
|
78 |
/* Even if there are 0 in matrix, must be defined at all ! */
|
79 |
|
80 |
/* Get from fortran.c */
|
81 |
|
82 |
#ifdef CUBLAS
|
83 |
static char *errMsg[5] = |
84 |
{ |
85 |
"no error",
|
86 |
"allocation error",
|
87 |
"setVector/setMatrix error",
|
88 |
"getVector/getMatrix error",
|
89 |
"not implemented"
|
90 |
}; |
91 |
|
92 |
static void wrapperError (const char *funcName, int error) |
93 |
{ |
94 |
printf ("cublas%s wrapper: %s\n", funcName, errMsg[error]);
|
95 |
fflush (stdout); |
96 |
} |
97 |
#endif
|
98 |
|
99 |
int printVector(const int dimVector,const LENGTH *dataVector, |
100 |
char *nameVector,char *mesgVector) |
101 |
{ |
102 |
#ifndef QUIET
|
103 |
|
104 |
int i;
|
105 |
printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
106 |
for (i=0;i<dimVector;i++) |
107 |
{ |
108 |
printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
109 |
} |
110 |
#endif
|
111 |
|
112 |
return 0; |
113 |
} |
114 |
|
115 |
int printResults(const int dimVector,const LENGTH *dataVector, |
116 |
char *nameVector,char *mesgVector) |
117 |
{ |
118 |
#ifdef RESULTS
|
119 |
int i;
|
120 |
|
121 |
printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
122 |
for (i=0;i<dimVector;i++) |
123 |
{ |
124 |
printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
125 |
} |
126 |
#endif
|
127 |
return 0; |
128 |
} |
129 |
|
130 |
#ifdef CUBLAS
|
131 |
int printVectorGPU(const int dimVector,const LENGTH *dataVector, |
132 |
char *nameVector,char *mesgVector) |
133 |
{ |
134 |
#ifndef QUIET
|
135 |
int i;
|
136 |
cublasStatus stat; |
137 |
LENGTH *P=0;
|
138 |
int incx=1; |
139 |
|
140 |
P=malloc(dimVector*sizeof(LENGTH));
|
141 |
|
142 |
stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx); |
143 |
|
144 |
if (stat != CUBLAS_STATUS_SUCCESS) {
|
145 |
wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
|
146 |
} |
147 |
|
148 |
printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
149 |
for (i=0;i<dimVector;i++) |
150 |
{ |
151 |
printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
|
152 |
} |
153 |
|
154 |
free(P); |
155 |
#endif
|
156 |
|
157 |
return 0; |
158 |
} |
159 |
#endif
|
160 |
|
161 |
int bench(int dim,int RUNS) |
162 |
{ |
163 |
/*
|
164 |
int dim=1000;
|
165 |
int RUNS=100;
|
166 |
int incx=1;
|
167 |
*/
|
168 |
#ifdef PRINT
|
169 |
LENGTH factor=1.;
|
170 |
#endif
|
171 |
|
172 |
LENGTH alpha=1.,beta=0.; |
173 |
LENGTH *A,*B,*C,*D; |
174 |
|
175 |
/* checkBefore checkAfter checks */
|
176 |
LENGTH *checksA,*checksB; |
177 |
|
178 |
int i=0, j=0; |
179 |
|
180 |
double duration;
|
181 |
|
182 |
struct timeval tv1,tv2;
|
183 |
struct timezone tz;
|
184 |
|
185 |
/* Create 4 Matrix of dimension dim by dim */
|
186 |
|
187 |
A=malloc(dim*dim*sizeof(LENGTH));
|
188 |
B=malloc(dim*dim*sizeof(LENGTH));
|
189 |
C=malloc(dim*dim*sizeof(LENGTH));
|
190 |
D=malloc(dim*dim*sizeof(LENGTH));
|
191 |
|
192 |
/* Create 2 vectors for checker Before and After */
|
193 |
|
194 |
checksA=malloc(RUNS*sizeof(LENGTH));
|
195 |
checksB=malloc(RUNS*sizeof(LENGTH));
|
196 |
|
197 |
/* Initialize elements with random numbers */
|
198 |
/* Initialize the seed for rand() */
|
199 |
/* srand(time()); */
|
200 |
|
201 |
for (i=0; i<dim; i++) { |
202 |
for (j=0; j<dim; j++) { |
203 |
A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
204 |
*(LENGTH)(i+1.)/(LENGTH)(j+1.); |
205 |
B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
206 |
*(LENGTH)(i+1.)/(LENGTH)(j+1.); |
207 |
C[i*dim+j]=0.;
|
208 |
D[i*dim+j]=0.;
|
209 |
} |
210 |
} |
211 |
/*
|
212 |
A[0]=1;
|
213 |
A[1]=2;
|
214 |
A[2]=3;
|
215 |
A[3]=4;
|
216 |
|
217 |
B[0]=5;
|
218 |
B[1]=6;
|
219 |
B[2]=7;
|
220 |
B[3]=8;
|
221 |
*/
|
222 |
|
223 |
/* Print the matrix */
|
224 |
|
225 |
#ifdef QUIET
|
226 |
#else
|
227 |
for (i=0; i<dim; i++) { |
228 |
for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]); |
229 |
putchar('\n');
|
230 |
} |
231 |
putchar('\n');
|
232 |
for (i=0; i<dim; i++) { |
233 |
for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]); |
234 |
putchar('\n');
|
235 |
} |
236 |
putchar('\n');
|
237 |
#endif
|
238 |
|
239 |
/* Get first timer before launching */
|
240 |
gettimeofday(&tv1, &tz); |
241 |
|
242 |
/* Compute with CLBLAS library */
|
243 |
#ifdef CLBLAS
|
244 |
|
245 |
cl_uint platformCount; |
246 |
cl_platform_id* platforms; |
247 |
cl_uint deviceCount; |
248 |
cl_device_id* devices; |
249 |
|
250 |
cl_int err,errA,errB,errC,errD; |
251 |
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
252 |
cl_context ctx = 0;
|
253 |
cl_command_queue queue = 0;
|
254 |
cl_mem bufA, bufB, bufC, bufD; |
255 |
cl_event event = NULL;
|
256 |
|
257 |
char* value;
|
258 |
size_t valueSize; |
259 |
|
260 |
// tv3 Put on Device: Allocate & Write buffer
|
261 |
// tv4 Compute
|
262 |
struct timeval tv3,tv4;
|
263 |
|
264 |
printf("Using CLBLAS: %i iterations for %ix%i matrix on (%d,%d)\n",
|
265 |
RUNS,dim,dim,MyPlatform,MyDevice); |
266 |
|
267 |
/* Setup OpenCL environment. */
|
268 |
/* - get all platforms and select MyPlatform */
|
269 |
/* - get all devices from MyPlatform and select MyDevice */
|
270 |
|
271 |
// Get all platforms
|
272 |
err = clGetPlatformIDs(0, NULL, &platformCount); |
273 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
274 |
err = clGetPlatformIDs(platformCount, platforms, NULL);
|
275 |
|
276 |
// Get Device defined
|
277 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
278 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
279 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
280 |
|
281 |
// print device name
|
282 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
283 |
value = (char*) malloc(valueSize);
|
284 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, value, NULL);
|
285 |
printf("Device (%d,%d): %s\n",MyPlatform,MyDevice, value);
|
286 |
free(value); |
287 |
|
288 |
props[1] = (cl_context_properties)platforms[MyPlatform];
|
289 |
|
290 |
/* Initialize Context */
|
291 |
ctx = clCreateContext( props, 1, &devices[MyDevice], NULL, NULL, &err ); |
292 |
queue = clCreateCommandQueue( ctx, devices[MyDevice], 0, &err );
|
293 |
|
294 |
/* Setup clBLAS */
|
295 |
err = clblasSetup( ); |
296 |
|
297 |
/* Prepare OpenCL memory objects and place matrices inside them. */
|
298 |
bufA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*A),NULL,&errA ); |
299 |
bufB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*B),NULL,&errB ); |
300 |
bufC = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*C),NULL,&errC ); |
301 |
bufD = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*D),NULL,&errD ); |
302 |
|
303 |
errA = clEnqueueWriteBuffer( queue,bufA,CL_TRUE,0,
|
304 |
dim*dim*sizeof(*A),A,0,NULL,NULL ); |
305 |
errB = clEnqueueWriteBuffer( queue, bufB, CL_TRUE,0,
|
306 |
dim*dim*sizeof(*B),B,0,NULL,NULL ); |
307 |
errC = clEnqueueWriteBuffer( queue, bufC, CL_TRUE,0,
|
308 |
dim*dim*sizeof(*C),C,0,NULL,NULL ); |
309 |
errD = clEnqueueWriteBuffer( queue, bufD, CL_TRUE,0,
|
310 |
dim*dim*sizeof(*D),D,0,NULL,NULL ); |
311 |
|
312 |
/* Get third timer after memory operation */
|
313 |
gettimeofday(&tv3, &tz); |
314 |
|
315 |
#ifdef DOUBLE
|
316 |
|
317 |
for (i=0;i<RUNS;i++) |
318 |
{ |
319 |
err = clblasDgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
320 |
dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
321 |
bufC,0,dim,1,&queue,0,NULL,&event ); |
322 |
|
323 |
err = clblasDgemm( clblasRowMajor,clblasTrans,clblasTrans, |
324 |
dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
325 |
bufD,0,dim,1,&queue,0,NULL,&event ); |
326 |
|
327 |
} |
328 |
|
329 |
if (err != CL_SUCCESS) {
|
330 |
printf("clblasDgemm() failed with %d\n", err);
|
331 |
} |
332 |
|
333 |
#else
|
334 |
|
335 |
for (i=0;i<RUNS;i++) |
336 |
{ |
337 |
|
338 |
err = clblasSgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
339 |
dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
340 |
bufC,0,dim,1,&queue,0,NULL,&event ); |
341 |
|
342 |
err = clblasSgemm( clblasRowMajor,clblasTrans,clblasTrans, |
343 |
dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
344 |
bufD,0,dim,1,&queue,0,NULL,&event ); |
345 |
} |
346 |
|
347 |
if (err != CL_SUCCESS) {
|
348 |
printf("clblasSgemm() failed with %d\n", err);
|
349 |
} |
350 |
|
351 |
#endif
|
352 |
|
353 |
/* Wait for calculations to be finished. */
|
354 |
err = clWaitForEvents( 1, &event );
|
355 |
|
356 |
/* Get fourth timer after memory free */
|
357 |
gettimeofday(&tv4, &tz); |
358 |
|
359 |
/* Fetch results of calculations from GPU memory. */
|
360 |
errC = clEnqueueReadBuffer( queue,bufC,CL_TRUE,0,dim*dim * sizeof(*C), |
361 |
C,0,NULL,NULL ); |
362 |
|
363 |
/* Fetch results of calculations from GPU memory. */
|
364 |
errD = clEnqueueReadBuffer( queue,bufD,CL_TRUE,0,dim*dim*sizeof(*D), |
365 |
D,0,NULL,NULL ); |
366 |
|
367 |
/* Release OpenCL memory objects. */
|
368 |
clReleaseMemObject( bufD ); |
369 |
clReleaseMemObject( bufC ); |
370 |
clReleaseMemObject( bufB ); |
371 |
clReleaseMemObject( bufA ); |
372 |
|
373 |
/* Finalize work with clBLAS */
|
374 |
clblasTeardown( ); |
375 |
|
376 |
/* Release OpenCL working objects. */
|
377 |
clReleaseCommandQueue( queue ); |
378 |
clReleaseContext( ctx ); |
379 |
|
380 |
|
381 |
/* Compute with CuBLAS library */
|
382 |
#elif CUBLAS
|
383 |
LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0; |
384 |
cublasStatus stat1, stat2, stat3, stat4; |
385 |
struct timeval tv3,tv4;
|
386 |
|
387 |
/* Order is Row */
|
388 |
/* Have to swap uplo and trans */
|
389 |
char transa='N',transb='T'; |
390 |
|
391 |
printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
|
392 |
RUNS,dim,dim); |
393 |
|
394 |
stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA); |
395 |
stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB); |
396 |
stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC); |
397 |
stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD); |
398 |
|
399 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
400 |
(stat2 != CUBLAS_STATUS_SUCCESS) || |
401 |
(stat3 != CUBLAS_STATUS_SUCCESS) || |
402 |
(stat4 != CUBLAS_STATUS_SUCCESS) ) { |
403 |
wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
|
404 |
cublasFree (devPtrA); |
405 |
cublasFree (devPtrB); |
406 |
cublasFree (devPtrC); |
407 |
cublasFree (devPtrD); |
408 |
return 1; |
409 |
} |
410 |
|
411 |
stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim); |
412 |
stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim); |
413 |
stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim); |
414 |
stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim); |
415 |
|
416 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
417 |
(stat2 != CUBLAS_STATUS_SUCCESS) || |
418 |
(stat3 != CUBLAS_STATUS_SUCCESS) || |
419 |
(stat4 != CUBLAS_STATUS_SUCCESS) ) { |
420 |
wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
|
421 |
cublasFree (devPtrA); |
422 |
cublasFree (devPtrB); |
423 |
cublasFree (devPtrC); |
424 |
cublasFree (devPtrD); |
425 |
return 1; |
426 |
} |
427 |
|
428 |
/* Get third timer after memory operation */
|
429 |
gettimeofday(&tv3, &tz); |
430 |
|
431 |
#ifdef DOUBLE
|
432 |
|
433 |
for (i=0;i<RUNS;i++) |
434 |
{ |
435 |
cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
436 |
devPtrA,dim,beta,devPtrC,dim); |
437 |
cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
438 |
devPtrB,dim,beta,devPtrD,dim); |
439 |
} |
440 |
|
441 |
#else
|
442 |
|
443 |
for (i=0;i<RUNS;i++) |
444 |
{ |
445 |
cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
446 |
devPtrA,dim,beta,devPtrC,dim); |
447 |
cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
448 |
devPtrB,dim,beta,devPtrD,dim); |
449 |
} |
450 |
|
451 |
#endif
|
452 |
|
453 |
stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim); |
454 |
stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim); |
455 |
|
456 |
/* Get fourth timer before memory free */
|
457 |
gettimeofday(&tv4, &tz); |
458 |
|
459 |
cublasFree (devPtrA); |
460 |
cublasFree (devPtrB); |
461 |
cublasFree (devPtrC); |
462 |
cublasFree (devPtrD); |
463 |
|
464 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
|
465 |
wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
|
466 |
} |
467 |
|
468 |
|
469 |
#elif THUNKING
|
470 |
|
471 |
/* Order is Row : Have to swap uplo='U' and trans='N' */
|
472 |
char transa='N',transb='T'; |
473 |
printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
|
474 |
RUNS,dim,dim); |
475 |
|
476 |
#ifdef DOUBLE
|
477 |
|
478 |
for (i=0;i<RUNS;i++) |
479 |
{ |
480 |
CUBLAS_DGEMM(&transa,&transa, |
481 |
&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
482 |
CUBLAS_DGEMM(&transb,&transb, |
483 |
&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
484 |
} |
485 |
|
486 |
#else
|
487 |
|
488 |
for (i=0;i<RUNS;i++) |
489 |
{ |
490 |
CUBLAS_SGEMM(&transa,&transa, |
491 |
&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
492 |
CUBLAS_SGEMM(&transb,&transb, |
493 |
&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
494 |
} |
495 |
|
496 |
#endif
|
497 |
|
498 |
#elif FBLAS
|
499 |
|
500 |
/* Order is Row : Have to swap uplo='U' and trans='N' */
|
501 |
char transa='N',transb='T'; |
502 |
|
503 |
printf("Using FBLAS: %i iterations for %ix%i matrix\n",
|
504 |
RUNS,dim,dim); |
505 |
|
506 |
#ifdef DOUBLE
|
507 |
|
508 |
for (i=0;i<RUNS;i++) |
509 |
{ |
510 |
F77_dgemm(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
511 |
F77_dgemm(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
512 |
} |
513 |
|
514 |
#else
|
515 |
|
516 |
for (i=0;i<RUNS;i++) |
517 |
{ |
518 |
F77_sgemm(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
519 |
F77_sgemm(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
520 |
} |
521 |
|
522 |
#endif
|
523 |
|
524 |
#elif ACML
|
525 |
|
526 |
/* Order is Row : Have to swap uplo='U' and trans='N' */
|
527 |
char transa='N',transb='T'; |
528 |
|
529 |
printf("Using ACML: %i iterations for %ix%i matrix\n",
|
530 |
RUNS,dim,dim); |
531 |
|
532 |
#ifdef DOUBLE
|
533 |
|
534 |
for (i=0;i<RUNS;i++) |
535 |
{ |
536 |
dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
537 |
dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
538 |
} |
539 |
|
540 |
#else
|
541 |
|
542 |
for (i=0;i<RUNS;i++) |
543 |
{ |
544 |
sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
545 |
sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
546 |
} |
547 |
|
548 |
#endif
|
549 |
|
550 |
#elif GSL
|
551 |
|
552 |
printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
553 |
|
554 |
/*
|
555 |
RowMajor : Matrix is read row by row
|
556 |
Upper : the no null elements are on top
|
557 |
NoTrans : no transposition before estimation
|
558 |
NonUnit : Matrix is not unit
|
559 |
*/
|
560 |
|
561 |
#ifdef DOUBLE
|
562 |
|
563 |
for (i=0;i<RUNS;i++) |
564 |
{ |
565 |
cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
566 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
567 |
cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
568 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
569 |
} |
570 |
|
571 |
#else
|
572 |
|
573 |
for (i=0;i<RUNS;i++) |
574 |
{ |
575 |
cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
576 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
577 |
cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
578 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
579 |
} |
580 |
|
581 |
#endif
|
582 |
|
583 |
#else
|
584 |
|
585 |
printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
586 |
|
587 |
/*
|
588 |
RowMajor : Matrix is read row bu row
|
589 |
Upper : the no null elements are on top
|
590 |
NoTrans : no transposition before estimation
|
591 |
NonUnit : Matrix is not unit
|
592 |
*/
|
593 |
|
594 |
#ifdef DOUBLE
|
595 |
|
596 |
for (i=0;i<RUNS;i++) |
597 |
{ |
598 |
cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
599 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
600 |
cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
601 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
602 |
} |
603 |
|
604 |
#else
|
605 |
|
606 |
for (i=0;i<RUNS;i++) |
607 |
{ |
608 |
cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
609 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
610 |
cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
611 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
612 |
} |
613 |
|
614 |
#endif
|
615 |
|
616 |
#endif
|
617 |
|
618 |
/* Get second timer after launching */
|
619 |
gettimeofday(&tv2, &tz); |
620 |
|
621 |
/* Store the checker of errors */
|
622 |
checksA[0]=0.; |
623 |
|
624 |
for (i=0; i<dim; i++) { |
625 |
for (j=0; j<dim; j++) { |
626 |
checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]); |
627 |
} |
628 |
} |
629 |
|
630 |
/* Print the matrix */
|
631 |
|
632 |
#ifdef QUIET
|
633 |
#else
|
634 |
for (i=0; i<dim; i++) { |
635 |
for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]); |
636 |
putchar('\n');
|
637 |
} |
638 |
putchar('\n');
|
639 |
for (i=0; i<dim; i++) { |
640 |
for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]); |
641 |
putchar('\n');
|
642 |
} |
643 |
putchar('\n');
|
644 |
#endif
|
645 |
|
646 |
/* Free 1 Matrix and 2 Vectors of dimension dim */
|
647 |
|
648 |
free(A); |
649 |
free(B); |
650 |
free(C); |
651 |
free(D); |
652 |
|
653 |
putchar('\n');
|
654 |
|
655 |
#ifdef CLBLAS
|
656 |
double memoryIn,memoryOut;
|
657 |
|
658 |
memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
659 |
(tv3.tv_usec-tv1.tv_usec))/1000000.; |
660 |
|
661 |
memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
662 |
(tv2.tv_usec-tv4.tv_usec))/1000000.; |
663 |
|
664 |
duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
665 |
(tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
666 |
|
667 |
printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
668 |
printf("Duration of memory free : %2.10f s\n",memoryOut);
|
669 |
#elif CUBLAS
|
670 |
double memoryIn,memoryOut;
|
671 |
|
672 |
memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
673 |
(tv3.tv_usec-tv1.tv_usec))/1000000.; |
674 |
|
675 |
memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
676 |
(tv2.tv_usec-tv4.tv_usec))/1000000.; |
677 |
|
678 |
duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
679 |
(tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
680 |
|
681 |
printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
682 |
printf("Duration of memory free : %2.10f s\n",memoryOut);
|
683 |
#else
|
684 |
duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + \ |
685 |
(tv2.tv_usec-tv1.tv_usec))/1000000./RUNS; |
686 |
|
687 |
#endif
|
688 |
|
689 |
printf("Duration of each cycle : %2.10f s\n",duration);
|
690 |
|
691 |
printf("Number of GFlops : %2.3f \n",
|
692 |
dim*dim*2.*(2.*dim-1)/duration/1000000000.); |
693 |
|
694 |
printf("Error %1.10f\n",checksA[0]); |
695 |
printResults(RUNS,checksA,"C","Errors cumulated"); |
696 |
|
697 |
putchar('\n');
|
698 |
|
699 |
/* Free 2 vectors for checker Before and After */
|
700 |
|
701 |
free(checksA); |
702 |
free(checksB); |
703 |
|
704 |
return 0; |
705 |
} |
706 |
|
707 |
#ifdef CLBLAS
|
708 |
|
709 |
int DelectOpenCLDevices()
|
710 |
{ |
711 |
/* */
|
712 |
/* Not needed to import CL.h, already done in CLBLAS.h */
|
713 |
|
714 |
int i, j;
|
715 |
char* value;
|
716 |
size_t valueSize; |
717 |
cl_uint platformCount; |
718 |
cl_platform_id* platforms; |
719 |
cl_uint deviceCount; |
720 |
cl_device_id* devices; |
721 |
cl_uint maxComputeUnits; |
722 |
cl_int maxWorkGroupSize; |
723 |
cl_int maxWorkItemSizes; |
724 |
cl_device_type dev_type; |
725 |
|
726 |
// get all platforms
|
727 |
clGetPlatformIDs(0, NULL, &platformCount); |
728 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
729 |
clGetPlatformIDs(platformCount, platforms, NULL);
|
730 |
|
731 |
|
732 |
printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
733 |
|
734 |
for (i = 0; i < platformCount; i++) { |
735 |
|
736 |
// get all devices
|
737 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
738 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
739 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
740 |
|
741 |
// for each device print critical attributes
|
742 |
for (j = 0; j < deviceCount; j++) { |
743 |
|
744 |
// print device name
|
745 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
746 |
value = (char*) malloc(valueSize);
|
747 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
748 |
printf("Device (%d,%d): %s\n",i, j, value);
|
749 |
free(value); |
750 |
|
751 |
// print type device CPU/GPU/ACCELERATOR
|
752 |
clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
753 |
printf("\tDevice Type: ");
|
754 |
if(dev_type & CL_DEVICE_TYPE_GPU)
|
755 |
printf("CL_DEVICE_TYPE_GPU ");
|
756 |
if(dev_type & CL_DEVICE_TYPE_CPU)
|
757 |
printf("CL_DEVICE_TYPE_CPU ");
|
758 |
if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
759 |
printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
760 |
if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
761 |
printf("CL_DEVICE_TYPE_DEFAULT ");
|
762 |
printf("\n");
|
763 |
|
764 |
// print device vendor
|
765 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
766 |
value = (char*) malloc(valueSize);
|
767 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
768 |
printf("\tDevice vendor: %s\n", value);
|
769 |
free(value); |
770 |
|
771 |
// print hardware device version
|
772 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
773 |
value = (char*) malloc(valueSize);
|
774 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
775 |
printf("\tHardware version: %s\n", value);
|
776 |
free(value); |
777 |
|
778 |
// print software driver version
|
779 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
780 |
value = (char*) malloc(valueSize);
|
781 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
782 |
printf("\tSoftware version: %s\n", value);
|
783 |
free(value); |
784 |
|
785 |
// print c version supported by compiler for device
|
786 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
787 |
value = (char*) malloc(valueSize);
|
788 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
789 |
printf("\tOpenCL C version: %s\n", value);
|
790 |
free(value); |
791 |
|
792 |
// print parallel compute units
|
793 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
794 |
sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
795 |
printf("\tParallel compute units: %d\n", maxComputeUnits);
|
796 |
|
797 |
// print max work group size
|
798 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
799 |
sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
800 |
printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
801 |
|
802 |
// print max work items size
|
803 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
804 |
sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
805 |
printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
806 |
|
807 |
} |
808 |
printf("\n");
|
809 |
free(devices); |
810 |
} |
811 |
|
812 |
free(platforms); |
813 |
return 0; |
814 |
|
815 |
} |
816 |
#endif
|
817 |
|
818 |
int main(int argc,char **argv) |
819 |
{ |
820 |
if ((argc==1)|| |
821 |
(strcmp(argv[1],"-h")==0)|| |
822 |
(strcmp(argv[1],"--help")==0)) |
823 |
{ |
824 |
#ifdef CLBLAS
|
825 |
printf("\nPerforms a bench using BLAS library implementation:\n\n"
|
826 |
"\t#1 Size of square matrices \n"
|
827 |
"\t#2 Number of iterations \n"
|
828 |
"\t#3 OpenCL Plateform ID\n"
|
829 |
"\t#4 OpenCL Device ID\n\n");
|
830 |
DelectOpenCLDevices(); |
831 |
#else
|
832 |
printf("\nPerforms a bench using BLAS library implementation:\n\n"
|
833 |
"\t#1 Size of square matrices \n"
|
834 |
"\t#2 Number of iterations\n\n");
|
835 |
#endif
|
836 |
} |
837 |
else if ((atoi(argv[1])>=2)&& |
838 |
(atoi(argv[2])>=1)) |
839 |
{ |
840 |
#ifdef CLBLAS
|
841 |
MyPlatform=atoi(argv[3]);
|
842 |
MyDevice=atoi(argv[4]);
|
843 |
#endif
|
844 |
bench(atoi(argv[1]),atoi(argv[2])); |
845 |
} |
846 |
|
847 |
return 0; |
848 |
} |