root / BLAS / xGEMM / xGEMM.c @ 146
Historique | Voir | Annoter | Télécharger (21,82 ko)
1 |
/*
|
---|---|
2 |
Performs matrix multiply on several BLAS implementations
|
3 |
Copyleft Emmanuel QUEMENER <emmanuel.quemener@gmail.com> under GPLv3
|
4 |
|
5 |
2014-03-14 : Add clBLAS implementation
|
6 |
|
7 |
Thanks for help from aurel32@debian.org
|
8 |
*/
|
9 |
|
10 |
#include <stdio.h> |
11 |
#include <math.h> |
12 |
#include <stdlib.h> |
13 |
#include <sys/time.h> |
14 |
#include <string.h> |
15 |
|
16 |
#ifdef CLBLAS
|
17 |
#include <clBLAS.h> |
18 |
/* Precise here to avoid new specific bench function */
|
19 |
int MyPlatform;
|
20 |
int MyDevice;
|
21 |
#elif CUBLAS
|
22 |
#include <cublas.h> |
23 |
#define CUBLAS_WRAPPER_ERROR_NOERR 0 |
24 |
#define CUBLAS_WRAPPER_ERROR_ALLOC 1 |
25 |
#define CUBLAS_WRAPPER_ERROR_SET 2 |
26 |
#define CUBLAS_WRAPPER_ERROR_GET 3 |
27 |
#define CUBLAS_WRAPPER_ERROR_STUB 4 |
28 |
#elif THUNKING
|
29 |
#include <cublas.h> |
30 |
#include "fortran_common.h" |
31 |
#include "fortran_thunking.h" |
32 |
#elif FBLAS
|
33 |
#include <cblas_f77.h> |
34 |
#elif GSL
|
35 |
#include <gsl_cblas.h> |
36 |
#elif ACML
|
37 |
#include <acml.h> |
38 |
#else
|
39 |
#include <cblas.h> |
40 |
#include <blaswrap.h> |
41 |
#endif
|
42 |
|
43 |
#ifdef CLBLAS
|
44 |
|
45 |
#ifdef DOUBLE
|
46 |
#define LENGTH cl_double
|
47 |
#else
|
48 |
#define LENGTH cl_float
|
49 |
#endif
|
50 |
|
51 |
#else
|
52 |
|
53 |
#ifdef DOUBLE
|
54 |
#define LENGTH double |
55 |
#else
|
56 |
#define LENGTH float |
57 |
#endif
|
58 |
|
59 |
#endif
|
60 |
|
61 |
#ifdef FBLAS
|
62 |
|
63 |
#ifdef DOUBLE
|
64 |
|
65 |
void dgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, |
66 |
const double *, FINT, const double *, double *, FINT); |
67 |
|
68 |
#else
|
69 |
|
70 |
void sgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, |
71 |
const float *, FINT, const float *, float *, FINT); |
72 |
|
73 |
#endif
|
74 |
#endif
|
75 |
|
76 |
/* Matrix with only defined triangular terms */
|
77 |
/* Even if there are 0 in matrix, must be defined at all ! */
|
78 |
|
79 |
/* Get from fortran.c */
|
80 |
|
81 |
#ifdef CUBLAS
|
82 |
static char *errMsg[5] = |
83 |
{ |
84 |
"no error",
|
85 |
"allocation error",
|
86 |
"setVector/setMatrix error",
|
87 |
"getVector/getMatrix error",
|
88 |
"not implemented"
|
89 |
}; |
90 |
|
91 |
static void wrapperError (const char *funcName, int error) |
92 |
{ |
93 |
printf ("cublas%s wrapper: %s\n", funcName, errMsg[error]);
|
94 |
fflush (stdout); |
95 |
} |
96 |
#endif
|
97 |
|
98 |
int printVector(const int dimVector,const LENGTH *dataVector, |
99 |
char *nameVector,char *mesgVector) |
100 |
{ |
101 |
#ifndef QUIET
|
102 |
|
103 |
int i;
|
104 |
printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
105 |
for (i=0;i<dimVector;i++) |
106 |
{ |
107 |
printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
108 |
} |
109 |
#endif
|
110 |
|
111 |
return 0; |
112 |
} |
113 |
|
114 |
int printResults(const int dimVector,const LENGTH *dataVector, |
115 |
char *nameVector,char *mesgVector) |
116 |
{ |
117 |
#ifdef RESULTS
|
118 |
int i;
|
119 |
|
120 |
printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
121 |
for (i=0;i<dimVector;i++) |
122 |
{ |
123 |
printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
124 |
} |
125 |
#endif
|
126 |
return 0; |
127 |
} |
128 |
|
129 |
#ifdef CUBLAS
|
130 |
int printVectorGPU(const int dimVector,const LENGTH *dataVector, |
131 |
char *nameVector,char *mesgVector) |
132 |
{ |
133 |
#ifndef QUIET
|
134 |
int i;
|
135 |
cublasStatus stat; |
136 |
LENGTH *P=0;
|
137 |
int incx=1; |
138 |
|
139 |
P=malloc(dimVector*sizeof(LENGTH));
|
140 |
|
141 |
stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx); |
142 |
|
143 |
if (stat != CUBLAS_STATUS_SUCCESS) {
|
144 |
wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
|
145 |
} |
146 |
|
147 |
printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
148 |
for (i=0;i<dimVector;i++) |
149 |
{ |
150 |
printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
|
151 |
} |
152 |
|
153 |
free(P); |
154 |
#endif
|
155 |
|
156 |
return 0; |
157 |
} |
158 |
#endif
|
159 |
|
160 |
int bench(int dim,int RUNS) |
161 |
{ |
162 |
/*
|
163 |
int dim=1000;
|
164 |
int RUNS=100;
|
165 |
int incx=1;
|
166 |
*/
|
167 |
#ifdef PRINT
|
168 |
LENGTH factor=1.;
|
169 |
#endif
|
170 |
|
171 |
LENGTH alpha=1.,beta=0.; |
172 |
LENGTH *A,*B,*C,*D; |
173 |
|
174 |
/* checkBefore checkAfter checks */
|
175 |
LENGTH *checksA,*checksB; |
176 |
|
177 |
int i=0, j=0; |
178 |
|
179 |
double duration;
|
180 |
|
181 |
struct timeval tv1,tv2;
|
182 |
struct timezone tz;
|
183 |
|
184 |
/* Create 4 Matrix of dimension dim by dim */
|
185 |
|
186 |
A=malloc(dim*dim*sizeof(LENGTH));
|
187 |
B=malloc(dim*dim*sizeof(LENGTH));
|
188 |
C=malloc(dim*dim*sizeof(LENGTH));
|
189 |
D=malloc(dim*dim*sizeof(LENGTH));
|
190 |
|
191 |
/* Create 2 vectors for checker Before and After */
|
192 |
|
193 |
checksA=malloc(RUNS*sizeof(LENGTH));
|
194 |
checksB=malloc(RUNS*sizeof(LENGTH));
|
195 |
|
196 |
/* Initialize elements with random numbers */
|
197 |
/* Initialize the seed for rand() */
|
198 |
/* srand(time()); */
|
199 |
|
200 |
for (i=0; i<dim; i++) { |
201 |
for (j=0; j<dim; j++) { |
202 |
A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
203 |
*(LENGTH)(i+1.)/(LENGTH)(j+1.); |
204 |
B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
205 |
*(LENGTH)(i+1.)/(LENGTH)(j+1.); |
206 |
C[i*dim+j]=0.;
|
207 |
D[i*dim+j]=0.;
|
208 |
} |
209 |
} |
210 |
/*
|
211 |
A[0]=1;
|
212 |
A[1]=2;
|
213 |
A[2]=3;
|
214 |
A[3]=4;
|
215 |
|
216 |
B[0]=5;
|
217 |
B[1]=6;
|
218 |
B[2]=7;
|
219 |
B[3]=8;
|
220 |
*/
|
221 |
|
222 |
/* Print the matrix */
|
223 |
|
224 |
#ifdef QUIET
|
225 |
#else
|
226 |
for (i=0; i<dim; i++) { |
227 |
for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]); |
228 |
putchar('\n');
|
229 |
} |
230 |
putchar('\n');
|
231 |
for (i=0; i<dim; i++) { |
232 |
for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]); |
233 |
putchar('\n');
|
234 |
} |
235 |
putchar('\n');
|
236 |
#endif
|
237 |
|
238 |
/* Get first timer before launching */
|
239 |
gettimeofday(&tv1, &tz); |
240 |
|
241 |
/* Compute with CLBLAS library */
|
242 |
#ifdef CLBLAS
|
243 |
|
244 |
cl_uint platformCount; |
245 |
cl_platform_id* platforms; |
246 |
cl_uint deviceCount; |
247 |
cl_device_id* devices; |
248 |
|
249 |
cl_int err,errA,errB,errC,errD; |
250 |
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
251 |
cl_context ctx = 0;
|
252 |
cl_command_queue queue = 0;
|
253 |
cl_mem bufA, bufB, bufC, bufD; |
254 |
cl_event event = NULL;
|
255 |
|
256 |
char* value;
|
257 |
size_t valueSize; |
258 |
|
259 |
// tv3 Put on Device: Allocate & Write buffer
|
260 |
// tv4 Compute
|
261 |
struct timeval tv3,tv4;
|
262 |
|
263 |
printf("Using CLBLAS: %i iterations for %ix%i matrix on (%d,%d)\n",
|
264 |
RUNS,dim,dim,MyPlatform,MyDevice); |
265 |
|
266 |
/* Setup OpenCL environment. */
|
267 |
/* - get all platforms and select MyPlatform */
|
268 |
/* - get all devices from MyPlatform and select MyDevice */
|
269 |
|
270 |
// Get all platforms
|
271 |
err = clGetPlatformIDs(0, NULL, &platformCount); |
272 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
273 |
err = clGetPlatformIDs(platformCount, platforms, NULL);
|
274 |
|
275 |
// Get Device defined
|
276 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
277 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
278 |
err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
279 |
|
280 |
// print device name
|
281 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
282 |
value = (char*) malloc(valueSize);
|
283 |
err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, value, NULL);
|
284 |
printf("Device (%d,%d): %s\n",MyPlatform,MyDevice, value);
|
285 |
free(value); |
286 |
|
287 |
props[1] = (cl_context_properties)platforms[MyPlatform];
|
288 |
|
289 |
/* Initialize Context */
|
290 |
ctx = clCreateContext( props, 1, &devices[MyDevice], NULL, NULL, &err ); |
291 |
queue = clCreateCommandQueue( ctx, devices[MyDevice], 0, &err );
|
292 |
|
293 |
/* Setup clBLAS */
|
294 |
err = clblasSetup( ); |
295 |
|
296 |
/* Prepare OpenCL memory objects and place matrices inside them. */
|
297 |
bufA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*A),NULL,&errA ); |
298 |
bufB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*B),NULL,&errB ); |
299 |
bufC = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*C),NULL,&errC ); |
300 |
bufD = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*D),NULL,&errD ); |
301 |
|
302 |
errA = clEnqueueWriteBuffer( queue,bufA,CL_TRUE,0,
|
303 |
dim*dim*sizeof(*A),A,0,NULL,NULL ); |
304 |
errB = clEnqueueWriteBuffer( queue, bufB, CL_TRUE,0,
|
305 |
dim*dim*sizeof(*B),B,0,NULL,NULL ); |
306 |
errC = clEnqueueWriteBuffer( queue, bufC, CL_TRUE,0,
|
307 |
dim*dim*sizeof(*C),C,0,NULL,NULL ); |
308 |
errD = clEnqueueWriteBuffer( queue, bufD, CL_TRUE,0,
|
309 |
dim*dim*sizeof(*D),D,0,NULL,NULL ); |
310 |
|
311 |
/* Get third timer after memory operation */
|
312 |
gettimeofday(&tv3, &tz); |
313 |
|
314 |
#ifdef DOUBLE
|
315 |
|
316 |
for (i=0;i<RUNS;i++) |
317 |
{ |
318 |
err = clblasDgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
319 |
dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
320 |
bufC,0,dim,1,&queue,0,NULL,&event ); |
321 |
|
322 |
err = clblasDgemm( clblasRowMajor,clblasTrans,clblasTrans, |
323 |
dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
324 |
bufD,0,dim,1,&queue,0,NULL,&event ); |
325 |
|
326 |
} |
327 |
|
328 |
if (err != CL_SUCCESS) {
|
329 |
printf("clblasDgemm() failed with %d\n", err);
|
330 |
} |
331 |
|
332 |
#else
|
333 |
|
334 |
for (i=0;i<RUNS;i++) |
335 |
{ |
336 |
|
337 |
err = clblasSgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
338 |
dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
339 |
bufC,0,dim,1,&queue,0,NULL,&event ); |
340 |
|
341 |
err = clblasSgemm( clblasRowMajor,clblasTrans,clblasTrans, |
342 |
dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
343 |
bufD,0,dim,1,&queue,0,NULL,&event ); |
344 |
} |
345 |
|
346 |
if (err != CL_SUCCESS) {
|
347 |
printf("clblasSgemm() failed with %d\n", err);
|
348 |
} |
349 |
|
350 |
#endif
|
351 |
|
352 |
/* Wait for calculations to be finished. */
|
353 |
err = clWaitForEvents( 1, &event );
|
354 |
|
355 |
/* Get fourth timer after memory free */
|
356 |
gettimeofday(&tv4, &tz); |
357 |
|
358 |
/* Fetch results of calculations from GPU memory. */
|
359 |
errC = clEnqueueReadBuffer( queue,bufC,CL_TRUE,0,dim*dim * sizeof(*C), |
360 |
C,0,NULL,NULL ); |
361 |
|
362 |
/* Fetch results of calculations from GPU memory. */
|
363 |
errD = clEnqueueReadBuffer( queue,bufD,CL_TRUE,0,dim*dim*sizeof(*D), |
364 |
D,0,NULL,NULL ); |
365 |
|
366 |
/* Release OpenCL memory objects. */
|
367 |
clReleaseMemObject( bufD ); |
368 |
clReleaseMemObject( bufC ); |
369 |
clReleaseMemObject( bufB ); |
370 |
clReleaseMemObject( bufA ); |
371 |
|
372 |
/* Finalize work with clBLAS */
|
373 |
clblasTeardown( ); |
374 |
|
375 |
/* Release OpenCL working objects. */
|
376 |
clReleaseCommandQueue( queue ); |
377 |
clReleaseContext( ctx ); |
378 |
|
379 |
|
380 |
/* Compute with CuBLAS library */
|
381 |
#elif CUBLAS
|
382 |
LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0; |
383 |
cublasStatus stat1, stat2, stat3, stat4; |
384 |
struct timeval tv3,tv4;
|
385 |
|
386 |
/* Order is Row */
|
387 |
/* Have to swap uplo and trans */
|
388 |
char transa='N',transb='T'; |
389 |
|
390 |
printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
|
391 |
RUNS,dim,dim); |
392 |
|
393 |
stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA); |
394 |
stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB); |
395 |
stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC); |
396 |
stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD); |
397 |
|
398 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
399 |
(stat2 != CUBLAS_STATUS_SUCCESS) || |
400 |
(stat3 != CUBLAS_STATUS_SUCCESS) || |
401 |
(stat4 != CUBLAS_STATUS_SUCCESS) ) { |
402 |
wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
|
403 |
cublasFree (devPtrA); |
404 |
cublasFree (devPtrB); |
405 |
cublasFree (devPtrC); |
406 |
cublasFree (devPtrD); |
407 |
return 1; |
408 |
} |
409 |
|
410 |
stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim); |
411 |
stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim); |
412 |
stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim); |
413 |
stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim); |
414 |
|
415 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
416 |
(stat2 != CUBLAS_STATUS_SUCCESS) || |
417 |
(stat3 != CUBLAS_STATUS_SUCCESS) || |
418 |
(stat4 != CUBLAS_STATUS_SUCCESS) ) { |
419 |
wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
|
420 |
cublasFree (devPtrA); |
421 |
cublasFree (devPtrB); |
422 |
cublasFree (devPtrC); |
423 |
cublasFree (devPtrD); |
424 |
return 1; |
425 |
} |
426 |
|
427 |
/* Get third timer after memory operation */
|
428 |
gettimeofday(&tv3, &tz); |
429 |
|
430 |
#ifdef DOUBLE
|
431 |
|
432 |
for (i=0;i<RUNS;i++) |
433 |
{ |
434 |
cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
435 |
devPtrA,dim,beta,devPtrC,dim); |
436 |
cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
437 |
devPtrB,dim,beta,devPtrD,dim); |
438 |
} |
439 |
|
440 |
#else
|
441 |
|
442 |
for (i=0;i<RUNS;i++) |
443 |
{ |
444 |
cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
445 |
devPtrA,dim,beta,devPtrC,dim); |
446 |
cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
447 |
devPtrB,dim,beta,devPtrD,dim); |
448 |
} |
449 |
|
450 |
#endif
|
451 |
|
452 |
stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim); |
453 |
stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim); |
454 |
|
455 |
/* Get fourth timer before memory free */
|
456 |
gettimeofday(&tv4, &tz); |
457 |
|
458 |
cublasFree (devPtrA); |
459 |
cublasFree (devPtrB); |
460 |
cublasFree (devPtrC); |
461 |
cublasFree (devPtrD); |
462 |
|
463 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
|
464 |
wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
|
465 |
} |
466 |
|
467 |
|
468 |
#elif THUNKING
|
469 |
|
470 |
/* Order is Row : Have to swap uplo='U' and trans='N' */
|
471 |
char transa='N',transb='T'; |
472 |
printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
|
473 |
RUNS,dim,dim); |
474 |
|
475 |
#ifdef DOUBLE
|
476 |
|
477 |
for (i=0;i<RUNS;i++) |
478 |
{ |
479 |
CUBLAS_DGEMM(&transa,&transa, |
480 |
&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
481 |
CUBLAS_DGEMM(&transb,&transb, |
482 |
&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
483 |
} |
484 |
|
485 |
#else
|
486 |
|
487 |
for (i=0;i<RUNS;i++) |
488 |
{ |
489 |
CUBLAS_SGEMM(&transa,&transa, |
490 |
&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
491 |
CUBLAS_SGEMM(&transb,&transb, |
492 |
&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
493 |
} |
494 |
|
495 |
#endif
|
496 |
|
497 |
#elif FBLAS
|
498 |
|
499 |
/* Order is Row : Have to swap uplo='U' and trans='N' */
|
500 |
char transa='N',transb='T'; |
501 |
|
502 |
printf("Using FBLAS: %i iterations for %ix%i matrix\n",
|
503 |
RUNS,dim,dim); |
504 |
|
505 |
#ifdef DOUBLE
|
506 |
|
507 |
for (i=0;i<RUNS;i++) |
508 |
{ |
509 |
dgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
510 |
dgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
511 |
} |
512 |
|
513 |
#else
|
514 |
|
515 |
for (i=0;i<RUNS;i++) |
516 |
{ |
517 |
sgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
518 |
sgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
519 |
} |
520 |
|
521 |
#endif
|
522 |
|
523 |
#elif ACML
|
524 |
|
525 |
/* Order is Row : Have to swap uplo='U' and trans='N' */
|
526 |
char transa='N',transb='T'; |
527 |
|
528 |
printf("Using ACML: %i iterations for %ix%i matrix\n",
|
529 |
RUNS,dim,dim); |
530 |
|
531 |
#ifdef DOUBLE
|
532 |
|
533 |
for (i=0;i<RUNS;i++) |
534 |
{ |
535 |
dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
536 |
dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
537 |
} |
538 |
|
539 |
#else
|
540 |
|
541 |
for (i=0;i<RUNS;i++) |
542 |
{ |
543 |
sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
544 |
sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
545 |
} |
546 |
|
547 |
#endif
|
548 |
|
549 |
#elif GSL
|
550 |
|
551 |
printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
552 |
|
553 |
/*
|
554 |
RowMajor : Matrix is read row by row
|
555 |
Upper : the no null elements are on top
|
556 |
NoTrans : no transposition before estimation
|
557 |
NonUnit : Matrix is not unit
|
558 |
*/
|
559 |
|
560 |
#ifdef DOUBLE
|
561 |
|
562 |
for (i=0;i<RUNS;i++) |
563 |
{ |
564 |
cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
565 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
566 |
cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
567 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
568 |
} |
569 |
|
570 |
#else
|
571 |
|
572 |
for (i=0;i<RUNS;i++) |
573 |
{ |
574 |
cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
575 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
576 |
cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
577 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
578 |
} |
579 |
|
580 |
#endif
|
581 |
|
582 |
#else
|
583 |
|
584 |
printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
585 |
|
586 |
/*
|
587 |
RowMajor : Matrix is read row bu row
|
588 |
Upper : the no null elements are on top
|
589 |
NoTrans : no transposition before estimation
|
590 |
NonUnit : Matrix is not unit
|
591 |
*/
|
592 |
|
593 |
#ifdef DOUBLE
|
594 |
|
595 |
for (i=0;i<RUNS;i++) |
596 |
{ |
597 |
cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
598 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
599 |
cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
600 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
601 |
/* dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, */
|
602 |
/* dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); */
|
603 |
/* dgemm(CblasRowMajor,CblasTrans,CblasTrans, */
|
604 |
/* dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); */
|
605 |
} |
606 |
|
607 |
#else
|
608 |
|
609 |
for (i=0;i<RUNS;i++) |
610 |
{ |
611 |
cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
612 |
dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
613 |
cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
614 |
dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
615 |
/* sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, */
|
616 |
/* dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); */
|
617 |
/* sgemm(CblasRowMajor,CblasTrans,CblasTrans, */
|
618 |
/* dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); */
|
619 |
} |
620 |
|
621 |
#endif
|
622 |
|
623 |
#endif
|
624 |
|
625 |
/* Get second timer after launching */
|
626 |
gettimeofday(&tv2, &tz); |
627 |
|
628 |
/* Store the checker of errors */
|
629 |
checksA[0]=0.; |
630 |
|
631 |
for (i=0; i<dim; i++) { |
632 |
for (j=0; j<dim; j++) { |
633 |
checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]); |
634 |
} |
635 |
} |
636 |
|
637 |
/* Print the matrix */
|
638 |
|
639 |
#ifdef QUIET
|
640 |
#else
|
641 |
for (i=0; i<dim; i++) { |
642 |
for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]); |
643 |
putchar('\n');
|
644 |
} |
645 |
putchar('\n');
|
646 |
for (i=0; i<dim; i++) { |
647 |
for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]); |
648 |
putchar('\n');
|
649 |
} |
650 |
putchar('\n');
|
651 |
#endif
|
652 |
|
653 |
/* Free 1 Matrix and 2 Vectors of dimension dim */
|
654 |
|
655 |
free(A); |
656 |
free(B); |
657 |
free(C); |
658 |
free(D); |
659 |
|
660 |
putchar('\n');
|
661 |
|
662 |
#ifdef CLBLAS
|
663 |
double memoryIn,memoryOut;
|
664 |
|
665 |
memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
666 |
(tv3.tv_usec-tv1.tv_usec))/1000000.; |
667 |
|
668 |
memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
669 |
(tv2.tv_usec-tv4.tv_usec))/1000000.; |
670 |
|
671 |
duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
672 |
(tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
673 |
|
674 |
printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
675 |
printf("Duration of memory free : %2.10f s\n",memoryOut);
|
676 |
#elif CUBLAS
|
677 |
double memoryIn,memoryOut;
|
678 |
|
679 |
memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
680 |
(tv3.tv_usec-tv1.tv_usec))/1000000.; |
681 |
|
682 |
memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
683 |
(tv2.tv_usec-tv4.tv_usec))/1000000.; |
684 |
|
685 |
duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
686 |
(tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
687 |
|
688 |
printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
689 |
printf("Duration of memory free : %2.10f s\n",memoryOut);
|
690 |
#else
|
691 |
duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + \ |
692 |
(tv2.tv_usec-tv1.tv_usec))/1000000./RUNS; |
693 |
|
694 |
#endif
|
695 |
|
696 |
printf("Duration of each cycle : %2.10f s\n",duration);
|
697 |
|
698 |
printf("Number of GFlops : %2.3f \n",
|
699 |
dim*dim*2.*(2.*dim-1)/duration/1000000000.); |
700 |
|
701 |
printf("Error %1.10f\n",checksA[0]); |
702 |
printResults(RUNS,checksA,"C","Errors cumulated"); |
703 |
|
704 |
putchar('\n');
|
705 |
|
706 |
/* Free 2 vectors for checker Before and After */
|
707 |
|
708 |
free(checksA); |
709 |
free(checksB); |
710 |
|
711 |
return 0; |
712 |
} |
713 |
|
714 |
#ifdef CLBLAS
|
715 |
|
716 |
int DelectOpenCLDevices()
|
717 |
{ |
718 |
/* */
|
719 |
/* Not needed to import CL.h, already done in CLBLAS.h */
|
720 |
|
721 |
int i, j;
|
722 |
char* value;
|
723 |
size_t valueSize; |
724 |
cl_uint platformCount; |
725 |
cl_platform_id* platforms; |
726 |
cl_uint deviceCount; |
727 |
cl_device_id* devices; |
728 |
cl_uint maxComputeUnits; |
729 |
cl_int maxWorkGroupSize; |
730 |
cl_int maxWorkItemSizes; |
731 |
cl_device_type dev_type; |
732 |
|
733 |
// get all platforms
|
734 |
clGetPlatformIDs(0, NULL, &platformCount); |
735 |
platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
736 |
clGetPlatformIDs(platformCount, platforms, NULL);
|
737 |
|
738 |
|
739 |
printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
740 |
|
741 |
for (i = 0; i < platformCount; i++) { |
742 |
|
743 |
// get all devices
|
744 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
745 |
devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
746 |
clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
747 |
|
748 |
// for each device print critical attributes
|
749 |
for (j = 0; j < deviceCount; j++) { |
750 |
|
751 |
// print device name
|
752 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
753 |
value = (char*) malloc(valueSize);
|
754 |
clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
755 |
printf("Device (%d,%d): %s\n",i, j, value);
|
756 |
free(value); |
757 |
|
758 |
// print type device CPU/GPU/ACCELERATOR
|
759 |
clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
760 |
printf("\tDevice Type: ");
|
761 |
if(dev_type & CL_DEVICE_TYPE_GPU)
|
762 |
printf("CL_DEVICE_TYPE_GPU ");
|
763 |
if(dev_type & CL_DEVICE_TYPE_CPU)
|
764 |
printf("CL_DEVICE_TYPE_CPU ");
|
765 |
if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
766 |
printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
767 |
if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
768 |
printf("CL_DEVICE_TYPE_DEFAULT ");
|
769 |
printf("\n");
|
770 |
|
771 |
// print device vendor
|
772 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
773 |
value = (char*) malloc(valueSize);
|
774 |
clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
775 |
printf("\tDevice vendor: %s\n", value);
|
776 |
free(value); |
777 |
|
778 |
// print hardware device version
|
779 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
780 |
value = (char*) malloc(valueSize);
|
781 |
clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
782 |
printf("\tHardware version: %s\n", value);
|
783 |
free(value); |
784 |
|
785 |
// print software driver version
|
786 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
787 |
value = (char*) malloc(valueSize);
|
788 |
clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
789 |
printf("\tSoftware version: %s\n", value);
|
790 |
free(value); |
791 |
|
792 |
// print c version supported by compiler for device
|
793 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
794 |
value = (char*) malloc(valueSize);
|
795 |
clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
796 |
printf("\tOpenCL C version: %s\n", value);
|
797 |
free(value); |
798 |
|
799 |
// print parallel compute units
|
800 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
801 |
sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
802 |
printf("\tParallel compute units: %d\n", maxComputeUnits);
|
803 |
|
804 |
// print max work group size
|
805 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
806 |
sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
807 |
printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
808 |
|
809 |
// print max work items size
|
810 |
clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
811 |
sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
812 |
printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
813 |
|
814 |
} |
815 |
printf("\n");
|
816 |
free(devices); |
817 |
} |
818 |
|
819 |
free(platforms); |
820 |
return 0; |
821 |
|
822 |
} |
823 |
#endif
|
824 |
|
825 |
int main(int argc,char **argv) |
826 |
{ |
827 |
if ((argc==1)|| |
828 |
(strcmp(argv[1],"-h")==0)|| |
829 |
(strcmp(argv[1],"--help")==0)) |
830 |
{ |
831 |
#ifdef CLBLAS
|
832 |
printf("\nPerforms a bench using BLAS library implementation:\n\n"
|
833 |
"\t#1 Size of square matrices \n"
|
834 |
"\t#2 Number of iterations \n"
|
835 |
"\t#3 OpenCL Plateform ID\n"
|
836 |
"\t#4 OpenCL Device ID\n\n");
|
837 |
DelectOpenCLDevices(); |
838 |
#else
|
839 |
printf("\nPerforms a bench using BLAS library implementation:\n\n"
|
840 |
"\t#1 Size of square matrices \n"
|
841 |
"\t#2 Number of iterations\n\n");
|
842 |
#endif
|
843 |
} |
844 |
else if ((atoi(argv[1])>=2)&& |
845 |
(atoi(argv[2])>=1)) |
846 |
{ |
847 |
#ifdef CLBLAS
|
848 |
MyPlatform=atoi(argv[3]);
|
849 |
MyDevice=atoi(argv[4]);
|
850 |
#endif
|
851 |
bench(atoi(argv[1]),atoi(argv[2])); |
852 |
} |
853 |
|
854 |
return 0; |
855 |
} |