Statistiques
| Révision :

root / BLAS / xGEMM / xGEMM.c @ 250

Historique | Voir | Annoter | Télécharger (21,4 ko)

1 1 equemene
/*
2 51 equemene
   Performs matrix multiply on several BLAS implementations
3 51 equemene
   Copyleft Emmanuel QUEMENER <emmanuel.quemener@gmail.com> under GPLv3
4 1 equemene

5 51 equemene
   2014-03-14 : Add clBLAS implementation
6 51 equemene

7 1 equemene
   Thanks for help from aurel32@debian.org
8 1 equemene
*/
9 1 equemene
10 1 equemene
#include <stdio.h>
11 1 equemene
#include <math.h>
12 1 equemene
#include <stdlib.h>
13 1 equemene
#include <sys/time.h>
14 1 equemene
#include <string.h>
15 1 equemene
16 51 equemene
#ifdef CLBLAS
17 51 equemene
#include <clBLAS.h>
18 53 equemene
/* Precise here to avoid new specific bench function */
19 53 equemene
int MyPlatform;
20 53 equemene
int MyDevice;
21 51 equemene
#elif CUBLAS
22 1 equemene
#include <cublas.h>
23 1 equemene
#define CUBLAS_WRAPPER_ERROR_NOERR      0
24 1 equemene
#define CUBLAS_WRAPPER_ERROR_ALLOC      1
25 1 equemene
#define CUBLAS_WRAPPER_ERROR_SET        2
26 1 equemene
#define CUBLAS_WRAPPER_ERROR_GET        3
27 1 equemene
#define CUBLAS_WRAPPER_ERROR_STUB       4
28 1 equemene
#elif THUNKING
29 1 equemene
#include <cublas.h>
30 7 equemene
#include "fortran_common.h"
31 7 equemene
#include "fortran_thunking.h"
32 1 equemene
#elif FBLAS
33 250 equemene
#include <f77blas.h>
34 1 equemene
#elif GSL
35 1 equemene
#include <gsl_cblas.h>
36 1 equemene
#elif ACML
37 1 equemene
#include <acml.h>
38 1 equemene
#else
39 1 equemene
#include <cblas.h>
40 247 equemene
//#include <blaswrap.h>
41 1 equemene
#endif
42 1 equemene
43 51 equemene
#ifdef CLBLAS
44 51 equemene
45 250 equemene
#ifdef FP64
46 51 equemene
#define LENGTH cl_double
47 51 equemene
#else
48 51 equemene
#define LENGTH cl_float
49 51 equemene
#endif
50 51 equemene
51 51 equemene
#else
52 51 equemene
53 250 equemene
#ifdef FP64
54 1 equemene
#define LENGTH double
55 1 equemene
#else
56 1 equemene
#define LENGTH float
57 1 equemene
#endif
58 1 equemene
59 51 equemene
#endif
60 51 equemene
61 250 equemene
/* #ifdef FBLAS */
62 1 equemene
63 250 equemene
/* #ifdef FP64 */
64 1 equemene
65 250 equemene
/* void F77_dgemm(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT,  */
66 250 equemene
/*                const double *, FINT, const double *, double *, FINT); */
67 1 equemene
68 250 equemene
/* #else */
69 1 equemene
70 250 equemene
/* void F77_sgemm(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT,  */
71 250 equemene
/*                const float *, FINT, const float *, float *, FINT); */
72 1 equemene
73 250 equemene
/* #endif */
74 250 equemene
/* #endif */
75 1 equemene
76 1 equemene
/* Matrix with only defined triangular terms */
77 1 equemene
/* Even if there are 0 in matrix, must be defined at all ! */
78 1 equemene
79 1 equemene
/* Get from fortran.c */
80 1 equemene
81 1 equemene
#ifdef CUBLAS
82 1 equemene
static char *errMsg[5] =
83 1 equemene
{
84 1 equemene
    "no error",
85 1 equemene
    "allocation error",
86 1 equemene
    "setVector/setMatrix error",
87 1 equemene
    "getVector/getMatrix error",
88 1 equemene
    "not implemented"
89 1 equemene
};
90 1 equemene
91 1 equemene
static void wrapperError (const char *funcName, int error)
92 1 equemene
{
93 1 equemene
    printf ("cublas%s wrapper: %s\n", funcName, errMsg[error]);
94 1 equemene
    fflush (stdout);
95 1 equemene
}
96 1 equemene
#endif
97 1 equemene
98 1 equemene
int printVector(const int dimVector,const LENGTH *dataVector,
99 1 equemene
                char *nameVector,char *mesgVector)
100 1 equemene
{
101 1 equemene
#ifndef QUIET
102 1 equemene
103 1 equemene
  int i;
104 1 equemene
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
105 1 equemene
  for (i=0;i<dimVector;i++)
106 1 equemene
    {
107 1 equemene
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
108 1 equemene
    }
109 1 equemene
#endif
110 1 equemene
111 1 equemene
  return 0;
112 1 equemene
}
113 1 equemene
114 1 equemene
int printResults(const int dimVector,const LENGTH *dataVector,
115 1 equemene
                 char *nameVector,char *mesgVector)
116 1 equemene
{
117 1 equemene
#ifdef RESULTS
118 1 equemene
  int i;
119 1 equemene
120 1 equemene
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
121 1 equemene
  for (i=0;i<dimVector;i++)
122 1 equemene
    {
123 1 equemene
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
124 1 equemene
    }
125 1 equemene
#endif
126 1 equemene
  return 0;
127 1 equemene
}
128 1 equemene
129 1 equemene
#ifdef CUBLAS
130 1 equemene
int printVectorGPU(const int dimVector,const LENGTH *dataVector,
131 1 equemene
                   char *nameVector,char *mesgVector)
132 1 equemene
{
133 1 equemene
#ifndef QUIET
134 1 equemene
  int i;
135 1 equemene
  cublasStatus stat;
136 1 equemene
  LENGTH *P=0;
137 1 equemene
  int incx=1;
138 1 equemene
139 1 equemene
  P=malloc(dimVector*sizeof(LENGTH));
140 1 equemene
141 1 equemene
  stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx);
142 1 equemene
143 1 equemene
  if (stat != CUBLAS_STATUS_SUCCESS) {
144 1 equemene
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
145 1 equemene
  }
146 1 equemene
147 1 equemene
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
148 1 equemene
  for (i=0;i<dimVector;i++)
149 1 equemene
    {
150 1 equemene
      printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
151 1 equemene
    }
152 1 equemene
153 1 equemene
  free(P);
154 1 equemene
#endif
155 1 equemene
156 1 equemene
  return 0;
157 1 equemene
}
158 1 equemene
#endif
159 1 equemene
160 1 equemene
int bench(int dim,int RUNS)
161 1 equemene
{
162 1 equemene
  /*
163 1 equemene
  int dim=1000;
164 1 equemene
  int RUNS=100;
165 1 equemene
  int incx=1;
166 1 equemene
  */
167 1 equemene
#ifdef PRINT
168 1 equemene
  LENGTH factor=1.;
169 1 equemene
#endif
170 1 equemene
171 1 equemene
  LENGTH alpha=1.,beta=0.;
172 1 equemene
  LENGTH *A,*B,*C,*D;
173 1 equemene
174 1 equemene
  /* checkBefore checkAfter checks */
175 1 equemene
  LENGTH *checksA,*checksB;
176 1 equemene
177 1 equemene
  int i=0, j=0;
178 1 equemene
179 1 equemene
  double duration;
180 1 equemene
181 1 equemene
  struct timeval tv1,tv2;
182 1 equemene
  struct timezone tz;
183 1 equemene
184 1 equemene
  /* Create 4 Matrix of dimension dim by dim  */
185 1 equemene
186 1 equemene
  A=malloc(dim*dim*sizeof(LENGTH));
187 1 equemene
  B=malloc(dim*dim*sizeof(LENGTH));
188 1 equemene
  C=malloc(dim*dim*sizeof(LENGTH));
189 1 equemene
  D=malloc(dim*dim*sizeof(LENGTH));
190 1 equemene
191 1 equemene
  /* Create 2 vectors for checker Before and After */
192 1 equemene
193 1 equemene
  checksA=malloc(RUNS*sizeof(LENGTH));
194 1 equemene
  checksB=malloc(RUNS*sizeof(LENGTH));
195 1 equemene
196 1 equemene
  /* Initialize elements with random numbers */
197 1 equemene
  /* Initialize the seed for rand() */
198 1 equemene
  /* srand(time()); */
199 1 equemene
200 1 equemene
  for (i=0; i<dim; i++) {
201 1 equemene
    for (j=0; j<dim; j++) {
202 1 equemene
        A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
203 1 equemene
          *(LENGTH)(i+1.)/(LENGTH)(j+1.);
204 1 equemene
        B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
205 1 equemene
          *(LENGTH)(i+1.)/(LENGTH)(j+1.);
206 1 equemene
        C[i*dim+j]=0.;
207 1 equemene
        D[i*dim+j]=0.;
208 1 equemene
    }
209 1 equemene
  }
210 1 equemene
  /*
211 1 equemene
  A[0]=1;
212 1 equemene
  A[1]=2;
213 1 equemene
  A[2]=3;
214 1 equemene
  A[3]=4;
215 1 equemene

216 1 equemene
  B[0]=5;
217 1 equemene
  B[1]=6;
218 1 equemene
  B[2]=7;
219 1 equemene
  B[3]=8;
220 1 equemene
  */
221 1 equemene
222 1 equemene
  /* Print the matrix */
223 1 equemene
224 1 equemene
#ifdef QUIET
225 1 equemene
#else
226 1 equemene
  for (i=0; i<dim; i++) {
227 1 equemene
    for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]);
228 1 equemene
    putchar('\n');
229 1 equemene
  }
230 1 equemene
  putchar('\n');
231 1 equemene
  for (i=0; i<dim; i++) {
232 1 equemene
    for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]);
233 1 equemene
    putchar('\n');
234 1 equemene
  }
235 1 equemene
  putchar('\n');
236 1 equemene
#endif
237 1 equemene
238 1 equemene
 /* Get first timer before launching */
239 1 equemene
  gettimeofday(&tv1, &tz);
240 1 equemene
241 51 equemene
  /* Compute with CLBLAS library  */
242 51 equemene
#ifdef CLBLAS
243 51 equemene
244 53 equemene
  cl_uint platformCount;
245 53 equemene
  cl_platform_id* platforms;
246 53 equemene
  cl_uint deviceCount;
247 53 equemene
  cl_device_id* devices;
248 53 equemene
249 51 equemene
  cl_int err,errA,errB,errC,errD;
250 51 equemene
  cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
251 51 equemene
  cl_context ctx = 0;
252 51 equemene
  cl_command_queue queue = 0;
253 51 equemene
  cl_mem bufA, bufB, bufC, bufD;
254 51 equemene
  cl_event event = NULL;
255 51 equemene
256 53 equemene
  char* value;
257 53 equemene
  size_t valueSize;
258 53 equemene
259 53 equemene
  // tv3 Put on Device: Allocate & Write buffer
260 53 equemene
  // tv4 Compute
261 51 equemene
  struct timeval tv3,tv4;
262 51 equemene
263 53 equemene
  printf("Using CLBLAS: %i iterations for %ix%i matrix on (%d,%d)\n",
264 53 equemene
         RUNS,dim,dim,MyPlatform,MyDevice);
265 51 equemene
266 51 equemene
  /* Setup OpenCL environment. */
267 53 equemene
  /* - get all platforms and select MyPlatform */
268 53 equemene
  /* - get all devices from MyPlatform and select MyDevice */
269 51 equemene
270 53 equemene
  // Get all platforms
271 53 equemene
  err = clGetPlatformIDs(0, NULL, &platformCount);
272 53 equemene
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
273 53 equemene
  err = clGetPlatformIDs(platformCount, platforms, NULL);
274 53 equemene
275 53 equemene
  // Get Device defined
276 53 equemene
  err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
277 53 equemene
  devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
278 53 equemene
  err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
279 53 equemene
280 53 equemene
  // print device name
281 53 equemene
  err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize);
282 53 equemene
  value = (char*) malloc(valueSize);
283 53 equemene
  err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, value, NULL);
284 53 equemene
  printf("Device (%d,%d): %s\n",MyPlatform,MyDevice, value);
285 53 equemene
  free(value);
286 53 equemene
287 53 equemene
  props[1] = (cl_context_properties)platforms[MyPlatform];
288 53 equemene
289 53 equemene
  /* Initialize Context */
290 53 equemene
  ctx = clCreateContext( props, 1, &devices[MyDevice], NULL, NULL, &err );
291 53 equemene
  queue = clCreateCommandQueue( ctx, devices[MyDevice], 0, &err );
292 53 equemene
293 51 equemene
  /* Setup clBLAS */
294 51 equemene
  err = clblasSetup( );
295 51 equemene
296 51 equemene
  /* Prepare OpenCL memory objects and place matrices inside them. */
297 52 equemene
  bufA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*A),NULL,&errA );
298 52 equemene
  bufB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*B),NULL,&errB );
299 52 equemene
  bufC = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*C),NULL,&errC );
300 52 equemene
  bufD = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*D),NULL,&errD );
301 51 equemene
302 52 equemene
  errA = clEnqueueWriteBuffer( queue,bufA,CL_TRUE,0,
303 52 equemene
                               dim*dim*sizeof(*A),A,0,NULL,NULL );
304 52 equemene
  errB = clEnqueueWriteBuffer( queue, bufB, CL_TRUE,0,
305 52 equemene
                               dim*dim*sizeof(*B),B,0,NULL,NULL );
306 52 equemene
  errC = clEnqueueWriteBuffer( queue, bufC, CL_TRUE,0,
307 54 equemene
                                 dim*dim*sizeof(*C),C,0,NULL,NULL );
308 52 equemene
  errD = clEnqueueWriteBuffer( queue, bufD, CL_TRUE,0,
309 54 equemene
                                 dim*dim*sizeof(*D),D,0,NULL,NULL );
310 51 equemene
311 51 equemene
  /* Get third timer after memory operation */
312 51 equemene
  gettimeofday(&tv3, &tz);
313 51 equemene
314 250 equemene
#ifdef FP64
315 51 equemene
316 51 equemene
  for (i=0;i<RUNS;i++)
317 51 equemene
    {
318 52 equemene
      err = clblasDgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans,
319 52 equemene
                         dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta,
320 52 equemene
                         bufC,0,dim,1,&queue,0,NULL,&event );
321 51 equemene
322 52 equemene
      err = clblasDgemm( clblasRowMajor,clblasTrans,clblasTrans,
323 52 equemene
                         dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta,
324 52 equemene
                         bufD,0,dim,1,&queue,0,NULL,&event );
325 51 equemene
326 51 equemene
    }
327 51 equemene
328 53 equemene
  if (err != CL_SUCCESS) {
329 53 equemene
    printf("clblasDgemm() failed with %d\n", err);
330 53 equemene
  }
331 53 equemene
332 51 equemene
#else
333 51 equemene
334 51 equemene
  for (i=0;i<RUNS;i++)
335 51 equemene
    {
336 51 equemene
337 52 equemene
      err = clblasSgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans,
338 52 equemene
                         dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta,
339 52 equemene
                         bufC,0,dim,1,&queue,0,NULL,&event );
340 51 equemene
341 52 equemene
      err = clblasSgemm( clblasRowMajor,clblasTrans,clblasTrans,
342 52 equemene
                         dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta,
343 52 equemene
                         bufD,0,dim,1,&queue,0,NULL,&event );
344 51 equemene
    }
345 51 equemene
346 53 equemene
  if (err != CL_SUCCESS) {
347 53 equemene
    printf("clblasSgemm() failed with %d\n", err);
348 53 equemene
  }
349 53 equemene
350 51 equemene
#endif
351 51 equemene
352 51 equemene
  /* Wait for calculations to be finished. */
353 51 equemene
  err = clWaitForEvents( 1, &event );
354 51 equemene
355 53 equemene
  /* Get fourth timer after memory free */
356 53 equemene
  gettimeofday(&tv4, &tz);
357 53 equemene
358 51 equemene
  /* Fetch results of calculations from GPU memory. */
359 53 equemene
  errC = clEnqueueReadBuffer( queue,bufC,CL_TRUE,0,dim*dim * sizeof(*C),
360 52 equemene
                             C,0,NULL,NULL );
361 51 equemene
362 51 equemene
  /* Fetch results of calculations from GPU memory. */
363 53 equemene
  errD = clEnqueueReadBuffer( queue,bufD,CL_TRUE,0,dim*dim*sizeof(*D),
364 52 equemene
                             D,0,NULL,NULL );
365 51 equemene
366 51 equemene
  /* Release OpenCL memory objects. */
367 51 equemene
  clReleaseMemObject( bufD );
368 51 equemene
  clReleaseMemObject( bufC );
369 51 equemene
  clReleaseMemObject( bufB );
370 51 equemene
  clReleaseMemObject( bufA );
371 51 equemene
372 51 equemene
  /* Finalize work with clBLAS */
373 51 equemene
  clblasTeardown( );
374 51 equemene
375 51 equemene
  /* Release OpenCL working objects. */
376 51 equemene
  clReleaseCommandQueue( queue );
377 51 equemene
  clReleaseContext( ctx );
378 51 equemene
379 51 equemene
380 1 equemene
  /* Compute with CuBLAS library  */
381 51 equemene
#elif CUBLAS
382 1 equemene
  LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0;
383 1 equemene
  cublasStatus stat1, stat2, stat3, stat4;
384 1 equemene
  struct timeval tv3,tv4;
385 1 equemene
386 1 equemene
  /* Order is Row */
387 1 equemene
  /* Have to swap uplo and trans */
388 1 equemene
  char transa='N',transb='T';
389 1 equemene
390 1 equemene
  printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
391 1 equemene
         RUNS,dim,dim);
392 1 equemene
393 1 equemene
  stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA);
394 1 equemene
  stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB);
395 1 equemene
  stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC);
396 1 equemene
  stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD);
397 1 equemene
398 1 equemene
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
399 1 equemene
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
400 1 equemene
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
401 1 equemene
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
402 5 equemene
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
403 1 equemene
    cublasFree (devPtrA);
404 1 equemene
    cublasFree (devPtrB);
405 1 equemene
    cublasFree (devPtrC);
406 1 equemene
    cublasFree (devPtrD);
407 1 equemene
    return 1;
408 1 equemene
  }
409 1 equemene
410 1 equemene
  stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim);
411 1 equemene
  stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim);
412 1 equemene
  stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim);
413 1 equemene
  stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim);
414 1 equemene
415 1 equemene
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
416 1 equemene
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
417 1 equemene
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
418 1 equemene
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
419 5 equemene
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
420 1 equemene
    cublasFree (devPtrA);
421 1 equemene
    cublasFree (devPtrB);
422 1 equemene
    cublasFree (devPtrC);
423 1 equemene
    cublasFree (devPtrD);
424 1 equemene
    return 1;
425 1 equemene
  }
426 1 equemene
427 1 equemene
  /* Get third timer after memory operation */
428 1 equemene
  gettimeofday(&tv3, &tz);
429 1 equemene
430 250 equemene
#ifdef FP64
431 1 equemene
432 1 equemene
  for (i=0;i<RUNS;i++)
433 1 equemene
    {
434 1 equemene
      cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
435 1 equemene
                  devPtrA,dim,beta,devPtrC,dim);
436 1 equemene
      cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
437 1 equemene
                  devPtrB,dim,beta,devPtrD,dim);
438 1 equemene
    }
439 1 equemene
440 1 equemene
#else
441 1 equemene
442 1 equemene
  for (i=0;i<RUNS;i++)
443 1 equemene
    {
444 1 equemene
      cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
445 1 equemene
                  devPtrA,dim,beta,devPtrC,dim);
446 1 equemene
      cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
447 1 equemene
                  devPtrB,dim,beta,devPtrD,dim);
448 1 equemene
    }
449 1 equemene
450 1 equemene
#endif
451 1 equemene
452 1 equemene
  stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim);
453 1 equemene
  stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim);
454 1 equemene
455 53 equemene
  /* Get fourth timer before memory free */
456 53 equemene
  gettimeofday(&tv4, &tz);
457 53 equemene
458 1 equemene
  cublasFree (devPtrA);
459 1 equemene
  cublasFree (devPtrB);
460 1 equemene
  cublasFree (devPtrC);
461 1 equemene
  cublasFree (devPtrD);
462 1 equemene
463 1 equemene
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
464 5 equemene
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
465 1 equemene
  }
466 1 equemene
467 1 equemene
468 1 equemene
#elif THUNKING
469 1 equemene
470 1 equemene
  /* Order is Row : Have to swap uplo='U' and trans='N' */
471 1 equemene
  char transa='N',transb='T';
472 1 equemene
  printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
473 1 equemene
         RUNS,dim,dim);
474 1 equemene
475 250 equemene
#ifdef FP64
476 1 equemene
477 1 equemene
  for (i=0;i<RUNS;i++)
478 1 equemene
    {
479 1 equemene
      CUBLAS_DGEMM(&transa,&transa,
480 7 equemene
                         &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
481 1 equemene
      CUBLAS_DGEMM(&transb,&transb,
482 7 equemene
                         &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
483 1 equemene
    }
484 1 equemene
485 1 equemene
#else
486 1 equemene
487 1 equemene
  for (i=0;i<RUNS;i++)
488 1 equemene
    {
489 1 equemene
      CUBLAS_SGEMM(&transa,&transa,
490 7 equemene
                         &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
491 1 equemene
      CUBLAS_SGEMM(&transb,&transb,
492 7 equemene
                         &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
493 1 equemene
    }
494 1 equemene
495 1 equemene
#endif
496 1 equemene
497 1 equemene
#elif FBLAS
498 1 equemene
499 1 equemene
  /* Order is Row : Have to swap uplo='U' and trans='N' */
500 1 equemene
      char transa='N',transb='T';
501 1 equemene
502 1 equemene
  printf("Using FBLAS: %i iterations for %ix%i matrix\n",
503 1 equemene
         RUNS,dim,dim);
504 1 equemene
505 250 equemene
#ifdef FP64
506 1 equemene
507 1 equemene
  for (i=0;i<RUNS;i++)
508 1 equemene
    {
509 250 equemene
      dgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
510 250 equemene
      dgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
511 1 equemene
    }
512 1 equemene
513 1 equemene
#else
514 1 equemene
515 1 equemene
  for (i=0;i<RUNS;i++)
516 1 equemene
    {
517 250 equemene
      sgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
518 250 equemene
      sgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
519 1 equemene
    }
520 1 equemene
521 1 equemene
#endif
522 1 equemene
523 1 equemene
#elif ACML
524 1 equemene
525 1 equemene
  /* Order is Row : Have to swap uplo='U' and trans='N' */
526 1 equemene
      char transa='N',transb='T';
527 1 equemene
528 1 equemene
  printf("Using ACML: %i iterations for %ix%i matrix\n",
529 1 equemene
         RUNS,dim,dim);
530 1 equemene
531 250 equemene
#ifdef FP64
532 1 equemene
533 1 equemene
  for (i=0;i<RUNS;i++)
534 1 equemene
    {
535 1 equemene
      dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
536 1 equemene
      dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
537 1 equemene
    }
538 1 equemene
539 1 equemene
#else
540 1 equemene
541 1 equemene
  for (i=0;i<RUNS;i++)
542 1 equemene
    {
543 1 equemene
      sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
544 1 equemene
      sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
545 1 equemene
    }
546 1 equemene
547 1 equemene
#endif
548 1 equemene
549 1 equemene
#elif GSL
550 1 equemene
551 1 equemene
  printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
552 1 equemene
553 1 equemene
  /*
554 1 equemene
     RowMajor : Matrix is read row by row
555 1 equemene
     Upper : the no null elements are on top
556 1 equemene
     NoTrans : no transposition before estimation
557 1 equemene
     NonUnit : Matrix is not unit
558 1 equemene
   */
559 1 equemene
560 250 equemene
#ifdef FP64
561 1 equemene
562 1 equemene
  for (i=0;i<RUNS;i++)
563 1 equemene
    {
564 1 equemene
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
565 1 equemene
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
566 1 equemene
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
567 1 equemene
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
568 1 equemene
    }
569 1 equemene
570 1 equemene
#else
571 1 equemene
572 1 equemene
  for (i=0;i<RUNS;i++)
573 1 equemene
    {
574 1 equemene
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
575 1 equemene
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
576 1 equemene
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
577 1 equemene
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
578 1 equemene
    }
579 1 equemene
580 1 equemene
#endif
581 1 equemene
582 1 equemene
#else
583 1 equemene
584 1 equemene
  printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
585 1 equemene
586 1 equemene
  /*
587 1 equemene
     RowMajor : Matrix is read row bu row
588 1 equemene
     Upper : the no null elements are on top
589 1 equemene
     NoTrans : no transposition before estimation
590 1 equemene
     NonUnit : Matrix is not unit
591 1 equemene
   */
592 1 equemene
593 250 equemene
#ifdef FP64
594 1 equemene
595 1 equemene
  for (i=0;i<RUNS;i++)
596 1 equemene
    {
597 1 equemene
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
598 1 equemene
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
599 1 equemene
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
600 1 equemene
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
601 1 equemene
    }
602 1 equemene
603 1 equemene
#else
604 1 equemene
605 1 equemene
  for (i=0;i<RUNS;i++)
606 1 equemene
    {
607 1 equemene
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
608 1 equemene
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
609 1 equemene
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
610 1 equemene
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
611 1 equemene
    }
612 1 equemene
613 1 equemene
#endif
614 1 equemene
615 1 equemene
#endif
616 1 equemene
617 1 equemene
  /* Get second timer after launching */
618 1 equemene
  gettimeofday(&tv2, &tz);
619 1 equemene
620 1 equemene
  /* Store the checker of errors */
621 1 equemene
  checksA[0]=0.;
622 1 equemene
623 1 equemene
  for (i=0; i<dim; i++) {
624 1 equemene
    for (j=0; j<dim; j++) {
625 1 equemene
      checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]);
626 1 equemene
    }
627 1 equemene
  }
628 1 equemene
629 1 equemene
  /* Print the matrix */
630 1 equemene
631 1 equemene
#ifdef QUIET
632 1 equemene
#else
633 1 equemene
  for (i=0; i<dim; i++) {
634 1 equemene
    for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]);
635 1 equemene
    putchar('\n');
636 1 equemene
  }
637 1 equemene
  putchar('\n');
638 1 equemene
  for (i=0; i<dim; i++) {
639 1 equemene
    for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]);
640 1 equemene
    putchar('\n');
641 1 equemene
  }
642 1 equemene
  putchar('\n');
643 1 equemene
#endif
644 1 equemene
645 1 equemene
  /* Free 1 Matrix and 2 Vectors of dimension dim  */
646 1 equemene
647 1 equemene
  free(A);
648 1 equemene
  free(B);
649 1 equemene
  free(C);
650 1 equemene
  free(D);
651 1 equemene
652 1 equemene
  putchar('\n');
653 1 equemene
654 53 equemene
#ifdef CLBLAS
655 1 equemene
  double memoryIn,memoryOut;
656 1 equemene
657 1 equemene
  memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L +        \
658 1 equemene
                    (tv3.tv_usec-tv1.tv_usec))/1000000.;
659 1 equemene
660 1 equemene
  memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L +        \
661 1 equemene
                    (tv2.tv_usec-tv4.tv_usec))/1000000.;
662 1 equemene
663 1 equemene
  duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L +        \
664 1 equemene
                    (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS;
665 1 equemene
666 1 equemene
  printf("Duration of memory allocation : %2.10f s\n",memoryIn);
667 1 equemene
  printf("Duration of memory free : %2.10f s\n",memoryOut);
668 53 equemene
#elif CUBLAS
669 53 equemene
  double memoryIn,memoryOut;
670 53 equemene
671 53 equemene
  memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L +        \
672 53 equemene
                    (tv3.tv_usec-tv1.tv_usec))/1000000.;
673 53 equemene
674 53 equemene
  memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L +        \
675 53 equemene
                    (tv2.tv_usec-tv4.tv_usec))/1000000.;
676 53 equemene
677 53 equemene
  duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L +        \
678 53 equemene
                    (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS;
679 53 equemene
680 53 equemene
  printf("Duration of memory allocation : %2.10f s\n",memoryIn);
681 53 equemene
  printf("Duration of memory free : %2.10f s\n",memoryOut);
682 1 equemene
#else
683 1 equemene
  duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L +        \
684 1 equemene
                    (tv2.tv_usec-tv1.tv_usec))/1000000./RUNS;
685 1 equemene
686 1 equemene
#endif
687 1 equemene
688 1 equemene
  printf("Duration of each cycle : %2.10f s\n",duration);
689 1 equemene
690 1 equemene
  printf("Number of GFlops : %2.3f \n",
691 1 equemene
         dim*dim*2.*(2.*dim-1)/duration/1000000000.);
692 1 equemene
693 1 equemene
  printf("Error %1.10f\n",checksA[0]);
694 1 equemene
  printResults(RUNS,checksA,"C","Errors cumulated");
695 1 equemene
696 1 equemene
  putchar('\n');
697 1 equemene
698 1 equemene
  /* Free 2 vectors for checker Before and After */
699 1 equemene
700 1 equemene
  free(checksA);
701 1 equemene
  free(checksB);
702 1 equemene
703 1 equemene
  return 0;
704 1 equemene
}
705 1 equemene
706 53 equemene
#ifdef CLBLAS
707 53 equemene
708 53 equemene
int DelectOpenCLDevices()
709 53 equemene
{
710 53 equemene
  /* */
711 53 equemene
  /* Not needed to import CL.h, already done in CLBLAS.h */
712 53 equemene
713 53 equemene
  int i, j;
714 53 equemene
  char* value;
715 53 equemene
  size_t valueSize;
716 53 equemene
  cl_uint platformCount;
717 53 equemene
  cl_platform_id* platforms;
718 53 equemene
  cl_uint deviceCount;
719 53 equemene
  cl_device_id* devices;
720 53 equemene
  cl_uint maxComputeUnits;
721 53 equemene
  cl_int maxWorkGroupSize;
722 53 equemene
  cl_int maxWorkItemSizes;
723 53 equemene
  cl_device_type dev_type;
724 53 equemene
725 53 equemene
  // get all platforms
726 53 equemene
  clGetPlatformIDs(0, NULL, &platformCount);
727 53 equemene
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
728 53 equemene
  clGetPlatformIDs(platformCount, platforms, NULL);
729 53 equemene
730 53 equemene
731 53 equemene
  printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
732 53 equemene
733 53 equemene
  for (i = 0; i < platformCount; i++) {
734 53 equemene
735 53 equemene
    // get all devices
736 53 equemene
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
737 53 equemene
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
738 53 equemene
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
739 53 equemene
740 53 equemene
    // for each device print critical attributes
741 53 equemene
    for (j = 0; j < deviceCount; j++) {
742 53 equemene
743 53 equemene
      // print device name
744 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
745 53 equemene
      value = (char*) malloc(valueSize);
746 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
747 53 equemene
      printf("Device (%d,%d): %s\n",i, j, value);
748 53 equemene
      free(value);
749 53 equemene
750 53 equemene
      // print type device CPU/GPU/ACCELERATOR
751 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
752 53 equemene
      printf("\tDevice Type: ");
753 53 equemene
      if(dev_type & CL_DEVICE_TYPE_GPU)
754 53 equemene
        printf("CL_DEVICE_TYPE_GPU ");
755 53 equemene
      if(dev_type & CL_DEVICE_TYPE_CPU)
756 53 equemene
        printf("CL_DEVICE_TYPE_CPU ");
757 53 equemene
      if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
758 53 equemene
        printf("CL_DEVICE_TYPE_ACCELERATOR ");
759 53 equemene
      if(dev_type & CL_DEVICE_TYPE_DEFAULT)
760 53 equemene
        printf("CL_DEVICE_TYPE_DEFAULT ");
761 53 equemene
      printf("\n");
762 53 equemene
763 53 equemene
      // print device vendor
764 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
765 53 equemene
      value = (char*) malloc(valueSize);
766 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
767 53 equemene
      printf("\tDevice vendor: %s\n", value);
768 53 equemene
      free(value);
769 53 equemene
770 53 equemene
      // print hardware device version
771 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);
772 53 equemene
      value = (char*) malloc(valueSize);
773 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
774 53 equemene
      printf("\tHardware version: %s\n", value);
775 53 equemene
      free(value);
776 53 equemene
777 53 equemene
      // print software driver version
778 53 equemene
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);
779 53 equemene
      value = (char*) malloc(valueSize);
780 53 equemene
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
781 53 equemene
      printf("\tSoftware version: %s\n", value);
782 53 equemene
      free(value);
783 53 equemene
784 53 equemene
      // print c version supported by compiler for device
785 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize);
786 53 equemene
      value = (char*) malloc(valueSize);
787 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
788 53 equemene
      printf("\tOpenCL C version: %s\n", value);
789 53 equemene
      free(value);
790 53 equemene
791 53 equemene
      // print parallel compute units
792 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,
793 53 equemene
                      sizeof(maxComputeUnits), &maxComputeUnits, NULL);
794 53 equemene
      printf("\tParallel compute units: %d\n", maxComputeUnits);
795 53 equemene
796 53 equemene
      // print max work group size
797 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE,
798 53 equemene
                      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
799 53 equemene
      printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
800 53 equemene
801 53 equemene
      // print max work items size
802 53 equemene
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES,
803 53 equemene
                      sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL);
804 53 equemene
      printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
805 53 equemene
806 53 equemene
    }
807 53 equemene
    printf("\n");
808 53 equemene
    free(devices);
809 53 equemene
  }
810 53 equemene
811 53 equemene
  free(platforms);
812 53 equemene
  return 0;
813 53 equemene
814 53 equemene
}
815 53 equemene
#endif
816 53 equemene
817 1 equemene
int main(int argc,char **argv)
818 1 equemene
{
819 1 equemene
  if ((argc==1)||
820 1 equemene
      (strcmp(argv[1],"-h")==0)||
821 1 equemene
      (strcmp(argv[1],"--help")==0))
822 1 equemene
    {
823 53 equemene
#ifdef CLBLAS
824 1 equemene
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
825 1 equemene
             "\t#1 Size of square matrices \n"
826 53 equemene
             "\t#2 Number of iterations \n"
827 53 equemene
             "\t#3 OpenCL Plateform ID\n"
828 53 equemene
             "\t#4 OpenCL Device ID\n\n");
829 53 equemene
      DelectOpenCLDevices();
830 53 equemene
#else
831 53 equemene
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
832 53 equemene
             "\t#1 Size of square matrices \n"
833 1 equemene
             "\t#2 Number of iterations\n\n");
834 53 equemene
#endif
835 1 equemene
    }
836 1 equemene
  else if ((atoi(argv[1])>=2)&&
837 1 equemene
           (atoi(argv[2])>=1))
838 1 equemene
    {
839 53 equemene
#ifdef CLBLAS
840 53 equemene
      MyPlatform=atoi(argv[3]);
841 53 equemene
      MyDevice=atoi(argv[4]);
842 53 equemene
#endif
843 1 equemene
      bench(atoi(argv[1]),atoi(argv[2]));
844 1 equemene
    }
845 1 equemene
846 1 equemene
  return 0;
847 1 equemene
}