Statistiques
| Révision :

root / BLAS / xGEMM / xGEMM.c @ 146

Historique | Voir | Annoter | Télécharger (21,82 ko)

1
/* 
2
   Performs matrix multiply on several BLAS implementations 
3
   Copyleft Emmanuel QUEMENER <emmanuel.quemener@gmail.com> under GPLv3
4

5
   2014-03-14 : Add clBLAS implementation
6

7
   Thanks for help from aurel32@debian.org
8
*/
9

    
10
#include <stdio.h>
11
#include <math.h>
12
#include <stdlib.h>
13
#include <sys/time.h>
14
#include <string.h>
15

    
16
#ifdef CLBLAS
17
#include <clBLAS.h>
18
/* Precise here to avoid new specific bench function */
19
int MyPlatform;
20
int MyDevice;
21
#elif CUBLAS
22
#include <cublas.h>
23
#define CUBLAS_WRAPPER_ERROR_NOERR      0
24
#define CUBLAS_WRAPPER_ERROR_ALLOC      1
25
#define CUBLAS_WRAPPER_ERROR_SET        2
26
#define CUBLAS_WRAPPER_ERROR_GET        3
27
#define CUBLAS_WRAPPER_ERROR_STUB       4
28
#elif THUNKING
29
#include <cublas.h>
30
#include "fortran_common.h"
31
#include "fortran_thunking.h"
32
#elif FBLAS
33
#include <cblas_f77.h>
34
#elif GSL
35
#include <gsl_cblas.h>
36
#elif ACML
37
#include <acml.h>
38
#else
39
#include <cblas.h>
40
#include <blaswrap.h>
41
#endif
42

    
43
#ifdef CLBLAS
44

    
45
#ifdef DOUBLE
46
#define LENGTH cl_double
47
#else
48
#define LENGTH cl_float
49
#endif
50

    
51
#else
52

    
53
#ifdef DOUBLE
54
#define LENGTH double
55
#else
56
#define LENGTH float
57
#endif
58

    
59
#endif
60

    
61
#ifdef FBLAS
62

    
63
#ifdef DOUBLE
64

    
65
void dgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, 
66
               const double *, FINT, const double *, double *, FINT);
67

    
68
#else
69

    
70
void sgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, 
71
               const float *, FINT, const float *, float *, FINT);
72

    
73
#endif
74
#endif
75

    
76
/* Matrix with only defined triangular terms */
77
/* Even if there are 0 in matrix, must be defined at all ! */
78

    
79
/* Get from fortran.c */
80

    
81
#ifdef CUBLAS
82
static char *errMsg[5] = 
83
{
84
    "no error",
85
    "allocation error",
86
    "setVector/setMatrix error",
87
    "getVector/getMatrix error",
88
    "not implemented"
89
};
90

    
91
static void wrapperError (const char *funcName, int error)
92
{
93
    printf ("cublas%s wrapper: %s\n", funcName, errMsg[error]);
94
    fflush (stdout);
95
}
96
#endif
97

    
98
int printVector(const int dimVector,const LENGTH *dataVector,
99
                char *nameVector,char *mesgVector)
100
{
101
#ifndef QUIET
102

    
103
  int i;
104
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
105
  for (i=0;i<dimVector;i++)
106
    {
107
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
108
    }
109
#endif
110

    
111
  return 0;
112
}
113
  
114
int printResults(const int dimVector,const LENGTH *dataVector,
115
                 char *nameVector,char *mesgVector)
116
{
117
#ifdef RESULTS
118
  int i;
119

    
120
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
121
  for (i=0;i<dimVector;i++)
122
    {
123
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
124
    }
125
#endif
126
  return 0;
127
}
128
  
129
#ifdef CUBLAS
130
int printVectorGPU(const int dimVector,const LENGTH *dataVector,
131
                   char *nameVector,char *mesgVector)
132
{
133
#ifndef QUIET
134
  int i;
135
  cublasStatus stat;
136
  LENGTH *P=0;
137
  int incx=1;
138

    
139
  P=malloc(dimVector*sizeof(LENGTH));
140
  
141
  stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx);
142

    
143
  if (stat != CUBLAS_STATUS_SUCCESS) {
144
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
145
  }  
146

    
147
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
148
  for (i=0;i<dimVector;i++)
149
    {
150
      printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
151
    }
152

    
153
  free(P);  
154
#endif
155

    
156
  return 0;
157
}
158
#endif
159

    
160
int bench(int dim,int RUNS)
161
{
162
  /*
163
  int dim=1000;
164
  int RUNS=100;
165
  int incx=1;
166
  */
167
#ifdef PRINT
168
  LENGTH factor=1.;
169
#endif
170

    
171
  LENGTH alpha=1.,beta=0.;
172
  LENGTH *A,*B,*C,*D;
173

    
174
  /* checkBefore checkAfter checks */
175
  LENGTH *checksA,*checksB;
176

    
177
  int i=0, j=0;
178

    
179
  double duration;
180

    
181
  struct timeval tv1,tv2;
182
  struct timezone tz;
183

    
184
  /* Create 4 Matrix of dimension dim by dim  */
185

    
186
  A=malloc(dim*dim*sizeof(LENGTH));
187
  B=malloc(dim*dim*sizeof(LENGTH));
188
  C=malloc(dim*dim*sizeof(LENGTH));
189
  D=malloc(dim*dim*sizeof(LENGTH));
190

    
191
  /* Create 2 vectors for checker Before and After */
192

    
193
  checksA=malloc(RUNS*sizeof(LENGTH));
194
  checksB=malloc(RUNS*sizeof(LENGTH));
195

    
196
  /* Initialize elements with random numbers */
197
  /* Initialize the seed for rand() */
198
  /* srand(time()); */
199

    
200
  for (i=0; i<dim; i++) {
201
    for (j=0; j<dim; j++) {
202
        A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
203
          *(LENGTH)(i+1.)/(LENGTH)(j+1.); 
204
        B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
205
          *(LENGTH)(i+1.)/(LENGTH)(j+1.);
206
        C[i*dim+j]=0.;
207
        D[i*dim+j]=0.;
208
    }
209
  }
210
  /*
211
  A[0]=1;
212
  A[1]=2;
213
  A[2]=3;
214
  A[3]=4;
215
  
216
  B[0]=5;
217
  B[1]=6;
218
  B[2]=7;
219
  B[3]=8;
220
  */
221

    
222
  /* Print the matrix */
223
 
224
#ifdef QUIET
225
#else
226
  for (i=0; i<dim; i++) {
227
    for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]);
228
    putchar('\n');
229
  }
230
  putchar('\n');
231
  for (i=0; i<dim; i++) {
232
    for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]);
233
    putchar('\n');
234
  }
235
  putchar('\n');
236
#endif
237

    
238
 /* Get first timer before launching */
239
  gettimeofday(&tv1, &tz);
240

    
241
  /* Compute with CLBLAS library  */
242
#ifdef CLBLAS
243

    
244
  cl_uint platformCount;
245
  cl_platform_id* platforms;
246
  cl_uint deviceCount;
247
  cl_device_id* devices;
248

    
249
  cl_int err,errA,errB,errC,errD;
250
  cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
251
  cl_context ctx = 0;
252
  cl_command_queue queue = 0;
253
  cl_mem bufA, bufB, bufC, bufD;
254
  cl_event event = NULL;
255

    
256
  char* value;
257
  size_t valueSize;
258

    
259
  // tv3 Put on Device: Allocate & Write buffer
260
  // tv4 Compute
261
  struct timeval tv3,tv4;
262

    
263
  printf("Using CLBLAS: %i iterations for %ix%i matrix on (%d,%d)\n",
264
         RUNS,dim,dim,MyPlatform,MyDevice);
265

    
266
  /* Setup OpenCL environment. */
267
  /* - get all platforms and select MyPlatform */
268
  /* - get all devices from MyPlatform and select MyDevice */
269

    
270
  // Get all platforms
271
  err = clGetPlatformIDs(0, NULL, &platformCount);
272
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
273
  err = clGetPlatformIDs(platformCount, platforms, NULL);
274

    
275
  // Get Device defined
276
  err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
277
  devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
278
  err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);  
279

    
280
  // print device name
281
  err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize);
282
  value = (char*) malloc(valueSize);
283
  err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, value, NULL);
284
  printf("Device (%d,%d): %s\n",MyPlatform,MyDevice, value);
285
  free(value);
286

    
287
  props[1] = (cl_context_properties)platforms[MyPlatform];
288

    
289
  /* Initialize Context */
290
  ctx = clCreateContext( props, 1, &devices[MyDevice], NULL, NULL, &err );
291
  queue = clCreateCommandQueue( ctx, devices[MyDevice], 0, &err );
292

    
293
  /* Setup clBLAS */
294
  err = clblasSetup( );
295

    
296
  /* Prepare OpenCL memory objects and place matrices inside them. */
297
  bufA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*A),NULL,&errA );
298
  bufB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*B),NULL,&errB );
299
  bufC = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*C),NULL,&errC );
300
  bufD = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*D),NULL,&errD );
301

    
302
  errA = clEnqueueWriteBuffer( queue,bufA,CL_TRUE,0,
303
                               dim*dim*sizeof(*A),A,0,NULL,NULL );
304
  errB = clEnqueueWriteBuffer( queue, bufB, CL_TRUE,0,
305
                               dim*dim*sizeof(*B),B,0,NULL,NULL );
306
  errC = clEnqueueWriteBuffer( queue, bufC, CL_TRUE,0,
307
                                 dim*dim*sizeof(*C),C,0,NULL,NULL );
308
  errD = clEnqueueWriteBuffer( queue, bufD, CL_TRUE,0,
309
                                 dim*dim*sizeof(*D),D,0,NULL,NULL );
310
  
311
  /* Get third timer after memory operation */
312
  gettimeofday(&tv3, &tz);
313

    
314
#ifdef DOUBLE
315

    
316
  for (i=0;i<RUNS;i++)
317
    {
318
      err = clblasDgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, 
319
                         dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta,
320
                         bufC,0,dim,1,&queue,0,NULL,&event );
321

    
322
      err = clblasDgemm( clblasRowMajor,clblasTrans,clblasTrans, 
323
                         dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta,
324
                         bufD,0,dim,1,&queue,0,NULL,&event );
325

    
326
    }
327
  
328
  if (err != CL_SUCCESS) {
329
    printf("clblasDgemm() failed with %d\n", err);
330
  }
331

    
332
#else
333

    
334
  for (i=0;i<RUNS;i++)
335
    {
336

    
337
      err = clblasSgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, 
338
                         dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta,
339
                         bufC,0,dim,1,&queue,0,NULL,&event );
340

    
341
      err = clblasSgemm( clblasRowMajor,clblasTrans,clblasTrans, 
342
                         dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta,
343
                         bufD,0,dim,1,&queue,0,NULL,&event );
344
    }
345
  
346
  if (err != CL_SUCCESS) {
347
    printf("clblasSgemm() failed with %d\n", err);
348
  }
349

    
350
#endif
351

    
352
  /* Wait for calculations to be finished. */
353
  err = clWaitForEvents( 1, &event );
354
  
355
  /* Get fourth timer after memory free */
356
  gettimeofday(&tv4, &tz);
357

    
358
  /* Fetch results of calculations from GPU memory. */
359
  errC = clEnqueueReadBuffer( queue,bufC,CL_TRUE,0,dim*dim * sizeof(*C),
360
                             C,0,NULL,NULL );
361

    
362
  /* Fetch results of calculations from GPU memory. */
363
  errD = clEnqueueReadBuffer( queue,bufD,CL_TRUE,0,dim*dim*sizeof(*D),
364
                             D,0,NULL,NULL );
365

    
366
  /* Release OpenCL memory objects. */
367
  clReleaseMemObject( bufD );
368
  clReleaseMemObject( bufC );
369
  clReleaseMemObject( bufB );
370
  clReleaseMemObject( bufA );
371

    
372
  /* Finalize work with clBLAS */
373
  clblasTeardown( );
374

    
375
  /* Release OpenCL working objects. */
376
  clReleaseCommandQueue( queue );
377
  clReleaseContext( ctx );
378
  
379

    
380
  /* Compute with CuBLAS library  */
381
#elif CUBLAS
382
  LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0;
383
  cublasStatus stat1, stat2, stat3, stat4;
384
  struct timeval tv3,tv4;
385

    
386
  /* Order is Row */
387
  /* Have to swap uplo and trans */
388
  char transa='N',transb='T';
389

    
390
  printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
391
         RUNS,dim,dim);
392

    
393
  stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA);
394
  stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB);
395
  stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC);
396
  stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD);
397

    
398
  if ((stat1 != CUBLAS_STATUS_SUCCESS) || 
399
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
400
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
401
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
402
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
403
    cublasFree (devPtrA);
404
    cublasFree (devPtrB);
405
    cublasFree (devPtrC);
406
    cublasFree (devPtrD);
407
    return 1;
408
  }
409

    
410
  stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim);
411
  stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim);
412
  stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim);
413
  stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim);
414
  
415
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
416
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
417
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
418
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
419
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
420
    cublasFree (devPtrA);
421
    cublasFree (devPtrB);
422
    cublasFree (devPtrC);
423
    cublasFree (devPtrD);
424
    return 1;
425
  }
426

    
427
  /* Get third timer after memory operation */
428
  gettimeofday(&tv3, &tz);
429

    
430
#ifdef DOUBLE
431

    
432
  for (i=0;i<RUNS;i++)
433
    {
434
      cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
435
                  devPtrA,dim,beta,devPtrC,dim);
436
      cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
437
                  devPtrB,dim,beta,devPtrD,dim);
438
    }
439
  
440
#else
441

    
442
  for (i=0;i<RUNS;i++)
443
    {
444
      cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
445
                  devPtrA,dim,beta,devPtrC,dim);
446
      cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
447
                  devPtrB,dim,beta,devPtrD,dim);
448
    }
449
  
450
#endif
451

    
452
  stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim);
453
  stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim);
454
  
455
  /* Get fourth timer before memory free */
456
  gettimeofday(&tv4, &tz);
457

    
458
  cublasFree (devPtrA);
459
  cublasFree (devPtrB);
460
  cublasFree (devPtrC);
461
  cublasFree (devPtrD);
462
  
463
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
464
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
465
  }
466
  
467

    
468
#elif THUNKING
469
  
470
  /* Order is Row : Have to swap uplo='U' and trans='N' */
471
  char transa='N',transb='T';
472
  printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
473
         RUNS,dim,dim);
474

    
475
#ifdef DOUBLE
476

    
477
  for (i=0;i<RUNS;i++)
478
    {      
479
      CUBLAS_DGEMM(&transa,&transa,
480
                         &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
481
      CUBLAS_DGEMM(&transb,&transb,
482
                         &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
483
    }
484

    
485
#else
486

    
487
  for (i=0;i<RUNS;i++)
488
    {      
489
      CUBLAS_SGEMM(&transa,&transa,
490
                         &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
491
      CUBLAS_SGEMM(&transb,&transb,
492
                         &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
493
    }
494
  
495
#endif
496

    
497
#elif FBLAS
498
  
499
  /* Order is Row : Have to swap uplo='U' and trans='N' */
500
      char transa='N',transb='T';
501
  
502
  printf("Using FBLAS: %i iterations for %ix%i matrix\n",
503
         RUNS,dim,dim);
504
  
505
#ifdef DOUBLE
506

    
507
  for (i=0;i<RUNS;i++)
508
    {    
509
      dgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
510
      dgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
511
    }
512

    
513
#else
514

    
515
  for (i=0;i<RUNS;i++)
516
    {    
517
      sgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
518
      sgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
519
    }
520

    
521
#endif
522

    
523
#elif ACML
524
  
525
  /* Order is Row : Have to swap uplo='U' and trans='N' */
526
      char transa='N',transb='T';
527
  
528
  printf("Using ACML: %i iterations for %ix%i matrix\n",
529
         RUNS,dim,dim);
530
  
531
#ifdef DOUBLE
532

    
533
  for (i=0;i<RUNS;i++)
534
    {    
535
      dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
536
      dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
537
    }
538

    
539
#else
540

    
541
  for (i=0;i<RUNS;i++)
542
    {    
543
      sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
544
      sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
545
    }
546

    
547
#endif
548

    
549
#elif GSL
550

    
551
  printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
552

    
553
  /* 
554
     RowMajor : Matrix is read row by row
555
     Upper : the no null elements are on top
556
     NoTrans : no transposition before estimation
557
     NonUnit : Matrix is not unit
558
   */
559

    
560
#ifdef DOUBLE
561

    
562
  for (i=0;i<RUNS;i++)
563
    {  
564
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
565
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
566
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
567
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
568
    }
569
  
570
#else
571

    
572
  for (i=0;i<RUNS;i++)
573
    {  
574
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
575
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
576
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
577
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
578
    }
579
  
580
#endif
581
      
582
#else
583

    
584
  printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
585

    
586
  /* 
587
     RowMajor : Matrix is read row bu row
588
     Upper : the no null elements are on top
589
     NoTrans : no transposition before estimation
590
     NonUnit : Matrix is not unit
591
   */
592

    
593
#ifdef DOUBLE
594

    
595
  for (i=0;i<RUNS;i++)
596
    {  
597
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
598
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
599
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
600
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
601
      /* dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, */
602
      /*                   dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); */
603
      /* dgemm(CblasRowMajor,CblasTrans,CblasTrans, */
604
      /*                   dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); */
605
    }
606
  
607
#else
608

    
609
  for (i=0;i<RUNS;i++)
610
    {  
611
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
612
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
613
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
614
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
615
      /* sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, */
616
      /*                   dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); */
617
      /* sgemm(CblasRowMajor,CblasTrans,CblasTrans, */
618
      /*                   dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); */
619
    }
620
  
621
#endif
622

    
623
#endif
624

    
625
  /* Get second timer after launching */
626
  gettimeofday(&tv2, &tz);
627

    
628
  /* Store the checker of errors */
629
  checksA[0]=0.;
630
  
631
  for (i=0; i<dim; i++) {
632
    for (j=0; j<dim; j++) {
633
      checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]);
634
    }
635
  }
636

    
637
  /* Print the matrix */
638
 
639
#ifdef QUIET
640
#else
641
  for (i=0; i<dim; i++) {
642
    for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]);
643
    putchar('\n');
644
  }
645
  putchar('\n');
646
  for (i=0; i<dim; i++) {
647
    for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]);
648
    putchar('\n');
649
  }
650
  putchar('\n');
651
#endif
652

    
653
  /* Free 1 Matrix and 2 Vectors of dimension dim  */
654

    
655
  free(A);
656
  free(B);
657
  free(C);
658
  free(D);
659

    
660
  putchar('\n');
661

    
662
#ifdef CLBLAS
663
  double memoryIn,memoryOut;
664

    
665
  memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L +        \
666
                    (tv3.tv_usec-tv1.tv_usec))/1000000.;  
667

    
668
  memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L +        \
669
                    (tv2.tv_usec-tv4.tv_usec))/1000000.;  
670

    
671
  duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L +        \
672
                    (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS;  
673

    
674
  printf("Duration of memory allocation : %2.10f s\n",memoryIn);
675
  printf("Duration of memory free : %2.10f s\n",memoryOut);
676
#elif CUBLAS
677
  double memoryIn,memoryOut;
678

    
679
  memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L +        \
680
                    (tv3.tv_usec-tv1.tv_usec))/1000000.;  
681

    
682
  memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L +        \
683
                    (tv2.tv_usec-tv4.tv_usec))/1000000.;  
684

    
685
  duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L +        \
686
                    (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS;  
687

    
688
  printf("Duration of memory allocation : %2.10f s\n",memoryIn);
689
  printf("Duration of memory free : %2.10f s\n",memoryOut);
690
#else
691
  duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L +        \
692
                    (tv2.tv_usec-tv1.tv_usec))/1000000./RUNS;  
693

    
694
#endif
695

    
696
  printf("Duration of each cycle : %2.10f s\n",duration);
697

    
698
  printf("Number of GFlops : %2.3f \n",
699
         dim*dim*2.*(2.*dim-1)/duration/1000000000.);
700

    
701
  printf("Error %1.10f\n",checksA[0]);
702
  printResults(RUNS,checksA,"C","Errors cumulated");
703

    
704
  putchar('\n');
705

    
706
  /* Free 2 vectors for checker Before and After */
707

    
708
  free(checksA);
709
  free(checksB);
710

    
711
  return 0;
712
}
713

    
714
#ifdef CLBLAS
715

    
716
int DelectOpenCLDevices() 
717
{
718
  /* */
719
  /* Not needed to import CL.h, already done in CLBLAS.h */
720

    
721
  int i, j;
722
  char* value;
723
  size_t valueSize;
724
  cl_uint platformCount;
725
  cl_platform_id* platforms;
726
  cl_uint deviceCount;
727
  cl_device_id* devices;
728
  cl_uint maxComputeUnits;
729
  cl_int maxWorkGroupSize;
730
  cl_int maxWorkItemSizes;
731
  cl_device_type dev_type;
732

    
733
  // get all platforms
734
  clGetPlatformIDs(0, NULL, &platformCount);
735
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
736
  clGetPlatformIDs(platformCount, platforms, NULL);
737

    
738

    
739
  printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
740

    
741
  for (i = 0; i < platformCount; i++) {
742

    
743
    // get all devices
744
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
745
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
746
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
747

    
748
    // for each device print critical attributes
749
    for (j = 0; j < deviceCount; j++) {
750
      
751
      // print device name
752
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
753
      value = (char*) malloc(valueSize);
754
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
755
      printf("Device (%d,%d): %s\n",i, j, value);
756
      free(value);
757

    
758
      // print type device CPU/GPU/ACCELERATOR
759
      clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
760
      printf("\tDevice Type: ");
761
      if(dev_type & CL_DEVICE_TYPE_GPU)
762
        printf("CL_DEVICE_TYPE_GPU ");
763
      if(dev_type & CL_DEVICE_TYPE_CPU)
764
        printf("CL_DEVICE_TYPE_CPU ");
765
      if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
766
        printf("CL_DEVICE_TYPE_ACCELERATOR ");
767
      if(dev_type & CL_DEVICE_TYPE_DEFAULT)
768
        printf("CL_DEVICE_TYPE_DEFAULT ");
769
      printf("\n");
770

    
771
      // print device vendor
772
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
773
      value = (char*) malloc(valueSize);
774
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
775
      printf("\tDevice vendor: %s\n", value);
776
      free(value);
777

    
778
      // print hardware device version
779
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);
780
      value = (char*) malloc(valueSize);
781
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
782
      printf("\tHardware version: %s\n", value);
783
      free(value);
784

    
785
      // print software driver version
786
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);
787
      value = (char*) malloc(valueSize);
788
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
789
      printf("\tSoftware version: %s\n", value);
790
      free(value);
791
      
792
      // print c version supported by compiler for device
793
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize);
794
      value = (char*) malloc(valueSize);
795
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
796
      printf("\tOpenCL C version: %s\n", value);
797
      free(value);
798

    
799
      // print parallel compute units
800
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,
801
                      sizeof(maxComputeUnits), &maxComputeUnits, NULL);
802
      printf("\tParallel compute units: %d\n", maxComputeUnits);
803
      
804
      // print max work group size
805
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE,
806
                      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
807
      printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
808
      
809
      // print max work items size
810
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES,
811
                      sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL);
812
      printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
813
      
814
    }
815
    printf("\n");
816
    free(devices);
817
  }
818

    
819
  free(platforms);
820
  return 0;
821

    
822
}
823
#endif
824

    
825
int main(int argc,char **argv)
826
{
827
  if ((argc==1)||
828
      (strcmp(argv[1],"-h")==0)||
829
      (strcmp(argv[1],"--help")==0))
830
    {
831
#ifdef CLBLAS
832
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
833
             "\t#1 Size of square matrices \n"
834
             "\t#2 Number of iterations \n"
835
             "\t#3 OpenCL Plateform ID\n"
836
             "\t#4 OpenCL Device ID\n\n");
837
      DelectOpenCLDevices();
838
#else
839
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
840
             "\t#1 Size of square matrices \n"
841
             "\t#2 Number of iterations\n\n");
842
#endif
843
    }
844
  else if ((atoi(argv[1])>=2)&&
845
           (atoi(argv[2])>=1))
846
    {
847
#ifdef CLBLAS
848
      MyPlatform=atoi(argv[3]);
849
      MyDevice=atoi(argv[4]);
850
#endif
851
      bench(atoi(argv[1]),atoi(argv[2]));
852
    }
853

    
854
  return 0;
855
}