Statistiques
| Révision :

root / BLAS / xGEMM / xGEMM.c @ 154

Historique | Voir | Annoter | Télécharger (21,4 ko)

1
/* 
2
   Performs matrix multiply on several BLAS implementations 
3
   Copyleft Emmanuel QUEMENER <emmanuel.quemener@gmail.com> under GPLv3
4

5
   2014-03-14 : Add clBLAS implementation
6

7
   Thanks for help from aurel32@debian.org
8
*/
9

    
10
#include <stdio.h>
11
#include <math.h>
12
#include <stdlib.h>
13
#include <sys/time.h>
14
#include <string.h>
15

    
16
#ifdef CLBLAS
17
#include <clBLAS.h>
18
/* Precise here to avoid new specific bench function */
19
int MyPlatform;
20
int MyDevice;
21
#elif CUBLAS
22
#include <cublas.h>
23
#define CUBLAS_WRAPPER_ERROR_NOERR      0
24
#define CUBLAS_WRAPPER_ERROR_ALLOC      1
25
#define CUBLAS_WRAPPER_ERROR_SET        2
26
#define CUBLAS_WRAPPER_ERROR_GET        3
27
#define CUBLAS_WRAPPER_ERROR_STUB       4
28
#elif THUNKING
29
#include <cublas.h>
30
#include "fortran_common.h"
31
#include "fortran_thunking.h"
32
#elif FBLAS
33
#include <cblas.h>
34
#include <cblas_f77.h>
35
#elif GSL
36
#include <gsl_cblas.h>
37
#elif ACML
38
#include <acml.h>
39
#else
40
#include <cblas.h>
41
#include <blaswrap.h>
42
#endif
43

    
44
#ifdef CLBLAS
45

    
46
#ifdef DOUBLE
47
#define LENGTH cl_double
48
#else
49
#define LENGTH cl_float
50
#endif
51

    
52
#else
53

    
54
#ifdef DOUBLE
55
#define LENGTH double
56
#else
57
#define LENGTH float
58
#endif
59

    
60
#endif
61

    
62
#ifdef FBLAS
63

    
64
#ifdef DOUBLE
65

    
66
void F77_dgemm(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, 
67
               const double *, FINT, const double *, double *, FINT);
68

    
69
#else
70

    
71
void F77_sgemm(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, 
72
               const float *, FINT, const float *, float *, FINT);
73

    
74
#endif
75
#endif
76

    
77
/* Matrix with only defined triangular terms */
78
/* Even if there are 0 in matrix, must be defined at all ! */
79

    
80
/* Get from fortran.c */
81

    
82
#ifdef CUBLAS
83
static char *errMsg[5] = 
84
{
85
    "no error",
86
    "allocation error",
87
    "setVector/setMatrix error",
88
    "getVector/getMatrix error",
89
    "not implemented"
90
};
91

    
92
static void wrapperError (const char *funcName, int error)
93
{
94
    printf ("cublas%s wrapper: %s\n", funcName, errMsg[error]);
95
    fflush (stdout);
96
}
97
#endif
98

    
99
int printVector(const int dimVector,const LENGTH *dataVector,
100
                char *nameVector,char *mesgVector)
101
{
102
#ifndef QUIET
103

    
104
  int i;
105
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
106
  for (i=0;i<dimVector;i++)
107
    {
108
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
109
    }
110
#endif
111

    
112
  return 0;
113
}
114
  
115
int printResults(const int dimVector,const LENGTH *dataVector,
116
                 char *nameVector,char *mesgVector)
117
{
118
#ifdef RESULTS
119
  int i;
120

    
121
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
122
  for (i=0;i<dimVector;i++)
123
    {
124
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
125
    }
126
#endif
127
  return 0;
128
}
129
  
130
#ifdef CUBLAS
131
int printVectorGPU(const int dimVector,const LENGTH *dataVector,
132
                   char *nameVector,char *mesgVector)
133
{
134
#ifndef QUIET
135
  int i;
136
  cublasStatus stat;
137
  LENGTH *P=0;
138
  int incx=1;
139

    
140
  P=malloc(dimVector*sizeof(LENGTH));
141
  
142
  stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx);
143

    
144
  if (stat != CUBLAS_STATUS_SUCCESS) {
145
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
146
  }  
147

    
148
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
149
  for (i=0;i<dimVector;i++)
150
    {
151
      printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
152
    }
153

    
154
  free(P);  
155
#endif
156

    
157
  return 0;
158
}
159
#endif
160

    
161
int bench(int dim,int RUNS)
162
{
163
  /*
164
  int dim=1000;
165
  int RUNS=100;
166
  int incx=1;
167
  */
168
#ifdef PRINT
169
  LENGTH factor=1.;
170
#endif
171

    
172
  LENGTH alpha=1.,beta=0.;
173
  LENGTH *A,*B,*C,*D;
174

    
175
  /* checkBefore checkAfter checks */
176
  LENGTH *checksA,*checksB;
177

    
178
  int i=0, j=0;
179

    
180
  double duration;
181

    
182
  struct timeval tv1,tv2;
183
  struct timezone tz;
184

    
185
  /* Create 4 Matrix of dimension dim by dim  */
186

    
187
  A=malloc(dim*dim*sizeof(LENGTH));
188
  B=malloc(dim*dim*sizeof(LENGTH));
189
  C=malloc(dim*dim*sizeof(LENGTH));
190
  D=malloc(dim*dim*sizeof(LENGTH));
191

    
192
  /* Create 2 vectors for checker Before and After */
193

    
194
  checksA=malloc(RUNS*sizeof(LENGTH));
195
  checksB=malloc(RUNS*sizeof(LENGTH));
196

    
197
  /* Initialize elements with random numbers */
198
  /* Initialize the seed for rand() */
199
  /* srand(time()); */
200

    
201
  for (i=0; i<dim; i++) {
202
    for (j=0; j<dim; j++) {
203
        A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
204
          *(LENGTH)(i+1.)/(LENGTH)(j+1.); 
205
        B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
206
          *(LENGTH)(i+1.)/(LENGTH)(j+1.);
207
        C[i*dim+j]=0.;
208
        D[i*dim+j]=0.;
209
    }
210
  }
211
  /*
212
  A[0]=1;
213
  A[1]=2;
214
  A[2]=3;
215
  A[3]=4;
216
  
217
  B[0]=5;
218
  B[1]=6;
219
  B[2]=7;
220
  B[3]=8;
221
  */
222

    
223
  /* Print the matrix */
224
 
225
#ifdef QUIET
226
#else
227
  for (i=0; i<dim; i++) {
228
    for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]);
229
    putchar('\n');
230
  }
231
  putchar('\n');
232
  for (i=0; i<dim; i++) {
233
    for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]);
234
    putchar('\n');
235
  }
236
  putchar('\n');
237
#endif
238

    
239
 /* Get first timer before launching */
240
  gettimeofday(&tv1, &tz);
241

    
242
  /* Compute with CLBLAS library  */
243
#ifdef CLBLAS
244

    
245
  cl_uint platformCount;
246
  cl_platform_id* platforms;
247
  cl_uint deviceCount;
248
  cl_device_id* devices;
249

    
250
  cl_int err,errA,errB,errC,errD;
251
  cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
252
  cl_context ctx = 0;
253
  cl_command_queue queue = 0;
254
  cl_mem bufA, bufB, bufC, bufD;
255
  cl_event event = NULL;
256

    
257
  char* value;
258
  size_t valueSize;
259

    
260
  // tv3 Put on Device: Allocate & Write buffer
261
  // tv4 Compute
262
  struct timeval tv3,tv4;
263

    
264
  printf("Using CLBLAS: %i iterations for %ix%i matrix on (%d,%d)\n",
265
         RUNS,dim,dim,MyPlatform,MyDevice);
266

    
267
  /* Setup OpenCL environment. */
268
  /* - get all platforms and select MyPlatform */
269
  /* - get all devices from MyPlatform and select MyDevice */
270

    
271
  // Get all platforms
272
  err = clGetPlatformIDs(0, NULL, &platformCount);
273
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
274
  err = clGetPlatformIDs(platformCount, platforms, NULL);
275

    
276
  // Get Device defined
277
  err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
278
  devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
279
  err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);  
280

    
281
  // print device name
282
  err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize);
283
  value = (char*) malloc(valueSize);
284
  err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, value, NULL);
285
  printf("Device (%d,%d): %s\n",MyPlatform,MyDevice, value);
286
  free(value);
287

    
288
  props[1] = (cl_context_properties)platforms[MyPlatform];
289

    
290
  /* Initialize Context */
291
  ctx = clCreateContext( props, 1, &devices[MyDevice], NULL, NULL, &err );
292
  queue = clCreateCommandQueue( ctx, devices[MyDevice], 0, &err );
293

    
294
  /* Setup clBLAS */
295
  err = clblasSetup( );
296

    
297
  /* Prepare OpenCL memory objects and place matrices inside them. */
298
  bufA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*A),NULL,&errA );
299
  bufB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*B),NULL,&errB );
300
  bufC = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*C),NULL,&errC );
301
  bufD = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*D),NULL,&errD );
302

    
303
  errA = clEnqueueWriteBuffer( queue,bufA,CL_TRUE,0,
304
                               dim*dim*sizeof(*A),A,0,NULL,NULL );
305
  errB = clEnqueueWriteBuffer( queue, bufB, CL_TRUE,0,
306
                               dim*dim*sizeof(*B),B,0,NULL,NULL );
307
  errC = clEnqueueWriteBuffer( queue, bufC, CL_TRUE,0,
308
                                 dim*dim*sizeof(*C),C,0,NULL,NULL );
309
  errD = clEnqueueWriteBuffer( queue, bufD, CL_TRUE,0,
310
                                 dim*dim*sizeof(*D),D,0,NULL,NULL );
311
  
312
  /* Get third timer after memory operation */
313
  gettimeofday(&tv3, &tz);
314

    
315
#ifdef DOUBLE
316

    
317
  for (i=0;i<RUNS;i++)
318
    {
319
      err = clblasDgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, 
320
                         dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta,
321
                         bufC,0,dim,1,&queue,0,NULL,&event );
322

    
323
      err = clblasDgemm( clblasRowMajor,clblasTrans,clblasTrans, 
324
                         dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta,
325
                         bufD,0,dim,1,&queue,0,NULL,&event );
326

    
327
    }
328
  
329
  if (err != CL_SUCCESS) {
330
    printf("clblasDgemm() failed with %d\n", err);
331
  }
332

    
333
#else
334

    
335
  for (i=0;i<RUNS;i++)
336
    {
337

    
338
      err = clblasSgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, 
339
                         dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta,
340
                         bufC,0,dim,1,&queue,0,NULL,&event );
341

    
342
      err = clblasSgemm( clblasRowMajor,clblasTrans,clblasTrans, 
343
                         dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta,
344
                         bufD,0,dim,1,&queue,0,NULL,&event );
345
    }
346
  
347
  if (err != CL_SUCCESS) {
348
    printf("clblasSgemm() failed with %d\n", err);
349
  }
350

    
351
#endif
352

    
353
  /* Wait for calculations to be finished. */
354
  err = clWaitForEvents( 1, &event );
355
  
356
  /* Get fourth timer after memory free */
357
  gettimeofday(&tv4, &tz);
358

    
359
  /* Fetch results of calculations from GPU memory. */
360
  errC = clEnqueueReadBuffer( queue,bufC,CL_TRUE,0,dim*dim * sizeof(*C),
361
                             C,0,NULL,NULL );
362

    
363
  /* Fetch results of calculations from GPU memory. */
364
  errD = clEnqueueReadBuffer( queue,bufD,CL_TRUE,0,dim*dim*sizeof(*D),
365
                             D,0,NULL,NULL );
366

    
367
  /* Release OpenCL memory objects. */
368
  clReleaseMemObject( bufD );
369
  clReleaseMemObject( bufC );
370
  clReleaseMemObject( bufB );
371
  clReleaseMemObject( bufA );
372

    
373
  /* Finalize work with clBLAS */
374
  clblasTeardown( );
375

    
376
  /* Release OpenCL working objects. */
377
  clReleaseCommandQueue( queue );
378
  clReleaseContext( ctx );
379
  
380

    
381
  /* Compute with CuBLAS library  */
382
#elif CUBLAS
383
  LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0;
384
  cublasStatus stat1, stat2, stat3, stat4;
385
  struct timeval tv3,tv4;
386

    
387
  /* Order is Row */
388
  /* Have to swap uplo and trans */
389
  char transa='N',transb='T';
390

    
391
  printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
392
         RUNS,dim,dim);
393

    
394
  stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA);
395
  stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB);
396
  stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC);
397
  stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD);
398

    
399
  if ((stat1 != CUBLAS_STATUS_SUCCESS) || 
400
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
401
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
402
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
403
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
404
    cublasFree (devPtrA);
405
    cublasFree (devPtrB);
406
    cublasFree (devPtrC);
407
    cublasFree (devPtrD);
408
    return 1;
409
  }
410

    
411
  stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim);
412
  stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim);
413
  stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim);
414
  stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim);
415
  
416
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
417
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
418
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
419
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
420
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
421
    cublasFree (devPtrA);
422
    cublasFree (devPtrB);
423
    cublasFree (devPtrC);
424
    cublasFree (devPtrD);
425
    return 1;
426
  }
427

    
428
  /* Get third timer after memory operation */
429
  gettimeofday(&tv3, &tz);
430

    
431
#ifdef DOUBLE
432

    
433
  for (i=0;i<RUNS;i++)
434
    {
435
      cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
436
                  devPtrA,dim,beta,devPtrC,dim);
437
      cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
438
                  devPtrB,dim,beta,devPtrD,dim);
439
    }
440
  
441
#else
442

    
443
  for (i=0;i<RUNS;i++)
444
    {
445
      cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
446
                  devPtrA,dim,beta,devPtrC,dim);
447
      cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
448
                  devPtrB,dim,beta,devPtrD,dim);
449
    }
450
  
451
#endif
452

    
453
  stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim);
454
  stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim);
455
  
456
  /* Get fourth timer before memory free */
457
  gettimeofday(&tv4, &tz);
458

    
459
  cublasFree (devPtrA);
460
  cublasFree (devPtrB);
461
  cublasFree (devPtrC);
462
  cublasFree (devPtrD);
463
  
464
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
465
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
466
  }
467
  
468

    
469
#elif THUNKING
470
  
471
  /* Order is Row : Have to swap uplo='U' and trans='N' */
472
  char transa='N',transb='T';
473
  printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
474
         RUNS,dim,dim);
475

    
476
#ifdef DOUBLE
477

    
478
  for (i=0;i<RUNS;i++)
479
    {      
480
      CUBLAS_DGEMM(&transa,&transa,
481
                         &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
482
      CUBLAS_DGEMM(&transb,&transb,
483
                         &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
484
    }
485

    
486
#else
487

    
488
  for (i=0;i<RUNS;i++)
489
    {      
490
      CUBLAS_SGEMM(&transa,&transa,
491
                         &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
492
      CUBLAS_SGEMM(&transb,&transb,
493
                         &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
494
    }
495
  
496
#endif
497

    
498
#elif FBLAS
499
  
500
  /* Order is Row : Have to swap uplo='U' and trans='N' */
501
      char transa='N',transb='T';
502
  
503
  printf("Using FBLAS: %i iterations for %ix%i matrix\n",
504
         RUNS,dim,dim);
505
  
506
#ifdef DOUBLE
507

    
508
  for (i=0;i<RUNS;i++)
509
    {    
510
      F77_dgemm(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
511
      F77_dgemm(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
512
    }
513

    
514
#else
515

    
516
  for (i=0;i<RUNS;i++)
517
    {    
518
      F77_sgemm(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
519
      F77_sgemm(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
520
    }
521

    
522
#endif
523

    
524
#elif ACML
525
  
526
  /* Order is Row : Have to swap uplo='U' and trans='N' */
527
      char transa='N',transb='T';
528
  
529
  printf("Using ACML: %i iterations for %ix%i matrix\n",
530
         RUNS,dim,dim);
531
  
532
#ifdef DOUBLE
533

    
534
  for (i=0;i<RUNS;i++)
535
    {    
536
      dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
537
      dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
538
    }
539

    
540
#else
541

    
542
  for (i=0;i<RUNS;i++)
543
    {    
544
      sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
545
      sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
546
    }
547

    
548
#endif
549

    
550
#elif GSL
551

    
552
  printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
553

    
554
  /* 
555
     RowMajor : Matrix is read row by row
556
     Upper : the no null elements are on top
557
     NoTrans : no transposition before estimation
558
     NonUnit : Matrix is not unit
559
   */
560

    
561
#ifdef DOUBLE
562

    
563
  for (i=0;i<RUNS;i++)
564
    {  
565
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
566
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
567
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
568
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
569
    }
570
  
571
#else
572

    
573
  for (i=0;i<RUNS;i++)
574
    {  
575
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
576
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
577
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
578
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
579
    }
580
  
581
#endif
582
      
583
#else
584

    
585
  printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
586

    
587
  /* 
588
     RowMajor : Matrix is read row bu row
589
     Upper : the no null elements are on top
590
     NoTrans : no transposition before estimation
591
     NonUnit : Matrix is not unit
592
   */
593

    
594
#ifdef DOUBLE
595

    
596
  for (i=0;i<RUNS;i++)
597
    {  
598
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
599
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
600
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
601
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
602
    }
603
  
604
#else
605

    
606
  for (i=0;i<RUNS;i++)
607
    {  
608
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
609
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
610
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
611
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
612
    }
613
  
614
#endif
615

    
616
#endif
617

    
618
  /* Get second timer after launching */
619
  gettimeofday(&tv2, &tz);
620

    
621
  /* Store the checker of errors */
622
  checksA[0]=0.;
623
  
624
  for (i=0; i<dim; i++) {
625
    for (j=0; j<dim; j++) {
626
      checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]);
627
    }
628
  }
629

    
630
  /* Print the matrix */
631
 
632
#ifdef QUIET
633
#else
634
  for (i=0; i<dim; i++) {
635
    for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]);
636
    putchar('\n');
637
  }
638
  putchar('\n');
639
  for (i=0; i<dim; i++) {
640
    for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]);
641
    putchar('\n');
642
  }
643
  putchar('\n');
644
#endif
645

    
646
  /* Free 1 Matrix and 2 Vectors of dimension dim  */
647

    
648
  free(A);
649
  free(B);
650
  free(C);
651
  free(D);
652

    
653
  putchar('\n');
654

    
655
#ifdef CLBLAS
656
  double memoryIn,memoryOut;
657

    
658
  memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L +        \
659
                    (tv3.tv_usec-tv1.tv_usec))/1000000.;  
660

    
661
  memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L +        \
662
                    (tv2.tv_usec-tv4.tv_usec))/1000000.;  
663

    
664
  duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L +        \
665
                    (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS;  
666

    
667
  printf("Duration of memory allocation : %2.10f s\n",memoryIn);
668
  printf("Duration of memory free : %2.10f s\n",memoryOut);
669
#elif CUBLAS
670
  double memoryIn,memoryOut;
671

    
672
  memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L +        \
673
                    (tv3.tv_usec-tv1.tv_usec))/1000000.;  
674

    
675
  memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L +        \
676
                    (tv2.tv_usec-tv4.tv_usec))/1000000.;  
677

    
678
  duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L +        \
679
                    (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS;  
680

    
681
  printf("Duration of memory allocation : %2.10f s\n",memoryIn);
682
  printf("Duration of memory free : %2.10f s\n",memoryOut);
683
#else
684
  duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L +        \
685
                    (tv2.tv_usec-tv1.tv_usec))/1000000./RUNS;  
686

    
687
#endif
688

    
689
  printf("Duration of each cycle : %2.10f s\n",duration);
690

    
691
  printf("Number of GFlops : %2.3f \n",
692
         dim*dim*2.*(2.*dim-1)/duration/1000000000.);
693

    
694
  printf("Error %1.10f\n",checksA[0]);
695
  printResults(RUNS,checksA,"C","Errors cumulated");
696

    
697
  putchar('\n');
698

    
699
  /* Free 2 vectors for checker Before and After */
700

    
701
  free(checksA);
702
  free(checksB);
703

    
704
  return 0;
705
}
706

    
707
#ifdef CLBLAS
708

    
709
int DelectOpenCLDevices() 
710
{
711
  /* */
712
  /* Not needed to import CL.h, already done in CLBLAS.h */
713

    
714
  int i, j;
715
  char* value;
716
  size_t valueSize;
717
  cl_uint platformCount;
718
  cl_platform_id* platforms;
719
  cl_uint deviceCount;
720
  cl_device_id* devices;
721
  cl_uint maxComputeUnits;
722
  cl_int maxWorkGroupSize;
723
  cl_int maxWorkItemSizes;
724
  cl_device_type dev_type;
725

    
726
  // get all platforms
727
  clGetPlatformIDs(0, NULL, &platformCount);
728
  platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
729
  clGetPlatformIDs(platformCount, platforms, NULL);
730

    
731

    
732
  printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
733

    
734
  for (i = 0; i < platformCount; i++) {
735

    
736
    // get all devices
737
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount);
738
    devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
739
    clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
740

    
741
    // for each device print critical attributes
742
    for (j = 0; j < deviceCount; j++) {
743
      
744
      // print device name
745
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize);
746
      value = (char*) malloc(valueSize);
747
      clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
748
      printf("Device (%d,%d): %s\n",i, j, value);
749
      free(value);
750

    
751
      // print type device CPU/GPU/ACCELERATOR
752
      clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
753
      printf("\tDevice Type: ");
754
      if(dev_type & CL_DEVICE_TYPE_GPU)
755
        printf("CL_DEVICE_TYPE_GPU ");
756
      if(dev_type & CL_DEVICE_TYPE_CPU)
757
        printf("CL_DEVICE_TYPE_CPU ");
758
      if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
759
        printf("CL_DEVICE_TYPE_ACCELERATOR ");
760
      if(dev_type & CL_DEVICE_TYPE_DEFAULT)
761
        printf("CL_DEVICE_TYPE_DEFAULT ");
762
      printf("\n");
763

    
764
      // print device vendor
765
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize);
766
      value = (char*) malloc(valueSize);
767
      clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
768
      printf("\tDevice vendor: %s\n", value);
769
      free(value);
770

    
771
      // print hardware device version
772
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize);
773
      value = (char*) malloc(valueSize);
774
      clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
775
      printf("\tHardware version: %s\n", value);
776
      free(value);
777

    
778
      // print software driver version
779
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize);
780
      value = (char*) malloc(valueSize);
781
      clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
782
      printf("\tSoftware version: %s\n", value);
783
      free(value);
784
      
785
      // print c version supported by compiler for device
786
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize);
787
      value = (char*) malloc(valueSize);
788
      clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
789
      printf("\tOpenCL C version: %s\n", value);
790
      free(value);
791

    
792
      // print parallel compute units
793
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS,
794
                      sizeof(maxComputeUnits), &maxComputeUnits, NULL);
795
      printf("\tParallel compute units: %d\n", maxComputeUnits);
796
      
797
      // print max work group size
798
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE,
799
                      sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL);
800
      printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
801
      
802
      // print max work items size
803
      clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES,
804
                      sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL);
805
      printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
806
      
807
    }
808
    printf("\n");
809
    free(devices);
810
  }
811

    
812
  free(platforms);
813
  return 0;
814

    
815
}
816
#endif
817

    
818
int main(int argc,char **argv)
819
{
820
  if ((argc==1)||
821
      (strcmp(argv[1],"-h")==0)||
822
      (strcmp(argv[1],"--help")==0))
823
    {
824
#ifdef CLBLAS
825
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
826
             "\t#1 Size of square matrices \n"
827
             "\t#2 Number of iterations \n"
828
             "\t#3 OpenCL Plateform ID\n"
829
             "\t#4 OpenCL Device ID\n\n");
830
      DelectOpenCLDevices();
831
#else
832
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
833
             "\t#1 Size of square matrices \n"
834
             "\t#2 Number of iterations\n\n");
835
#endif
836
    }
837
  else if ((atoi(argv[1])>=2)&&
838
           (atoi(argv[2])>=1))
839
    {
840
#ifdef CLBLAS
841
      MyPlatform=atoi(argv[3]);
842
      MyDevice=atoi(argv[4]);
843
#endif
844
      bench(atoi(argv[1]),atoi(argv[2]));
845
    }
846

    
847
  return 0;
848
}