Statistiques
| Révision :

root / BLAS / xGEMM / xGEMM.c @ 5

Historique | Voir | Annoter | Télécharger (12,4 ko)

1
/* 
2
   Performs matrix multiply
3

4
   Thanks for help from aurel32@debian.org
5
*/
6

    
7
#include <stdio.h>
8
#include <math.h>
9
#include <stdlib.h>
10
#include <sys/time.h>
11
#include <string.h>
12

    
13
#ifdef CUBLAS
14
#include <cublas.h>
15
#define CUBLAS_WRAPPER_ERROR_NOERR      0
16
#define CUBLAS_WRAPPER_ERROR_ALLOC      1
17
#define CUBLAS_WRAPPER_ERROR_SET        2
18
#define CUBLAS_WRAPPER_ERROR_GET        3
19
#define CUBLAS_WRAPPER_ERROR_STUB       4
20
#elif THUNKING
21
#include <cublas.h>
22
#elif FBLAS
23
#include <cblas_f77.h>
24
#elif GSL
25
#include <gsl_cblas.h>
26
#elif ACML
27
#include <acml.h>
28
#include <acml_mv.h>
29
#else
30
#include <cblas.h>
31
#endif
32

    
33
#ifdef DOUBLE
34
#define LENGTH double
35
#else
36
#define LENGTH float
37
#endif
38

    
39
#ifdef THUNKING
40
#include "fortran_thunking.h"
41

    
42
#elif FBLAS
43

    
44
#ifdef DOUBLE
45

    
46
void dgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, 
47
               const double *, FINT, const double *, double *, FINT);
48

    
49
#else
50

    
51
void sgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, 
52
               const float *, FINT, const float *, float *, FINT);
53

    
54
#endif
55

    
56
#endif
57

    
58
/* Matrix with only defined triangular terms */
59
/* Even if there are 0 in matrix, must be defined at all ! */
60

    
61
/* Get from fortran.c */
62

    
63
#ifdef CUBLAS
64
static char *errMsg[5] = 
65
{
66
    "no error",
67
    "allocation error",
68
    "setVector/setMatrix error",
69
    "getVector/getMatrix error",
70
    "not implemented"
71
};
72

    
73
static void wrapperError (const char *funcName, int error)
74
{
75
    printf ("cublas%s wrapper: %s\n", funcName, errMsg[error]);
76
    fflush (stdout);
77
}
78
#endif
79

    
80
int printVector(const int dimVector,const LENGTH *dataVector,
81
                char *nameVector,char *mesgVector)
82
{
83
#ifndef QUIET
84

    
85
  int i;
86
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
87
  for (i=0;i<dimVector;i++)
88
    {
89
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
90
    }
91
#endif
92

    
93
  return 0;
94
}
95
  
96
int printResults(const int dimVector,const LENGTH *dataVector,
97
                 char *nameVector,char *mesgVector)
98
{
99
#ifdef RESULTS
100
  int i;
101

    
102
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
103
  for (i=0;i<dimVector;i++)
104
    {
105
      printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
106
    }
107
#endif
108
  return 0;
109
}
110
  
111
#ifdef CUBLAS
112
int printVectorGPU(const int dimVector,const LENGTH *dataVector,
113
                   char *nameVector,char *mesgVector)
114
{
115
#ifndef QUIET
116
  int i;
117
  cublasStatus stat;
118
  LENGTH *P=0;
119
  int incx=1;
120

    
121
  P=malloc(dimVector*sizeof(LENGTH));
122
  
123
  stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx);
124

    
125
  if (stat != CUBLAS_STATUS_SUCCESS) {
126
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
127
  }  
128

    
129
  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
130
  for (i=0;i<dimVector;i++)
131
    {
132
      printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
133
    }
134

    
135
  free(P);  
136
#endif
137

    
138
  return 0;
139
}
140
#endif
141

    
142
int bench(int dim,int RUNS)
143
{
144
  /*
145
  int dim=1000;
146
  int RUNS=100;
147
  int incx=1;
148
  */
149
#ifdef PRINT
150
  LENGTH factor=1.;
151
#endif
152

    
153
  LENGTH alpha=1.,beta=0.;
154
  LENGTH *A,*B,*C,*D;
155

    
156
  /* checkBefore checkAfter checks */
157
  LENGTH *checksA,*checksB;
158

    
159
  int i=0, j=0;
160

    
161
  double duration;
162

    
163
  struct timeval tv1,tv2;
164
  struct timezone tz;
165

    
166
  /* Create 4 Matrix of dimension dim by dim  */
167

    
168
  A=malloc(dim*dim*sizeof(LENGTH));
169
  B=malloc(dim*dim*sizeof(LENGTH));
170
  C=malloc(dim*dim*sizeof(LENGTH));
171
  D=malloc(dim*dim*sizeof(LENGTH));
172

    
173
  /* Create 2 vectors for checker Before and After */
174

    
175
  checksA=malloc(RUNS*sizeof(LENGTH));
176
  checksB=malloc(RUNS*sizeof(LENGTH));
177

    
178
  /* Initialize elements with random numbers */
179
  /* Initialize the seed for rand() */
180
  /* srand(time()); */
181

    
182
  for (i=0; i<dim; i++) {
183
    for (j=0; j<dim; j++) {
184
        A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
185
          *(LENGTH)(i+1.)/(LENGTH)(j+1.); 
186
        B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
187
          *(LENGTH)(i+1.)/(LENGTH)(j+1.);
188
        C[i*dim+j]=0.;
189
        D[i*dim+j]=0.;
190
    }
191
  }
192
  /*
193
  A[0]=1;
194
  A[1]=2;
195
  A[2]=3;
196
  A[3]=4;
197
  
198
  B[0]=5;
199
  B[1]=6;
200
  B[2]=7;
201
  B[3]=8;
202
  */
203

    
204
  /* Print the matrix */
205
 
206
#ifdef QUIET
207
#else
208
  for (i=0; i<dim; i++) {
209
    for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]);
210
    putchar('\n');
211
  }
212
  putchar('\n');
213
  for (i=0; i<dim; i++) {
214
    for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]);
215
    putchar('\n');
216
  }
217
  putchar('\n');
218
#endif
219

    
220
 /* Get first timer before launching */
221
  gettimeofday(&tv1, &tz);
222

    
223
  /* Compute with CuBLAS library  */
224
#ifdef CUBLAS
225
  LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0;
226
  cublasStatus stat1, stat2, stat3, stat4;
227
  struct timeval tv3,tv4;
228

    
229
  /* Order is Row */
230
  /* Have to swap uplo and trans */
231
  char transa='N',transb='T';
232

    
233
  printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
234
         RUNS,dim,dim);
235

    
236
  stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA);
237
  stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB);
238
  stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC);
239
  stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD);
240

    
241
  if ((stat1 != CUBLAS_STATUS_SUCCESS) || 
242
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
243
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
244
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
245
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
246
    cublasFree (devPtrA);
247
    cublasFree (devPtrB);
248
    cublasFree (devPtrC);
249
    cublasFree (devPtrD);
250
    return 1;
251
  }
252

    
253
  stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim);
254
  stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim);
255
  stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim);
256
  stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim);
257
  
258
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
259
      (stat2 != CUBLAS_STATUS_SUCCESS) ||
260
      (stat3 != CUBLAS_STATUS_SUCCESS) ||
261
      (stat4 != CUBLAS_STATUS_SUCCESS) ) {
262
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
263
    cublasFree (devPtrA);
264
    cublasFree (devPtrB);
265
    cublasFree (devPtrC);
266
    cublasFree (devPtrD);
267
    return 1;
268
  }
269

    
270
  /* Get third timer after memory operation */
271
  gettimeofday(&tv3, &tz);
272

    
273
#ifdef DOUBLE
274

    
275
  for (i=0;i<RUNS;i++)
276
    {
277
      cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
278
                  devPtrA,dim,beta,devPtrC,dim);
279
      cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
280
                  devPtrB,dim,beta,devPtrD,dim);
281
    }
282
  
283
#else
284

    
285
  for (i=0;i<RUNS;i++)
286
    {
287
      cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim,
288
                  devPtrA,dim,beta,devPtrC,dim);
289
      cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim,
290
                  devPtrB,dim,beta,devPtrD,dim);
291
    }
292
  
293
#endif
294

    
295
  stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim);
296
  stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim);
297
  
298
  cublasFree (devPtrA);
299
  cublasFree (devPtrB);
300
  cublasFree (devPtrC);
301
  cublasFree (devPtrD);
302
  
303
  if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
304
    wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
305
  }
306
  
307
  /* Get fourth timer after memory free */
308
  gettimeofday(&tv4, &tz);
309

    
310
#elif THUNKING
311
  
312
  /* Order is Row : Have to swap uplo='U' and trans='N' */
313
  char transa='N',transb='T';
314
  printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
315
         RUNS,dim,dim);
316

    
317
#ifdef DOUBLE
318

    
319
  for (i=0;i<RUNS;i++)
320
    {      
321
      /* Multiply A by X as Y <- A.X */
322
      CUBLAS_DGEMM(&transa,&transa,
323
                   &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
324
      CUBLAS_DGEMM(&transb,&transb,
325
                   &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
326
    }
327

    
328
#else
329

    
330
  for (i=0;i<RUNS;i++)
331
    {      
332
      /* Multiply A by X as Y <- A.X */
333
      CUBLAS_SGEMM(&transa,&transa,
334
                   &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
335
      CUBLAS_SGEMM(&transb,&transb,
336
                   &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
337
    }
338
  
339
#endif
340

    
341
#elif FBLAS
342
  
343
  /* Order is Row : Have to swap uplo='U' and trans='N' */
344
      char transa='N',transb='T';
345
  
346
  printf("Using FBLAS: %i iterations for %ix%i matrix\n",
347
         RUNS,dim,dim);
348
  
349
#ifdef DOUBLE
350

    
351
  for (i=0;i<RUNS;i++)
352
    {    
353
      /* Multiply A by X as Y <- A.X */
354
      dgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
355
      dgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
356
    }
357

    
358
#else
359

    
360
  for (i=0;i<RUNS;i++)
361
    {    
362
      /* Multiply A by X as Y <- A.X */
363
      sgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim);
364
      sgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim);
365
    }
366

    
367
#endif
368

    
369
#elif ACML
370
  
371
  /* Order is Row : Have to swap uplo='U' and trans='N' */
372
      char transa='N',transb='T';
373
  
374
  printf("Using ACML: %i iterations for %ix%i matrix\n",
375
         RUNS,dim,dim);
376
  
377
#ifdef DOUBLE
378

    
379
  for (i=0;i<RUNS;i++)
380
    {    
381
      /* Multiply A by X as Y <- A.X */
382
      dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
383
      dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
384
    }
385

    
386
#else
387

    
388
  for (i=0;i<RUNS;i++)
389
    {    
390
      /* Multiply A by X as Y <- A.X */
391
      sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim);
392
      sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim);
393
    }
394

    
395
#endif
396

    
397
#elif GSL
398

    
399
  printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
400

    
401
  /* 
402
     RowMajor : Matrix is read row by row
403
     Upper : the no null elements are on top
404
     NoTrans : no transposition before estimation
405
     NonUnit : Matrix is not unit
406
   */
407

    
408
#ifdef DOUBLE
409

    
410
  for (i=0;i<RUNS;i++)
411
    {  
412
      /* Multiply A by X as Y <- A.X */
413
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
414
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
415
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
416
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
417
    }
418
  
419
#else
420

    
421
  for (i=0;i<RUNS;i++)
422
    {  
423
      /* Multiply A by X as Y <- A.X */
424
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
425
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
426
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
427
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
428
    }
429
  
430
#endif
431
      
432
#else
433

    
434
  printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
435

    
436
  /* 
437
     RowMajor : Matrix is read row bu row
438
     Upper : the no null elements are on top
439
     NoTrans : no transposition before estimation
440
     NonUnit : Matrix is not unit
441
   */
442

    
443
#ifdef DOUBLE
444

    
445
  for (i=0;i<RUNS;i++)
446
    {  
447
      cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
448
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
449
      cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans,
450
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
451
    }
452
  
453
#else
454

    
455
  for (i=0;i<RUNS;i++)
456
    {  
457
      cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans,
458
                  dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim);
459
      cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans,
460
                  dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim);
461
    }
462
  
463
#endif
464

    
465
#endif
466

    
467
  /* Get second timer after launching */
468
  gettimeofday(&tv2, &tz);
469

    
470
  /* Store the checker of errors */
471
  checksA[0]=0.;
472
  
473
  for (i=0; i<dim; i++) {
474
    for (j=0; j<dim; j++) {
475
      checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]);
476
    }
477
  }
478

    
479
  /* Print the matrix */
480
 
481
#ifdef QUIET
482
#else
483
  for (i=0; i<dim; i++) {
484
    for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]);
485
    putchar('\n');
486
  }
487
  putchar('\n');
488
  for (i=0; i<dim; i++) {
489
    for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]);
490
    putchar('\n');
491
  }
492
  putchar('\n');
493
#endif
494

    
495
  /* Free 1 Matrix and 2 Vectors of dimension dim  */
496

    
497
  free(A);
498
  free(B);
499
  free(C);
500
  free(D);
501

    
502
  putchar('\n');
503

    
504
#ifdef CUBLAS
505
  double memoryIn,memoryOut;
506

    
507
  memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L +        \
508
                    (tv3.tv_usec-tv1.tv_usec))/1000000.;  
509

    
510
  memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L +        \
511
                    (tv2.tv_usec-tv4.tv_usec))/1000000.;  
512

    
513
  duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L +        \
514
                    (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS;  
515

    
516
  printf("Duration of memory allocation : %2.10f s\n",memoryIn);
517
  printf("Duration of memory free : %2.10f s\n",memoryOut);
518
#else
519
  duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L +        \
520
                    (tv2.tv_usec-tv1.tv_usec))/1000000./RUNS;  
521

    
522
#endif
523

    
524
  printf("Duration of each cycle : %2.10f s\n",duration);
525

    
526
  printf("Number of GFlops : %2.3f \n",
527
         dim*dim*2.*(2.*dim-1)/duration/1000000000.);
528

    
529
  printf("Error %1.10f\n",checksA[0]);
530
  printResults(RUNS,checksA,"C","Errors cumulated");
531

    
532
  putchar('\n');
533

    
534
  /* Free 2 vectors for checker Before and After */
535

    
536
  free(checksA);
537
  free(checksB);
538

    
539
  return 0;
540
}
541

    
542
int main(int argc,char **argv)
543
{
544
  if ((argc==1)||
545
      (strcmp(argv[1],"-h")==0)||
546
      (strcmp(argv[1],"--help")==0))
547
    {
548
      printf("\nPerforms a bench using BLAS library implementation:\n\n"
549
             "\t#1 Size of square matrices \n"
550
             "\t#2 Number of iterations\n\n");
551
    }
552
  else if ((atoi(argv[1])>=2)&&
553
           (atoi(argv[2])>=1))
554
    {
555
      bench(atoi(argv[1]),atoi(argv[2]));
556
    }
557

    
558
  return 0;
559
}