Révision 1
BLAS/xTRSV/patch_thunking.h (revision 1) | ||
---|---|---|
1 |
41c41 |
|
2 |
< #define CUBLAS_FORTRAN_COMPILER CUBLAS_G95 |
|
3 |
--- |
|
4 |
> #define CUBLAS_FORTRAN_COMPILER CUBLAS_INTEL_FORTRAN |
BLAS/xTRSV/Makefile (revision 1) | ||
---|---|---|
# Builds the xTRSV benchmark (triangular solve) once per BLAS implementation.
# Every backend target produces two executables next to the source:
#   $(PROGRAM)_SP_<backend>  single precision (-DFLOAT)
#   $(PROGRAM)_DP_<backend>  double precision (-DDOUBLE)

SOURCE=xTRSV.c
PROGRAM=$(SOURCE:.c=)

CC=gcc
CFLAGS=-Wall -O3
# LDFLAGS is kept for linker options; libraries live in LDLIBS, which is
# placed after the sources on every link line so that -lm is still resolved
# by linkers that discard libraries listed before their first use.
LDFLAGS=
LDLIBS=-lm

# CUDA toolkit layout (the thunking wrappers are shipped as source).
CUDADIR=/opt/cuda
CUDASRC=$(CUDADIR)/src
THUNKING=fortran_thunking.c
CUDASRCINC=fortran_common.h
CUDAINC=$(CUDADIR)/include
CUDALIB=$(CUDADIR)/lib64

PATCHTHUNKING=patch_thunking.h

GSLINC=/usr/include/gsl

GOTO2=/opt/GotoBLAS2

ACML=/opt/acml
ACMLINC=$(ACML)/gfortran64_mp/include
ACMLLIB=$(ACML)/gfortran64_mp/lib

EXECUTABLE=cblas fblas gsl cublas thunking gotoblas acml

# FORMAT is only referenced by the alternative DIRECTIVES lines below; the
# recipes always build both precisions explicitly.
#FORMAT=DOUBLE
FORMAT=FLOAT

#DIRECTIVES=-D$(FORMAT) -DPRINT -DUNIT
#DIRECTIVES=-D$(FORMAT) -DUNIT -DRESULTS -DQUIET
DIRECTIVES=-DQUIET -DUNIT

# None of these targets is a file: the recipes write $(PROGRAM)_SP_* and
# $(PROGRAM)_DP_* instead, so they must always run.
.PHONY: all clean $(EXECUTABLE)

all: $(EXECUTABLE)

cblas: $(SOURCE)
	$(CC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DCBLAS $(LDFLAGS) \
		$(SOURCE) -lcblas $(LDLIBS) -o $(PROGRAM)_SP_$@
	$(CC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DCBLAS $(LDFLAGS) \
		$(SOURCE) -lcblas $(LDLIBS) -o $(PROGRAM)_DP_$@

gotoblas: $(SOURCE)
	$(CC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DCBLAS $(LDFLAGS) \
		$(SOURCE) $(GOTO2)/libgoto2.a -lpthread $(LDLIBS) \
		-o $(PROGRAM)_SP_$@
	$(CC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DCBLAS $(LDFLAGS) \
		$(SOURCE) $(GOTO2)/libgoto2.a -lpthread $(LDLIBS) \
		-o $(PROGRAM)_DP_$@

acml: $(SOURCE)
	$(CC) -I$(ACMLINC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DACML $(LDFLAGS) \
		$(SOURCE) -L$(ACMLLIB) -lacml_mp -lacml_mv \
		-lgomp -lgfortran -lpthread $(LDLIBS) -o $(PROGRAM)_SP_$@
	$(CC) -I$(ACMLINC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DACML $(LDFLAGS) \
		$(SOURCE) -L$(ACMLLIB) -lacml_mp -lacml_mv \
		-lgomp -lgfortran -lpthread $(LDLIBS) -o $(PROGRAM)_DP_$@

fblas: $(SOURCE)
	$(CC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DFBLAS $(LDFLAGS) \
		$(SOURCE) -lf77blas $(LDLIBS) -o $(PROGRAM)_SP_$@
	$(CC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DFBLAS $(LDFLAGS) \
		$(SOURCE) -lf77blas $(LDLIBS) -o $(PROGRAM)_DP_$@

gsl: $(SOURCE)
	$(CC) -I$(GSLINC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DGSL $(LDFLAGS) \
		$(SOURCE) -lgslcblas $(LDLIBS) -o $(PROGRAM)_SP_$@
	$(CC) -I$(GSLINC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DGSL $(LDFLAGS) \
		$(SOURCE) -lgslcblas $(LDLIBS) -o $(PROGRAM)_DP_$@

cublas: $(SOURCE)
	$(CC) -I$(CUDAINC) -L$(CUDALIB) $(CFLAGS) -DCUBLAS -DFLOAT $(LDFLAGS) \
		$(DIRECTIVES) $(SOURCE) -lcublas $(LDLIBS) -o $(PROGRAM)_SP_$@
	$(CC) -I$(CUDAINC) -L$(CUDALIB) $(CFLAGS) -DCUBLAS -DDOUBLE $(LDFLAGS) \
		$(DIRECTIVES) $(SOURCE) -lcublas $(LDLIBS) -o $(PROGRAM)_DP_$@

# Thunking CUBLAS approach:
#  1. copy the wrapper sources shipped with CUDA next to the benchmark,
#  2. patch the thunking prototypes so they compile on Debian Lenny,
#  3. compile the wrappers once and link the object into both builds.
thunking: $(SOURCE)
	cp $(CUDASRC)/$(THUNKING) $(CUDASRC)/$(THUNKING:.c=.h) \
		$(CUDASRC)/$(CUDASRCINC) .
	patch $(CUDASRCINC) $(PATCHTHUNKING)
	$(CC) -I$(CUDAINC) $(CFLAGS) -c $(THUNKING)
	$(CC) -I$(CUDAINC) -L$(CUDALIB) $(CFLAGS) -DTHUNKING \
		$(LDFLAGS) $(DIRECTIVES) $(SOURCE) -DFLOAT \
		$(THUNKING:.c=.o) -lcublas $(LDLIBS) -o $(PROGRAM)_SP_$@
	$(CC) -I$(CUDAINC) -L$(CUDALIB) $(CFLAGS) -DTHUNKING \
		$(LDFLAGS) $(DIRECTIVES) $(SOURCE) -DDOUBLE \
		$(THUNKING:.c=.o) -lcublas $(LDLIBS) -o $(PROGRAM)_DP_$@

# Removes the executables, the copied thunking sources/objects, editor
# backups and the copied (patched) common header, in this directory and below.
clean:
	find . -type f -name "$(PROGRAM)_*" -exec rm -f {} \;
	find . -type f -name "$(THUNKING:.c=)*" -exec rm -f {} \;
	find . -type f -name "*~" -exec rm -f {} \;
	find . -type f -name "$(CUDASRCINC)" -exec rm -f {} \;
BLAS/xTRSV/xTRSV.c (revision 1) | ||
---|---|---|
1 |
/* |
|
2 |
Solves a randomly generated linear system |
|
3 |
Estimates a test |
|
4 |
|
|
5 |
Matrix is triangular |
|
6 |
|
|
7 |
Thanks for help from aurel32@debian.org |
|
8 |
*/ |
|
9 |
|
|
10 |
#include <stdio.h> |
|
11 |
#include <math.h> |
|
12 |
#include <stdlib.h> |
|
13 |
#include <sys/time.h> |
|
14 |
#include <string.h> |
|
15 |
|
|
16 |
#ifdef CUBLAS |
|
17 |
#include <cublas.h> |
|
18 |
#define CUBLAS_WRAPPER_ERROR_NOERR 0 |
|
19 |
#define CUBLAS_WRAPPER_ERROR_ALLOC 1 |
|
20 |
#define CUBLAS_WRAPPER_ERROR_SET 2 |
|
21 |
#define CUBLAS_WRAPPER_ERROR_GET 3 |
|
22 |
#define CUBLAS_WRAPPER_ERROR_STUB 4 |
|
23 |
#elif THUNKING |
|
24 |
#include <cublas.h> |
|
25 |
#elif FBLAS |
|
26 |
#include <cblas_f77.h> |
|
27 |
#elif GSL |
|
28 |
#include <gsl_cblas.h> |
|
29 |
#elif ACML |
|
30 |
#include <acml.h> |
|
31 |
#include <acml_mv.h> |
|
32 |
#else |
|
33 |
#include <cblas.h> |
|
34 |
#endif |
|
35 |
|
|
36 |
#ifdef DOUBLE |
|
37 |
#define LENGTH double |
|
38 |
#else |
|
39 |
#define LENGTH float |
|
40 |
#endif |
|
41 |
|
|
42 |
#ifdef THUNKING |
|
43 |
/* WARNING ! |
|
44 |
Prototypes from fortran.c functions used MUST be defined here ! |
|
45 |
*/ |
|
46 |
#include "fortran_thunking.h" |
|
47 |
|
|
48 |
/* |
|
49 |
#ifdef DOUBLE |
|
50 |
|
|
51 |
void CUBLAS_DCOPY (const int *n, const double *x, const int *incx, double *y, |
|
52 |
const int *incy); |
|
53 |
|
|
54 |
double CUBLAS_DNRM2 (const int *dim, const double *X, const int *incx); |
|
55 |
|
|
56 |
void CUBLAS_DTRSV (const char *uplo, const char *trans, const char *diag, |
|
57 |
const int *n, const double *A, const int *lda, double *x, |
|
58 |
const int *incx); |
|
59 |
|
|
60 |
void CUBLAS_DGEMV (const char *trans, const int *m, const int *n, |
|
61 |
const double *alpha, const double *A, const int *lda, |
|
62 |
const double *x, const int *incx, const double *beta, |
|
63 |
double *y, const int *incy); |
|
64 |
|
|
65 |
void CUBLAS_DSWAP (const int *n, double *x, const int *incx, double *y, |
|
66 |
const int *incy); |
|
67 |
|
|
68 |
void CUBLAS_DAXPY (const int *n, const double *alpha, const double *x, |
|
69 |
const int *incx, double *y, const int *incy); |
|
70 |
|
|
71 |
#else |
|
72 |
void CUBLAS_SCOPY (const int *n, const float *x, const int *incx, float *y, |
|
73 |
const int *incy); |
|
74 |
|
|
75 |
float CUBLAS_SNRM2 (const int *dim, const float *X, const int *incx); |
|
76 |
|
|
77 |
void CUBLAS_STRSV (const char *uplo, const char *trans, const char *diag, |
|
78 |
const int *n, const float *A, const int *lda, float *x, |
|
79 |
const int *incx); |
|
80 |
|
|
81 |
void CUBLAS_SGEMV (const char *trans, const int *m, const int *n, |
|
82 |
const float *alpha, const float *A, const int *lda, |
|
83 |
const float *x, const int *incx, const float *beta, |
|
84 |
float *y, const int *incy); |
|
85 |
|
|
86 |
void CUBLAS_SSWAP (const int *n, float *x, const int *incx, float *y, |
|
87 |
const int *incy); |
|
88 |
|
|
89 |
void CUBLAS_SAXPY (const int *n, const float *alpha, const float *x, |
|
90 |
const int *incx, float *y, const int *incy); |
|
91 |
|
|
92 |
#endif |
|
93 |
*/ |
|
94 |
|
|
95 |
#elif FBLAS |
|
96 |
|
|
97 |
#ifdef DOUBLE |
|
98 |
|
|
99 |
void dtrsv_( FCHAR, FCHAR, FCHAR, FINT, const double *, FINT, double *, FINT); |
|
100 |
|
|
101 |
void dgemv_(FCHAR, FINT, FINT, const double *, const double *, FINT, |
|
102 |
const double *, FINT, const double *, double *, FINT); |
|
103 |
|
|
104 |
void dswap_( FINT, double *, FINT, double *, FINT); |
|
105 |
|
|
106 |
void daxpy_( FINT, const double *, const double *, FINT, double *, FINT); |
|
107 |
|
|
108 |
void dnrm2_( FINT, const double *, FINT, double *); |
|
109 |
|
|
110 |
#else |
|
111 |
|
|
112 |
void strsv_( FCHAR, FCHAR, FCHAR, FINT, const float *, FINT, float *, FINT); |
|
113 |
|
|
114 |
void sgemv_(FCHAR, FINT, FINT, const float *, const float *, FINT, |
|
115 |
const float *, FINT, const float *, float *, FINT); |
|
116 |
|
|
117 |
void sswap_( FINT, float *, FINT, float *, FINT); |
|
118 |
|
|
119 |
void saxpy_( FINT, const float *, const float *, FINT, float *, FINT); |
|
120 |
|
|
121 |
void snrm2_( FINT, const float *, FINT, float *); |
|
122 |
|
|
123 |
#endif |
|
124 |
|
|
125 |
#endif |
|
126 |
|
|
127 |
/* Matrix with only defined triangular terms */ |
|
128 |
/* Even if there are 0 in matrix, must be defined at all ! */ |
|
129 |
|
|
130 |
/* Get from fortran.c */ |
|
131 |
|
|
132 |
#ifdef CUBLAS
/* Messages indexed by the CUBLAS_WRAPPER_ERROR_* codes. */
static const char *errMsg[5] =
{
    "no error",
    "allocation error",
    "setVector/setMatrix error",
    "getVector/getMatrix error",
    "not implemented"
};

/*
 * Reports a CUBLAS wrapper failure on stdout.
 *
 * funcName : suffix appended to "cublas" in the message
 * error    : one of the CUBLAS_WRAPPER_ERROR_* codes; any value outside the
 *            errMsg table is reported as "unknown error" instead of reading
 *            past the end of the table
 */
static void wrapperError (const char *funcName, int error)
{
    const int known = (int)(sizeof(errMsg) / sizeof(errMsg[0]));
    const char *message = "unknown error";

    if ((error >= 0) && (error < known)) {
        message = errMsg[error];
    }

    printf ("cublas%s wrapper: %s\n", funcName, message);
    /* Flush at once so the message is not lost if the program aborts. */
    fflush (stdout);
}
#endif
|
148 |
|
|
149 |
/*
 * Dumps a host vector on stdout, one "name[index]=value" line per element,
 * preceded by a "<mesg> of <name>, size <dim>:" header.
 * Compiled to a no-op when QUIET is defined. Always returns 0.
 */
int printVector(const int dimVector,const LENGTH *dataVector,
                char *nameVector,char *mesgVector)
{
#ifndef QUIET
  const LENGTH *cursor=dataVector;
  int position=0;

  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);

  while (position<dimVector)
    {
      printf("%s[%i]=%2.10e\n",nameVector,position,*cursor);
      cursor++;
      position++;
    }
#endif

  return 0;
}
|
164 |
|
|
165 |
/*
 * Same output format as printVector, but only active when RESULTS is
 * defined: prints a "<mesg> of <name>, size <dim>:" header followed by one
 * "name[index]=value" line per element. Always returns 0.
 */
int printResults(const int dimVector,const LENGTH *dataVector,
                 char *nameVector,char *mesgVector)
{
#ifdef RESULTS
  int entry;

  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);

  entry=0;
  while (entry<dimVector)
    {
      const LENGTH value=dataVector[entry];

      printf("%s[%i]=%2.10e\n",nameVector,entry,value);
      entry+=1;
    }
#endif
  return 0;
}
|
179 |
|
|
180 |
#ifdef CUBLAS
/*
 * Copies a device vector into a temporary host buffer with cublasGetVector
 * and prints it in the same format as printVector.
 * Compiled to a no-op when QUIET is defined.
 *
 * Returns 0 on success (or when QUIET is defined), 1 when the host buffer
 * cannot be allocated or the device read fails; in both failure cases the
 * error is reported through wrapperError and nothing is printed, so
 * uninitialised host memory is never shown.
 */
int printVectorGPU(const int dimVector,const LENGTH *dataVector,
                   char *nameVector,char *mesgVector)
{
#ifndef QUIET
  int i;
  cublasStatus stat;
  LENGTH *P=0;
  int incx=1;

  P=malloc(dimVector*sizeof(LENGTH));

  if (P == NULL) {
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_ALLOC);
    return 1;
  }

  stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx);

  if (stat != CUBLAS_STATUS_SUCCESS) {
    wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
    free(P);
    return 1;
  }

  printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
  for (i=0;i<dimVector;i++)
    {
      printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
    }

  free(P);
#endif

  return 0;
}
#endif
|
210 |
|
|
211 |
int bench(int dim,int RUNS) |
|
212 |
{ |
|
213 |
/* |
|
214 |
int dim=1000; |
|
215 |
int RUNS=100; |
|
216 |
*/ |
|
217 |
int incx=1; |
|
218 |
#ifdef PRINT |
|
219 |
LENGTH factor=1.; |
|
220 |
#endif |
|
221 |
|
|
222 |
LENGTH alpha=1.,beta=0.,beta2=-1.; |
|
223 |
LENGTH *A,*X,*Y; |
|
224 |
|
|
225 |
/* checkBefore checkAfter checks */ |
|
226 |
LENGTH *checksA,*checksB; |
|
227 |
|
|
228 |
int i=0, j=0; |
|
229 |
|
|
230 |
double duration; |
|
231 |
|
|
232 |
struct timeval tv1,tv2; |
|
233 |
struct timezone tz; |
|
234 |
|
|
235 |
/* Create 1 Matrix and 2 Vectors of dimension dim */ |
|
236 |
|
|
237 |
A=malloc(dim*dim*sizeof(LENGTH)); |
|
238 |
X=malloc(dim*sizeof(LENGTH)); |
|
239 |
Y=malloc(dim*sizeof(LENGTH)); |
|
240 |
|
|
241 |
/* Create 2 vectors for checker Before and After */ |
|
242 |
|
|
243 |
checksA=malloc(RUNS*sizeof(double)); |
|
244 |
checksB=malloc(RUNS*sizeof(double)); |
|
245 |
|
|
246 |
/* Initialize elements with random numbers */ |
|
247 |
/* Initialize the seed for rand() */ |
|
248 |
/* srand(time()); */ |
|
249 |
|
|
250 |
#ifdef UNIT |
|
251 |
/* Fill the matrix and vector with random numbers */ |
|
252 |
for (i=0; i<dim; i++) { |
|
253 |
for (j=0; j<dim; j++) |
|
254 |
if (j>=i) |
|
255 |
{ |
|
256 |
/* Normalization is necessary to avoid problems */ |
|
257 |
A[i*dim+j]=1.; |
|
258 |
} |
|
259 |
else |
|
260 |
{ |
|
261 |
A[i*dim+j]=0.; |
|
262 |
} |
|
263 |
X[i]=1; |
|
264 |
} |
|
265 |
#else |
|
266 |
for (i=0; i<dim; i++) { |
|
267 |
for (j=0; j<dim; j++) |
|
268 |
if (j>i) |
|
269 |
{ |
|
270 |
/* Normalization is necessary to avoid problems */ |
|
271 |
A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.) |
|
272 |
*(LENGTH)(i+1.)/(LENGTH)(j+1.); |
|
273 |
} |
|
274 |
else if (j==i) |
|
275 |
{ |
|
276 |
A[i*dim+j]=1.; |
|
277 |
} |
|
278 |
else |
|
279 |
{ |
|
280 |
A[i*dim+j]=0.; |
|
281 |
} |
|
282 |
X[i]=(LENGTH)rand()/(RAND_MAX+1.); |
|
283 |
} |
|
284 |
#endif |
|
285 |
|
|
286 |
/* Print the matrix */ |
|
287 |
|
|
288 |
#ifdef QUIET |
|
289 |
#else |
|
290 |
for (i=0; i<dim; i++) { |
|
291 |
for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]); |
|
292 |
printf("\tX[%i]=%1.5f ", i,X[i]); |
|
293 |
putchar('\n'); |
|
294 |
} |
|
295 |
putchar('\n'); |
|
296 |
#endif |
|
297 |
|
|
298 |
/* Get first timer before launching */ |
|
299 |
gettimeofday(&tv1, &tz); |
|
300 |
|
|
301 |
/* Compute with CuBLAS library */ |
|
302 |
#ifdef CUBLAS |
|
303 |
LENGTH *devPtrA=0, *devPtrX=0, *devPtrY=0; |
|
304 |
cublasStatus stat1, stat2, stat3; |
|
305 |
struct timeval tv3,tv4; |
|
306 |
|
|
307 |
/* Order is Row */ |
|
308 |
/* Have to swap uplo and trans */ |
|
309 |
char uplo='L',trans='T',diag='N'; |
|
310 |
|
|
311 |
printf("Using CuBLAS: %i iterations for %ix%i matrix\n", |
|
312 |
RUNS,dim,dim); |
|
313 |
|
|
314 |
stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA); |
|
315 |
stat2=cublasAlloc(dim,sizeof(devPtrX[0]),(void**)&devPtrX); |
|
316 |
stat3=cublasAlloc(dim,sizeof(devPtrY[0]),(void**)&devPtrY); |
|
317 |
|
|
318 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) || |
|
319 |
(stat2 != CUBLAS_STATUS_SUCCESS) || |
|
320 |
(stat3 != CUBLAS_STATUS_SUCCESS)) { |
|
321 |
wrapperError ("Dtrsv", CUBLAS_WRAPPER_ERROR_ALLOC); |
|
322 |
cublasFree (devPtrA); |
|
323 |
cublasFree (devPtrX); |
|
324 |
cublasFree (devPtrY); |
|
325 |
return 1; |
|
326 |
} |
|
327 |
|
|
328 |
stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim); |
|
329 |
stat2=cublasSetVector(dim,sizeof(X[0]),X,incx,devPtrX,incx); |
|
330 |
stat3=cublasSetVector(dim,sizeof(Y[0]),Y,incx,devPtrY,incx); |
|
331 |
|
|
332 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) || |
|
333 |
(stat2 != CUBLAS_STATUS_SUCCESS) || |
|
334 |
(stat3 != CUBLAS_STATUS_SUCCESS)) { |
|
335 |
wrapperError ("Dtrsv", CUBLAS_WRAPPER_ERROR_SET); |
|
336 |
cublasFree (devPtrA); |
|
337 |
cublasFree (devPtrX); |
|
338 |
cublasFree (devPtrY); |
|
339 |
return 1; |
|
340 |
} |
|
341 |
|
|
342 |
/* Get third timer after memory operation */ |
|
343 |
gettimeofday(&tv3, &tz); |
|
344 |
|
|
345 |
for (i=0;i<RUNS;i++) |
|
346 |
{ |
|
347 |
#ifdef DOUBLE |
|
348 |
|
|
349 |
printVectorGPU(dim,devPtrX,"X","Roots"); |
|
350 |
|
|
351 |
/* Multiply Y <- A.X */ |
|
352 |
cublasDgemv(trans,dim,dim,alpha,devPtrA,dim, |
|
353 |
devPtrX,incx,beta,devPtrY,incx); |
|
354 |
|
|
355 |
printVectorGPU(dim,devPtrY,"Y","Results"); |
|
356 |
|
|
357 |
/* Solve linear system A.X=Y : Y <- A-1.Y */ |
|
358 |
cublasDtrsv(uplo,trans,diag,dim,devPtrA,dim,devPtrY,incx); |
|
359 |
|
|
360 |
printVectorGPU(dim,devPtrY,"Y","Solutions"); |
|
361 |
|
|
362 |
/* Estimate the difference between X and Y : Y <- -Y+X */ |
|
363 |
cublasDaxpy(dim,beta2,devPtrY,incx,devPtrX,incx); |
|
364 |
|
|
365 |
printVectorGPU(dim,devPtrX,"X","Errors"); |
|
366 |
|
|
367 |
/* Estimate the second checker */ |
|
368 |
/* checksA[i]=(double)cublasDnrm2(dim,devPtrX,incx); */ |
|
369 |
|
|
370 |
/* Swap vector X and Y */ |
|
371 |
cublasDswap(dim,devPtrX,incx,devPtrY,incx); |
|
372 |
|
|
373 |
#else |
|
374 |
|
|
375 |
printVectorGPU(dim,devPtrX,"X","Roots"); |
|
376 |
|
|
377 |
/* Multiply Y <- A.X */ |
|
378 |
cublasSgemv(trans,dim,dim,alpha,devPtrA,dim, |
|
379 |
devPtrX,incx,beta,devPtrY,incx); |
|
380 |
|
|
381 |
printVectorGPU(dim,devPtrY,"Y","Results"); |
|
382 |
|
|
383 |
/* Solve linear system Y <- A-1.Y */ |
|
384 |
cublasStrsv(uplo,trans,diag,dim,devPtrA,dim,devPtrY,incx); |
|
385 |
|
|
386 |
printVectorGPU(dim,devPtrY,"Y","Solutions"); |
|
387 |
|
|
388 |
/* Add vectors X and -Y */ |
|
389 |
cublasSaxpy(dim,beta2,devPtrY,incx,devPtrX,incx); |
|
390 |
|
|
391 |
printVectorGPU(dim,devPtrX,"X","Errors"); |
|
392 |
|
|
393 |
/* Estimate the second checker */ |
|
394 |
/* checksA[i]=(double)cublasSnrm2(dim,devPtrX,incx); */ |
|
395 |
|
|
396 |
/* Swap vector X and Y */ |
|
397 |
cublasSswap(dim,devPtrX,incx,devPtrY,incx); |
|
398 |
|
|
399 |
#endif |
|
400 |
|
|
401 |
} |
|
402 |
|
|
403 |
stat1=cublasGetMatrix(dim,dim,sizeof(A[0]),devPtrA,dim,A,dim); |
|
404 |
stat2=cublasGetVector(dim,sizeof(X[0]),devPtrX,incx,X,incx); |
|
405 |
stat3=cublasGetVector(dim,sizeof(Y[0]),devPtrY,incx,Y,incx); |
|
406 |
|
|
407 |
cublasFree (devPtrA); |
|
408 |
cublasFree (devPtrX); |
|
409 |
cublasFree (devPtrY); |
|
410 |
|
|
411 |
if ((stat1 != CUBLAS_STATUS_SUCCESS) || |
|
412 |
(stat2 != CUBLAS_STATUS_SUCCESS) || |
|
413 |
(stat3 != CUBLAS_STATUS_SUCCESS)) { |
|
414 |
wrapperError ("LinearSystem", CUBLAS_WRAPPER_ERROR_GET); |
|
415 |
} |
|
416 |
|
|
417 |
/* Get fourth timer after memory free */ |
|
418 |
gettimeofday(&tv4, &tz); |
|
419 |
|
|
420 |
#elif THUNKING |
|
421 |
|
|
422 |
/* Order is Row : Have to swap uplo='U' and trans='N' */ |
|
423 |
char uplo='L',trans='T',diag='N'; |
|
424 |
printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n", |
|
425 |
RUNS,dim,dim); |
|
426 |
|
|
427 |
for (i=0;i<RUNS;i++) |
|
428 |
{ |
|
429 |
#ifdef DOUBLE |
|
430 |
|
|
431 |
printVector(dim,X,"X","Roots"); |
|
432 |
|
|
433 |
/* Multiply A by X as Y <- A.X */ |
|
434 |
CUBLAS_DGEMV(&trans,&dim,&dim,&alpha,A,&dim,X,&incx,&beta,Y,&incx); |
|
435 |
|
|
436 |
printVector(dim,Y,"Y","Results"); |
|
437 |
|
|
438 |
/* Solve linear system */ |
|
439 |
CUBLAS_DTRSV(&uplo,&trans,&diag,&dim,A,&dim,Y,&incx); |
|
440 |
|
|
441 |
printVector(dim,Y,"Y","Solutions"); |
|
442 |
|
|
443 |
/* Compare the roots X and Y */ |
|
444 |
CUBLAS_DAXPY(&dim,&beta2,Y,&incx,X,&incx); |
|
445 |
|
|
446 |
printVector(dim,X,"X","Errors"); |
|
447 |
|
|
448 |
/* Store the checker of errors */ |
|
449 |
/* checksA[i]=(double)CUBLAS_DNRM2(&dim,X,&incx); */ |
|
450 |
|
|
451 |
/* Swap vector X and Y */ |
|
452 |
CUBLAS_DSWAP(&dim,X,&incx,Y,&incx); |
|
453 |
#else |
|
454 |
|
|
455 |
printVector(dim,X,"X","Roots"); |
|
456 |
|
|
457 |
/* Multiply A by X as Y <- A.X */ |
|
458 |
CUBLAS_SGEMV(&trans,&dim,&dim,&alpha,A,&dim,X,&incx,&beta,Y,&incx); |
|
459 |
|
|
460 |
printVector(dim,Y,"Y","Results"); |
|
461 |
|
|
462 |
/* Solve linear system */ |
|
463 |
CUBLAS_STRSV(&uplo,&trans,&diag,&dim,A,&dim,Y,&incx); |
|
464 |
|
|
465 |
printVector(dim,Y,"Y","Solutions"); |
|
466 |
|
|
467 |
/* Compare the roots X and Y */ |
|
468 |
CUBLAS_SAXPY(&dim,&beta2,Y,&incx,X,&incx); |
|
469 |
|
|
470 |
printVector(dim,X,"X","Errors"); |
|
471 |
|
|
472 |
/* Store the checker of errors */ |
|
473 |
/* checksA[i]=(double)CUBLAS_SNRM2(&dim,X,&incx); */ |
|
474 |
|
|
475 |
/* Swap vector X and Y */ |
|
476 |
CUBLAS_SSWAP(&dim,X,&incx,Y,&incx); |
|
477 |
#endif |
|
478 |
|
|
479 |
#ifdef PRINT |
|
480 |
printf("Iteration %i, checker is %2.5f and error is %2.10f\n", |
|
481 |
i,checksA[i],fabs(checksB[i]-checksA[i])/factor); |
|
482 |
#endif |
|
483 |
} |
|
484 |
|
|
485 |
#elif FBLAS |
|
486 |
|
|
487 |
/* Order is Row : Have to swap uplo='U' and trans='N' */ |
|
488 |
char uplo='L',trans='T',diag='N'; |
|
489 |
|
|
490 |
printf("Using FBLAS: %i iterations for %ix%i matrix\n", |
|
491 |
RUNS,dim,dim); |
|
492 |
|
|
493 |
for (i=0;i<RUNS;i++) |
|
494 |
{ |
|
495 |
#ifdef DOUBLE |
|
496 |
|
|
497 |
printVector(dim,X,"X","Roots"); |
|
498 |
|
|
499 |
/* Multiply A by X as Y <- A.X */ |
|
500 |
dgemv_(&trans,&dim,&dim,&alpha,A,&dim,X,&incx,&beta,Y,&incx); |
|
501 |
|
|
502 |
printVector(dim,Y,"Y","Results"); |
|
503 |
|
|
504 |
/* Solve linear system */ |
|
505 |
dtrsv_(&uplo,&trans,&diag,&dim,A,&dim,Y,&incx); |
|
506 |
|
|
507 |
printVector(dim,Y,"Y","Solutions"); |
|
508 |
|
|
509 |
/* Compare the roots X and Y */ |
|
510 |
daxpy_(&dim,&beta2,Y,&incx,X,&incx); |
|
511 |
|
|
512 |
printVector(dim,X,"X","Errors"); |
|
513 |
|
|
514 |
/* Store the checker of errors */ |
|
515 |
/* dnrm2_(&dim,X,&incx,&checksA[i]); */ |
|
516 |
|
|
517 |
/* Swap vector X and Y */ |
|
518 |
dswap_(&dim,X,&incx,Y,&incx); |
|
519 |
|
|
520 |
#else |
|
521 |
|
|
522 |
printVector(dim,X,"X","Roots"); |
|
523 |
|
|
524 |
/* Multiply A by X as Y <- A.X */ |
|
525 |
sgemv_(&trans,&dim,&dim,&alpha,A,&dim,X,&incx,&beta,Y,&incx); |
|
526 |
|
|
527 |
printVector(dim,Y,"Y","Results"); |
|
528 |
|
|
529 |
/* Solve linear system */ |
|
530 |
strsv_(&uplo,&trans,&diag,&dim,A,&dim,Y,&incx); |
|
531 |
|
|
532 |
printVector(dim,Y,"Y","Solutions"); |
|
533 |
|
|
534 |
/* Compare the roots X and Y */ |
|
535 |
saxpy_(&dim,&beta2,Y,&incx,X,&incx); |
|
536 |
|
|
537 |
printVector(dim,X,"X","Errors"); |
|
538 |
|
|
539 |
/* Store the checker of errors */ |
|
540 |
/* snrm2_(&dim,X,&incx,&checksA[i]); */ |
|
541 |
|
|
542 |
/* Swap vector X and Y */ |
|
543 |
sswap_(&dim,X,&incx,Y,&incx); |
|
544 |
#endif |
|
545 |
|
|
546 |
} |
|
547 |
|
|
548 |
#elif ACML |
|
549 |
|
|
550 |
/* Order is Row : Have to swap uplo='U' and trans='N' */ |
|
551 |
char uplo='L',trans='T',diag='N'; |
|
552 |
|
|
553 |
printf("Using ACML: %i iterations for %ix%i matrix\n", |
|
554 |
RUNS,dim,dim); |
|
555 |
|
|
556 |
for (i=0;i<RUNS;i++) |
|
557 |
{ |
|
558 |
#ifdef DOUBLE |
|
559 |
|
|
560 |
printVector(dim,X,"X","Roots"); |
|
561 |
|
|
562 |
/* Multiply A by X as Y <- A.X */ |
|
563 |
dgemv(trans,dim,dim,alpha,A,dim,X,incx,beta,Y,incx); |
|
564 |
|
|
565 |
printVector(dim,Y,"Y","Results"); |
|
566 |
|
|
567 |
/* Solve linear system */ |
|
568 |
dtrsv(uplo,trans,diag,dim,A,dim,Y,incx); |
|
569 |
|
|
570 |
printVector(dim,Y,"Y","Solutions"); |
|
571 |
|
|
572 |
/* Compare the roots X and Y */ |
|
573 |
daxpy(dim,beta2,Y,incx,X,incx); |
|
574 |
|
|
575 |
printVector(dim,X,"X","Errors"); |
|
576 |
|
|
577 |
/* Store the checker of errors */ |
|
578 |
/* dnrm2_(&dim,X,&incx,&checksA[i]); */ |
|
579 |
|
|
580 |
/* Swap vector X and Y */ |
|
581 |
dswap(dim,X,incx,Y,incx); |
|
582 |
|
|
583 |
#else |
|
584 |
|
|
585 |
printVector(dim,X,"X","Roots"); |
|
586 |
|
|
587 |
/* Multiply A by X as Y <- A.X */ |
|
588 |
sgemv(trans,dim,dim,alpha,A,dim,X,incx,beta,Y,incx); |
|
589 |
|
|
590 |
printVector(dim,Y,"Y","Results"); |
|
591 |
|
|
592 |
/* Solve linear system */ |
|
593 |
strsv(uplo,trans,diag,dim,A,dim,Y,incx); |
|
594 |
|
|
595 |
printVector(dim,Y,"Y","Solutions"); |
|
596 |
|
|
597 |
/* Compare the roots X and Y */ |
|
598 |
saxpy(dim,beta2,Y,incx,X,incx); |
|
599 |
|
|
600 |
printVector(dim,X,"X","Errors"); |
|
601 |
|
|
602 |
/* Store the checker of errors */ |
|
603 |
/* snrm2_(&dim,X,&incx,&checksA[i]); */ |
|
604 |
|
|
605 |
/* Swap vector X and Y */ |
|
606 |
sswap(dim,X,incx,Y,incx); |
|
607 |
#endif |
|
608 |
|
|
609 |
} |
|
610 |
|
|
611 |
#elif GSL |
|
612 |
|
|
613 |
printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim); |
|
614 |
|
|
615 |
/* |
|
616 |
RowMajor : Matrix is read row by row |
|
617 |
Upper : the no null elements are on top |
|
618 |
NoTrans : no transposition before estimation |
|
619 |
NonUnit : Matrix is not unit |
|
620 |
*/ |
|
621 |
|
|
622 |
for (i=0;i<RUNS;i++) |
|
623 |
{ |
|
624 |
|
|
625 |
#ifdef DOUBLE |
|
626 |
|
|
627 |
printVector(dim,X,"X","Roots"); |
|
628 |
|
|
629 |
/* Multiply A by X as Y <- A.X */ |
|
630 |
cblas_dgemv(CblasRowMajor,CblasNoTrans, |
|
631 |
dim,dim,alpha,A,dim,X,incx,beta,Y,incx); |
|
632 |
|
|
633 |
printVector(dim,Y,"Y","Results"); |
|
634 |
|
|
635 |
/* Solve linear system : Y <- A-1.Y */ |
|
636 |
cblas_dtrsv(CblasRowMajor,CblasUpper,CblasNoTrans,CblasNonUnit, |
|
637 |
dim,A,dim,Y,incx); |
|
638 |
|
|
639 |
printVector(dim,Y,"Y","Solutions"); |
|
640 |
|
|
641 |
cblas_daxpy(dim,beta2,Y,incx,X,incx); |
|
642 |
|
|
643 |
printVector(dim,X,"X","Errors"); |
|
644 |
|
|
645 |
/* Store the checker of errors */ |
|
646 |
/* checksA[i]=(double)cblas_dnrm2(dim,X,incx); */ |
|
647 |
|
|
648 |
cblas_dswap(dim,X,incx,Y,incx); |
|
649 |
|
|
650 |
#else |
|
651 |
|
|
652 |
printVector(dim,X,"X","Roots"); |
|
653 |
|
|
654 |
/* Multiply A by X as Y <- A.X */ |
|
655 |
cblas_sgemv(CblasRowMajor,CblasNoTrans, |
|
656 |
dim,dim,alpha,A,dim,X,incx,beta,Y,incx); |
|
657 |
|
|
658 |
printVector(dim,Y,"Y","Results"); |
|
659 |
|
|
660 |
/* Solve linear system : Y <- A-1.Y */ |
|
661 |
cblas_strsv(CblasRowMajor,CblasUpper,CblasNoTrans,CblasNonUnit, |
|
662 |
dim,A,dim,Y,incx); |
|
663 |
|
|
664 |
printVector(dim,Y,"Y","Solutions"); |
|
665 |
|
|
666 |
cblas_saxpy(dim,beta2,Y,incx,X,incx); |
|
667 |
|
|
668 |
printVector(dim,X,"X","Errors"); |
|
669 |
|
|
670 |
/* Store the checker of errors */ |
|
671 |
/* checksA[i]=(double)cblas_snrm2(dim,X,incx); */ |
|
672 |
|
|
673 |
cblas_sswap(dim,X,incx,Y,incx); |
|
674 |
|
|
675 |
#endif |
|
676 |
|
|
677 |
} |
|
678 |
#else |
|
679 |
|
|
680 |
printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim); |
|
681 |
|
|
682 |
/* |
|
683 |
RowMajor : Matrix is read row bu row |
|
684 |
Upper : the no null elements are on top |
|
685 |
NoTrans : no transposition before estimation |
|
686 |
NonUnit : Matrix is not unit |
|
687 |
*/ |
|
688 |
|
|
689 |
for (i=0;i<RUNS;i++) |
|
690 |
{ |
|
691 |
|
|
692 |
#ifdef DOUBLE |
|
693 |
|
|
694 |
printVector(dim,X,"X","Roots"); |
|
695 |
|
|
696 |
/* Multiply A by X as Y <- A.X */ |
|
697 |
cblas_dgemv(CblasRowMajor,CblasNoTrans, |
|
698 |
dim,dim,alpha,A,dim,X,incx,beta,Y,incx); |
|
699 |
|
|
700 |
printVector(dim,Y,"Y","Results"); |
|
701 |
|
|
702 |
/* Solve linear system : Y <- A-1.Y */ |
|
703 |
cblas_dtrsv(CblasRowMajor,CblasUpper,CblasNoTrans,CblasNonUnit, |
|
704 |
dim,A,dim,Y,incx); |
|
705 |
|
|
706 |
printVector(dim,Y,"Y","Solutions"); |
|
707 |
|
|
708 |
cblas_daxpy(dim,beta2,Y,incx,X,incx); |
|
709 |
|
|
710 |
printVector(dim,X,"X","Errors"); |
|
711 |
|
|
712 |
/* Store the checker of errors */ |
|
713 |
/* checksA[i]=(double)cblas_dnrm2(dim,X,incx); */ |
|
714 |
|
|
715 |
cblas_dswap(dim,X,incx,Y,incx); |
|
716 |
|
|
717 |
#else |
|
718 |
|
|
719 |
printVector(dim,X,"X","Roots"); |
|
720 |
|
|
721 |
/* Multiply A by X as Y <- A.X */ |
|
722 |
cblas_sgemv(CblasRowMajor,CblasNoTrans, |
|
723 |
dim,dim,alpha,A,dim,X,incx,beta,Y,incx); |
|
724 |
|
|
725 |
printVector(dim,Y,"Y","Results"); |
|
726 |
|
|
727 |
/* Solve linear system : Y <- A-1.Y */ |
|
728 |
cblas_strsv(CblasRowMajor,CblasUpper,CblasNoTrans,CblasNonUnit, |
|
729 |
dim,A,dim,Y,incx); |
|
730 |
|
|
731 |
printVector(dim,Y,"Y","Solutions"); |
|
732 |
|
|
733 |
cblas_saxpy(dim,beta2,Y,incx,X,incx); |
|
734 |
|
|
735 |
printVector(dim,X,"X","Errors"); |
|
736 |
|
|
737 |
/* Store the checker of errors */ |
|
738 |
/* checksA[i]=(double)cblas_snrm2(dim,X,incx); */ |
|
739 |
|
|
740 |
cblas_sswap(dim,X,incx,Y,incx); |
|
741 |
|
|
742 |
#endif |
|
743 |
|
|
744 |
} |
|
745 |
#endif |
|
746 |
putchar('\n'); |
|
747 |
|
|
748 |
/* Get second timer after launching */ |
|
749 |
gettimeofday(&tv2, &tz); |
|
750 |
|
|
751 |
#ifdef CUBLAS |
|
752 |
double memoryIn,memoryOut; |
|
753 |
|
|
754 |
memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
|
755 |
(tv3.tv_usec-tv1.tv_usec))/1000000.; |
|
756 |
|
|
757 |
memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
|
758 |
(tv2.tv_usec-tv4.tv_usec))/1000000.; |
|
759 |
|
|
760 |
duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
|
761 |
(tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
|
762 |
|
|
763 |
printf("Duration of memory allocation : %2.10f s\n",memoryIn); |
|
764 |
printf("Duration of memory free : %2.10f s\n",memoryOut); |
|
765 |
#else |
|
766 |
duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + \ |
|
767 |
(tv2.tv_usec-tv1.tv_usec))/1000000./RUNS; |
|
768 |
|
|
769 |
#endif |
|
770 |
|
|
771 |
printf("Duration of each cycle : %2.10f s\n",duration); |
|
772 |
|
|
773 |
printResults(RUNS,checksA,"C","Errors cumulated"); |
|
774 |
|
|
775 |
putchar('\n'); |
|
776 |
|
|
777 |
/* |
|
778 |
#ifdef PRINT |
|
779 |
for (i=0; i<dim; i++) { |
|
780 |
for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]); |
|
781 |
putchar('\n'); |
|
782 |
} |
|
783 |
|
|
784 |
for (i=0; i<dim; i++) { |
|
785 |
printf("X[%i]=%2.5f",i,X[i]); |
|
786 |
putchar('\n'); |
|
787 |
} |
|
788 |
putchar('\n'); |
|
789 |
for (i=0; i<dim; i++) { |
|
790 |
printf("Y[%i]=%2.5f",i,Y[i]); |
|
791 |
putchar('\n'); |
|
792 |
} |
|
793 |
#endif |
|
794 |
*/ |
|
795 |
|
|
796 |
return 0; |
|
797 |
} |
|
798 |
|
|
799 |
/*
 * Command line entry point.
 *
 *   argv[1] : size of the triangular system (at least 2)
 *   argv[2] : number of iterations (at least 1)
 *
 * Without arguments, or with -h / --help, the usage text is printed and the
 * program exits with 0. When the second argument is missing or a value is
 * out of range, the usage text is printed and the program exits with 1
 * (argv[2] is no longer read when it is absent). Otherwise the status
 * returned by bench() becomes the exit status.
 */
int main(int argc,char **argv)
{
  const char *usage=
    "\nPerforms a bench using BLAS library implementation:\n\n"
    "\t#1 Size on triangular system\n"
    "\t#2 Number of iterations\n\n";

  if ((argc<2)||
      (strcmp(argv[1],"-h")==0)||
      (strcmp(argv[1],"--help")==0))
    {
      printf("%s",usage);
      return 0;
    }

  /* Both arguments must be present before they are parsed */
  if ((argc>=3)&&
      (atoi(argv[1])>=2)&&
      (atoi(argv[2])>=1))
    {
      return bench(atoi(argv[1]),atoi(argv[2]));
    }

  printf("%s",usage);
  return 1;
}
BLAS/xTRSV/bench.sh (revision 1) | ||
---|---|---|
#!/bin/sh
#
# Runs every build of the xTRSV benchmark for system sizes from SIZE to MAX
# (step STEP) and appends one line per size to /tmp/<BENCH>_<FORMAT>_<backend>.out:
#
#   <size> TAB <duration of each cycle>
#
# The cublas build reports three durations; its lines are
#
#   <size> TAB <cycle> TAB <memory allocation> TAB <memory free>

# Base name and precision tag of the executables produced by the Makefile
# (<BENCH>_<FORMAT>_<backend>, FORMAT being SP or DP).
BENCH=xTRSV
FORMAT=DP

# Iterations per run, then first size, step and last size.
NUMBER=100
SIZE=1000
STEP=1000
MAX=32000

BACKENDS="cblas fblas gsl thunking cublas"

# outfile BACKEND: path of the result file of one backend.
outfile() {
    printf '/tmp/%s_%s_%s.out' "$BENCH" "$FORMAT" "$1"
}

# durations BACKEND SIZE: runs one benchmark and prints the number found
# after the colon of each "Duration ..." line it reports. A single value is
# printed as is; three values (cublas: allocation, free, cycle) are printed
# tab-separated as cycle, allocation, free.
durations() {
    "./${BENCH}_${FORMAT}_$1" "$2" "$NUMBER" |
        awk -F: '
            /Duration/ { split($2, field, " "); value[++count] = field[1] }
            END {
                if (count == 3)
                    printf "%s\t%s\t%s\n", value[3], value[1], value[2]
                else if (count >= 1)
                    print value[1]
            }'
}

# Start from empty result files.
for backend in $BACKENDS
do
    : > "$(outfile "$backend")"
done

while [ "$SIZE" -le "$MAX" ]
do
    for backend in $BACKENDS
    do
        printf '%s\t%s\n' "$SIZE" "$(durations "$backend" "$SIZE")" \
            >> "$(outfile "$backend")"
    done

    SIZE=$((SIZE + STEP))
done
|
0 | 47 |
BLAS/xTRSV/fortran_common.h.orig (revision 1) | ||
---|---|---|
1 |
/* |
|
2 |
* Copyright 1993-2011 NVIDIA Corporation. All rights reserved. |
|
3 |
* |
|
4 |
* NOTICE TO LICENSEE: |
|
5 |
* |
|
6 |
* This source code and/or documentation ("Licensed Deliverables") are |
|
7 |
* subject to NVIDIA intellectual property rights under U.S. and |
|
8 |
* international Copyright laws. |
|
9 |
* |
|
10 |
* These Licensed Deliverables contained herein is PROPRIETARY and |
|
11 |
* CONFIDENTIAL to NVIDIA and is being provided under the terms and |
|
12 |
* conditions of a form of NVIDIA software license agreement by and |
|
13 |
* between NVIDIA and Licensee ("License Agreement") or electronically |
|
14 |
* accepted by Licensee. Notwithstanding any terms or conditions to |
|
15 |
* the contrary in the License Agreement, reproduction or disclosure |
|
16 |
* of the Licensed Deliverables to any third party without the express |
|
17 |
* written consent of NVIDIA is prohibited. |
|
18 |
* |
|
19 |
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE |
|
20 |
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE |
|
21 |
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS |
|
22 |
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. |
|
23 |
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED |
|
24 |
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, |
|
25 |
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. |
|
26 |
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE |
|
27 |
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY |
|
28 |
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY |
|
29 |
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
|
30 |
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
|
31 |
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
|
32 |
* OF THESE LICENSED DELIVERABLES. |
|
33 |
* |
|
34 |
* U.S. Government End Users. These Licensed Deliverables are a |
|
35 |
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT |
|
36 |
* 1995), consisting of "commercial computer software" and "commercial |
|
37 |
* computer software documentation" as such terms are used in 48 |
|
38 |
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government |
|
39 |
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and |
|
40 |
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all |
|
41 |
* U.S. Government End Users acquire the Licensed Deliverables with |
|
42 |
* only those rights set forth herein. |
|
43 |
* |
|
44 |
* Any use of the Licensed Deliverables in individual and commercial |
|
45 |
* software must include, in the user documentation and internal |
|
46 |
* comments to the code, the above Disclaimer and U.S. Government End |
|
47 |
* Users Notice. |
|
48 |
*/ |
|
49 |
|
|
50 |
#define CUBLAS_G77 1 |
|
51 |
#define CUBLAS_INTEL_FORTRAN 2 |
|
52 |
#define CUBLAS_G95 3 |
|
53 |
|
|
54 |
/* Default to g95 on Linux and Mac OS X, and Intel Fortran on Win32 */ |
|
55 |
#if defined(_WIN32) |
|
56 |
#define CUBLAS_FORTRAN_COMPILER CUBLAS_INTEL_FORTRAN |
|
57 |
#elif defined(__linux) |
|
58 |
#define CUBLAS_FORTRAN_COMPILER CUBLAS_G95 |
|
59 |
#elif defined(__APPLE__) |
|
60 |
#define CUBLAS_FORTRAN_COMPILER CUBLAS_G95 |
|
61 |
#define RETURN_COMPLEX 1 |
|
62 |
#else |
|
63 |
#error unsupported platform |
|
64 |
#endif |
|
65 |
|
|
66 |
#if (CUBLAS_FORTRAN_COMPILER==CUBLAS_G77) || (CUBLAS_FORTRAN_COMPILER==CUBLAS_G95) |
|
67 |
/* NOTE: Must use -fno-second-underscore when building Fortran source with g77 |
|
68 |
* g77 invocation may not use -fno-f2c, which forces different return |
|
69 |
* type conventions than the one used below |
|
70 |
*/ |
|
71 |
#define CUBLAS_INIT cublas_init_ |
|
72 |
#define CUBLAS_SHUTDOWN cublas_shutdown_ |
|
73 |
#define CUBLAS_ALLOC cublas_alloc_ |
|
74 |
#define CUBLAS_FREE cublas_free_ |
|
75 |
#define CUBLAS_SET_VECTOR cublas_set_vector_ |
|
76 |
#define CUBLAS_GET_VECTOR cublas_get_vector_ |
|
77 |
#define CUBLAS_SET_MATRIX cublas_set_matrix_ |
|
78 |
#define CUBLAS_GET_MATRIX cublas_get_matrix_ |
|
79 |
#define CUBLAS_GET_ERROR cublas_get_error_ |
|
80 |
#define CUBLAS_XERBLA cublas_xerbla_ |
|
81 |
#define CUBLAS_ISAMAX cublas_isamax_ |
|
82 |
#define CUBLAS_ISAMIN cublas_isamin_ |
|
83 |
#define CUBLAS_SASUM cublas_sasum_ |
|
84 |
#define CUBLAS_SAXPY cublas_saxpy_ |
|
85 |
#define CUBLAS_SCOPY cublas_scopy_ |
|
86 |
#define CUBLAS_SDOT cublas_sdot_ |
|
87 |
#define CUBLAS_SNRM2 cublas_snrm2_ |
|
88 |
#define CUBLAS_SROT cublas_srot_ |
|
89 |
#define CUBLAS_SROTG cublas_srotg_ |
|
90 |
#define CUBLAS_SROTM cublas_srotm_ |
|
91 |
#define CUBLAS_SROTMG cublas_srotmg_ |
|
92 |
#define CUBLAS_SSCAL cublas_sscal_ |
|
93 |
#define CUBLAS_SSWAP cublas_sswap_ |
|
94 |
#define CUBLAS_CAXPY cublas_caxpy_ |
|
95 |
#define CUBLAS_CCOPY cublas_ccopy_ |
|
96 |
#define CUBLAS_CROT cublas_crot_ |
|
97 |
#define CUBLAS_CROTG cublas_crotg_ |
|
98 |
#define CUBLAS_CSCAL cublas_cscal_ |
|
99 |
#define CUBLAS_CSROT cublas_csrot_ |
|
100 |
#define CUBLAS_CSSCAL cublas_csscal_ |
|
101 |
#define CUBLAS_CSWAP cublas_cswap_ |
|
102 |
#define CUBLAS_CTRMV cublas_ctrmv_ |
|
103 |
#define CUBLAS_CDOTU cublas_cdotu_ |
|
104 |
#define CUBLAS_CDOTC cublas_cdotc_ |
|
105 |
#define CUBLAS_ICAMAX cublas_icamax_ |
|
106 |
#define CUBLAS_SCASUM cublas_scasum_ |
|
107 |
#define CUBLAS_SCNRM2 cublas_scnrm2_ |
|
108 |
#define CUBLAS_SGBMV cublas_sgbmv_ |
|
109 |
#define CUBLAS_SGEMV cublas_sgemv_ |
|
110 |
#define CUBLAS_SGER cublas_sger_ |
|
111 |
#define CUBLAS_SSBMV cublas_ssbmv_ |
|
112 |
#define CUBLAS_SSPMV cublas_sspmv_ |
|
113 |
#define CUBLAS_SSPR cublas_sspr_ |
|
114 |
#define CUBLAS_SSPR2 cublas_sspr2_ |
|
115 |
#define CUBLAS_SSYMV cublas_ssymv_ |
|
116 |
#define CUBLAS_SSYR cublas_ssyr_ |
|
117 |
#define CUBLAS_SSYR2 cublas_ssyr2_ |
|
118 |
#define CUBLAS_STBMV cublas_stbmv_ |
|
119 |
#define CUBLAS_STBSV cublas_stbsv_ |
|
120 |
#define CUBLAS_STPMV cublas_stpmv_ |
|
121 |
#define CUBLAS_STPSV cublas_stpsv_ |
|
122 |
#define CUBLAS_STRMV cublas_strmv_ |
|
123 |
#define CUBLAS_STRSV cublas_strsv_ |
|
124 |
#define CUBLAS_SGEMM cublas_sgemm_ |
|
125 |
#define CUBLAS_SSYMM cublas_ssymm_ |
|
126 |
#define CUBLAS_SSYR2K cublas_ssyr2k_ |
|
127 |
#define CUBLAS_SSYRK cublas_ssyrk_ |
|
128 |
#define CUBLAS_STRMM cublas_strmm_ |
|
129 |
#define CUBLAS_STRSM cublas_strsm_ |
|
130 |
#define CUBLAS_CGEMM cublas_cgemm_ |
|
131 |
#define CUBLAS_CHEMM cublas_chemm_ |
|
132 |
#define CUBLAS_CSYMM cublas_csymm_ |
|
133 |
#define CUBLAS_CTRMM cublas_ctrmm_ |
|
134 |
#define CUBLAS_CTRSM cublas_ctrsm_ |
|
135 |
#define CUBLAS_CHERK cublas_cherk_ |
|
136 |
#define CUBLAS_CSYRK cublas_csyrk_ |
|
137 |
#define CUBLAS_CHER2K cublas_cher2k_ |
|
138 |
#define CUBLAS_CSYR2K cublas_csyr2k_ |
|
139 |
#define CUBLAS_IDAMAX cublas_idamax_ |
|
140 |
#define CUBLAS_IDAMIN cublas_idamin_ |
|
141 |
#define CUBLAS_DASUM cublas_dasum_ |
|
142 |
#define CUBLAS_DAXPY cublas_daxpy_ |
|
143 |
#define CUBLAS_DCOPY cublas_dcopy_ |
|
144 |
#define CUBLAS_DDOT cublas_ddot_ |
|
145 |
#define CUBLAS_DNRM2 cublas_dnrm2_ |
|
146 |
#define CUBLAS_DROT cublas_drot_ |
|
147 |
#define CUBLAS_DROTG cublas_drotg_ |
|
148 |
#define CUBLAS_DROTM cublas_drotm_ |
|
149 |
#define CUBLAS_DROTMG cublas_drotmg_ |
|
150 |
#define CUBLAS_DSCAL cublas_dscal_ |
|
151 |
#define CUBLAS_DSWAP cublas_dswap_ |
|
152 |
#define CUBLAS_ZAXPY cublas_zaxpy_ |
|
153 |
#define CUBLAS_ZCOPY cublas_zcopy_ |
|
154 |
#define CUBLAS_ZROT cublas_zrot_ |
|
155 |
#define CUBLAS_ZROTG cublas_zrotg_ |
|
156 |
#define CUBLAS_ZSCAL cublas_zscal_ |
|
157 |
#define CUBLAS_ZDROT cublas_zdrot_ |
|
158 |
#define CUBLAS_ZDSCAL cublas_zdscal_ |
|
159 |
#define CUBLAS_ZSWAP cublas_zswap_ |
|
160 |
#define CUBLAS_ZDOTU cublas_zdotu_ |
|
161 |
#define CUBLAS_ZDOTC cublas_zdotc_ |
|
162 |
#define CUBLAS_IZAMAX cublas_izamax_ |
|
163 |
#define CUBLAS_DZASUM cublas_dzasum_ |
|
164 |
#define CUBLAS_DZNRM2 cublas_dznrm2_ |
|
165 |
#define CUBLAS_DGBMV cublas_dgbmv_ |
|
166 |
#define CUBLAS_DGEMV cublas_dgemv_ |
|
167 |
#define CUBLAS_ZGEMV cublas_zgemv_ |
|
168 |
#define CUBLAS_DGER cublas_dger_ |
|
169 |
#define CUBLAS_DSBMV cublas_dsbmv_ |
|
170 |
#define CUBLAS_DSPMV cublas_dspmv_ |
|
171 |
#define CUBLAS_DSPR cublas_dspr_ |
|
172 |
#define CUBLAS_DSPR2 cublas_dspr2_ |
|
173 |
#define CUBLAS_DSYMV cublas_dsymv_ |
|
174 |
#define CUBLAS_DSYR cublas_dsyr_ |
|
175 |
#define CUBLAS_DSYR2 cublas_dsyr2_ |
|
176 |
#define CUBLAS_DTBMV cublas_dtbmv_ |
|
177 |
#define CUBLAS_DTBSV cublas_dtbsv_ |
|
178 |
#define CUBLAS_DTPMV cublas_dtpmv_ |
|
179 |
#define CUBLAS_DTPSV cublas_dtpsv_ |
|
180 |
#define CUBLAS_DTRMV cublas_dtrmv_ |
|
181 |
#define CUBLAS_DTRSV cublas_dtrsv_ |
|
182 |
#define CUBLAS_DGEMM cublas_dgemm_ |
|
183 |
#define CUBLAS_DSYMM cublas_dsymm_ |
|
184 |
#define CUBLAS_DSYR2K cublas_dsyr2k_ |
|
185 |
#define CUBLAS_DSYRK cublas_dsyrk_ |
|
186 |
#define CUBLAS_ZSYRK cublas_zsyrk_ |
|
187 |
#define CUBLAS_DTRMM cublas_dtrmm_ |
|
188 |
#define CUBLAS_DTRSM cublas_dtrsm_ |
|
189 |
#define CUBLAS_ZGEMM cublas_zgemm_ |
|
190 |
#define CUBLAS_ZHEMM cublas_zhemm_ |
|
191 |
#define CUBLAS_ZSYMM cublas_zsymm_ |
|
192 |
#define CUBLAS_ZTRMM cublas_ztrmm_ |
|
193 |
#define CUBLAS_ZTRSM cublas_ztrsm_ |
|
194 |
#define CUBLAS_ZHERK cublas_zherk_ |
|
195 |
#define CUBLAS_ZSYRK cublas_zsyrk_ |
|
196 |
#define CUBLAS_ZHER2K cublas_zher2k_ |
|
197 |
#define CUBLAS_ZSYR2K cublas_zsyr2k_ |
|
198 |
|
|
199 |
#define CUBLAS_CGEMV cublas_cgemv_ |
|
200 |
#define CUBLAS_CGBMV cublas_cgbmv_ |
|
201 |
#define CUBLAS_CHEMV cublas_chemv_ |
|
202 |
#define CUBLAS_CHBMV cublas_chbmv_ |
|
203 |
#define CUBLAS_CHPMV cublas_chpmv_ |
|
204 |
#define CUBLAS_CTBMV cublas_ctbmv_ |
|
205 |
#define CUBLAS_CTPMV cublas_ctpmv_ |
|
206 |
#define CUBLAS_CTRSV cublas_ctrsv_ |
|
207 |
#define CUBLAS_CTBSV cublas_ctbsv_ |
|
208 |
#define CUBLAS_CTPSV cublas_ctpsv_ |
|
209 |
#define CUBLAS_CGERC cublas_cgerc_ |
|
210 |
#define CUBLAS_CGERU cublas_cgeru_ |
|
211 |
#define CUBLAS_CHPR cublas_chpr_ |
|
212 |
#define CUBLAS_CHPR2 cublas_chpr2_ |
|
213 |
#define CUBLAS_CHER cublas_cher_ |
|
214 |
#define CUBLAS_CHER2 cublas_cher2_ |
|
215 |
|
|
216 |
// stubs for zblat2 |
|
217 |
#define CUBLAS_ZGBMV cublas_zgbmv_ |
|
218 |
#define CUBLAS_ZHEMV cublas_zhemv_ |
|
219 |
#define CUBLAS_ZHBMV cublas_zhbmv_ |
|
220 |
#define CUBLAS_ZHPMV cublas_zhpmv_ |
|
221 |
#define CUBLAS_ZTRMV cublas_ztrmv_ |
|
222 |
#define CUBLAS_ZTBMV cublas_ztbmv_ |
|
223 |
#define CUBLAS_ZTPMV cublas_ztpmv_ |
|
224 |
#define CUBLAS_ZTRSV cublas_ztrsv_ |
|
225 |
#define CUBLAS_ZTBSV cublas_ztbsv_ |
|
226 |
#define CUBLAS_ZTPSV cublas_ztpsv_ |
|
227 |
#define CUBLAS_ZGERC cublas_zgerc_ |
|
228 |
#define CUBLAS_ZGERU cublas_zgeru_ |
|
229 |
#define CUBLAS_ZHER cublas_zher_ |
|
230 |
#define CUBLAS_ZHPR cublas_zhpr_ |
|
231 |
#define CUBLAS_ZHER2 cublas_zher2_ |
|
232 |
#define CUBLAS_ZHPR2 cublas_zhpr2_ |
|
233 |
|
|
234 |
#elif CUBLAS_FORTRAN_COMPILER==CUBLAS_INTEL_FORTRAN |
|
235 |
|
|
236 |
#define CUBLAS_INIT CUBLAS_INIT |
|
237 |
#define CUBLAS_SHUTDOWN CUBLAS_SHUTDOWN |
|
238 |
#define CUBLAS_ALLOC CUBLAS_ALLOC |
|
239 |
#define CUBLAS_FREE CUBLAS_FREE |
|
240 |
#define CUBLAS_SET_VECTOR CUBLAS_SET_VECTOR |
|
241 |
#define CUBLAS_GET_VECTOR CUBLAS_GET_VECTOR |
|
242 |
#define CUBLAS_SET_MATRIX CUBLAS_SET_MATRIX |
|
243 |
#define CUBLAS_GET_MATRIX CUBLAS_GET_MATRIX |
|
244 |
#define CUBLAS_GET_ERROR CUBLAS_GET_ERROR |
|
245 |
#define CUBLAS_XERBLA CUBLAS_XERBLA |
|
246 |
#define CUBLAS_ISAMAX CUBLAS_ISAMAX |
|
247 |
#define CUBLAS_ISAMIN CUBLAS_ISAMIN |
|
248 |
#define CUBLAS_SASUM CUBLAS_SASUM |
|
249 |
#define CUBLAS_SAXPY CUBLAS_SAXPY |
|
250 |
#define CUBLAS_SCOPY CUBLAS_SCOPY |
|
251 |
#define CUBLAS_SDOT CUBLAS_SDOT |
|
252 |
#define CUBLAS_SNRM2 CUBLAS_SNRM2 |
|
253 |
#define CUBLAS_SROT CUBLAS_SROT |
|
254 |
#define CUBLAS_SROTG CUBLAS_SROTG |
|
255 |
#define CUBLAS_SROTM CUBLAS_SROTM |
|
256 |
#define CUBLAS_SROTMG CUBLAS_SROTMG |
|
257 |
#define CUBLAS_SSCAL CUBLAS_SSCAL |
|
258 |
#define CUBLAS_SSWAP CUBLAS_SSWAP |
|
259 |
#define CUBLAS_CAXPY CUBLAS_CAXPY |
|
260 |
#define CUBLAS_CCOPY CUBLAS_CCOPY |
|
261 |
#define CUBLAS_ZCOPY CUBLAS_ZCOPY |
|
262 |
#define CUBLAS_CROT CUBLAS_CROT |
|
263 |
#define CUBLAS_CROTG CUBLAS_CROTG |
|
264 |
#define CUBLAS_CSCAL CUBLAS_CSCAL |
|
265 |
#define CUBLAS_CSROT CUBLAS_CSROT |
|
266 |
#define CUBLAS_CSSCAL CUBLAS_CSSCAL |
|
267 |
#define CUBLAS_CSWAP CUBLAS_CSWAP |
|
268 |
#define CUBLAS_ZSWAP CUBLAS_ZSWAP |
|
269 |
#define CUBLAS_CTRMV CUBLAS_CTRMV |
|
270 |
#define CUBLAS_CDOTU CUBLAS_CDOTU |
|
271 |
#define CUBLAS_CDOTC CUBLAS_CDOTC |
|
272 |
#define CUBLAS_ICAMAX CUBLAS_ICAMAX |
|
273 |
#define CUBLAS_SCASUM CUBLAS_SCASUM |
|
274 |
#define CUBLAS_SCNRM2 CUBLAS_SCNRM2 |
|
275 |
#define CUBLAS_SGBMV CUBLAS_SGBMV |
|
276 |
#define CUBLAS_SGEMV CUBLAS_SGEMV |
|
277 |
#define CUBLAS_SGER CUBLAS_SGER |
|
278 |
#define CUBLAS_SSBMV CUBLAS_SSBMV |
|
279 |
#define CUBLAS_SSPMV CUBLAS_SSPMV |
|
280 |
#define CUBLAS_SSPR CUBLAS_SSPR |
|
281 |
#define CUBLAS_SSPR2 CUBLAS_SSPR2 |
|
282 |
#define CUBLAS_SSYMV CUBLAS_SSYMV |
|
283 |
#define CUBLAS_SSYR CUBLAS_SSYR |
|
284 |
#define CUBLAS_SSYR2 CUBLAS_SSYR2 |
|
285 |
#define CUBLAS_STBMV CUBLAS_STBMV |
|
286 |
#define CUBLAS_STBSV CUBLAS_STBSV |
|
287 |
#define CUBLAS_STPMV CUBLAS_STPMV |
|
288 |
#define CUBLAS_STPSV CUBLAS_STPSV |
|
289 |
#define CUBLAS_STRMV CUBLAS_STRMV |
|
290 |
#define CUBLAS_STRSV CUBLAS_STRSV |
|
291 |
#define CUBLAS_SGEMM CUBLAS_SGEMM |
|
292 |
#define CUBLAS_SSYMM CUBLAS_SSYMM |
|
293 |
#define CUBLAS_SSYR2K CUBLAS_SSYR2K |
|
294 |
#define CUBLAS_SSYRK CUBLAS_SSYRK |
|
295 |
#define CUBLAS_STRMM CUBLAS_STRMM |
|
296 |
#define CUBLAS_STRSM CUBLAS_STRSM |
|
297 |
#define CUBLAS_CGEMM CUBLAS_CGEMM |
|
298 |
#define CUBLAS_CHEMM CUBLAS_CHEMM |
|
299 |
#define CUBLAS_CSYMM CUBLAS_CSYMM |
|
300 |
#define CUBLAS_CTRMM CUBLAS_CTRMM |
|
301 |
#define CUBLAS_CTRSM CUBLAS_CTRSM |
|
302 |
#define CUBLAS_CHERK CUBLAS_CHERK |
|
303 |
#define CUBLAS_CSYRK CUBLAS_CSYRK |
|
304 |
#define CUBLAS_CHER2K CUBLAS_CHER2K |
|
305 |
#define CUBLAS_CSYR2K CUBLAS_CSYR2K |
|
306 |
#define CUBLAS_IDAMAX CUBLAS_IDAMAX |
|
307 |
#define CUBLAS_IDAMIN CUBLAS_IDAMIN |
|
308 |
#define CUBLAS_DASUM CUBLAS_DASUM |
|
309 |
#define CUBLAS_DAXPY CUBLAS_DAXPY |
|
310 |
#define CUBLAS_DCOPY CUBLAS_DCOPY |
|
311 |
#define CUBLAS_DDOT CUBLAS_DDOT |
|
312 |
#define CUBLAS_DNRM2 CUBLAS_DNRM2 |
|
313 |
#define CUBLAS_DROT CUBLAS_DROT |
|
314 |
#define CUBLAS_DROTG CUBLAS_DROTG |
|
315 |
#define CUBLAS_DROTM CUBLAS_DROTM |
|
316 |
#define CUBLAS_DROTMG CUBLAS_DROTMG |
|
317 |
#define CUBLAS_DSCAL CUBLAS_DSCAL |
|
318 |
#define CUBLAS_DSWAP CUBLAS_DSWAP |
|
319 |
#define CUBLAS_ZAXPY CUBLAS_ZAXPY |
|
320 |
#define CUBLAS_ZCOPY CUBLAS_ZCOPY |
|
321 |
#define CUBLAS_ZROT CUBLAS_ZROT |
|
322 |
#define CUBLAS_ZROTG CUBLAS_ZROTG |
|
323 |
#define CUBLAS_ZSCAL CUBLAS_ZSCAL |
|
324 |
#define CUBLAS_ZDROT CUBLAS_ZDROT |
|
325 |
#define CUBLAS_ZDSCAL CUBLAS_ZDSCAL |
|
326 |
#define CUBLAS_ZSWAP CUBLAS_ZSWAP |
|
327 |
#define CUBLAS_ZDOTU CUBLAS_ZDOTU |
|
328 |
#define CUBLAS_ZDOTC CUBLAS_ZDOTC |
|
329 |
#define CUBLAS_IZAMAX CUBLAS_IZAMAX |
|
330 |
#define CUBLAS_DZASUM CUBLAS_DZASUM |
|
331 |
#define CUBLAS_DZNRM2 CUBLAS_DZNRM2 |
|
332 |
#define CUBLAS_DGBMV CUBLAS_DGBMV |
|
333 |
#define CUBLAS_DGEMV CUBLAS_DGEMV |
|
334 |
#define CUBLAS_ZGEMV CUBLAS_ZGEMV |
|
335 |
#define CUBLAS_DGER CUBLAS_DGER |
|
336 |
#define CUBLAS_DSBMV CUBLAS_DSBMV |
|
337 |
#define CUBLAS_DSPMV CUBLAS_DSPMV |
|
338 |
#define CUBLAS_DSPR CUBLAS_DSPR |
|
339 |
#define CUBLAS_DSPR2 CUBLAS_DSPR2 |
|
340 |
#define CUBLAS_DSYMV CUBLAS_DSYMV |
|
341 |
#define CUBLAS_DSYR CUBLAS_DSYR |
|
342 |
#define CUBLAS_DSYR2 CUBLAS_DSYR2 |
|
343 |
#define CUBLAS_DTBMV CUBLAS_DTBMV |
|
344 |
#define CUBLAS_DTBSV CUBLAS_DTBSV |
|
345 |
#define CUBLAS_DTPMV CUBLAS_DTPMV |
|
346 |
#define CUBLAS_DTPSV CUBLAS_DTPSV |
|
347 |
#define CUBLAS_DTRMV CUBLAS_DTRMV |
|
348 |
#define CUBLAS_DTRSV CUBLAS_DTRSV |
|
349 |
#define CUBLAS_DGEMM CUBLAS_DGEMM |
|
350 |
#define CUBLAS_DSYMM CUBLAS_DSYMM |
|
351 |
#define CUBLAS_DSYR2K CUBLAS_DSYR2K |
|
352 |
#define CUBLAS_DSYRK CUBLAS_DSYRK /* mirrors the cublas_dsyrk_ entry of the g77/g95 table */ |
|
353 |
#define CUBLAS_DTRMM CUBLAS_DTRMM |
|
354 |
#define CUBLAS_DTRSM CUBLAS_DTRSM |
|
355 |
#define CUBLAS_ZGEMM CUBLAS_ZGEMM |
|
356 |
#define CUBLAS_ZHEMM CUBLAS_ZHEMM |
|
357 |
#define CUBLAS_ZSYMM CUBLAS_ZSYMM |
|
358 |
#define CUBLAS_ZTRMM CUBLAS_ZTRMM |
|
359 |
#define CUBLAS_ZTRSM CUBLAS_ZTRSM |
|
360 |
#define CUBLAS_ZHERK CUBLAS_ZHERK |
|
361 |
#define CUBLAS_ZSYRK CUBLAS_ZSYRK |
|
362 |
#define CUBLAS_ZHER2K CUBLAS_ZHER2K |
|
363 |
#define CUBLAS_ZSYR2K CUBLAS_ZSYR2K |
|
364 |
|
|
365 |
#define CUBLAS_CGEMV CUBLAS_CGEMV |
|
366 |
#define CUBLAS_CGBMV CUBLAS_CGBMV |
|
367 |
#define CUBLAS_CHEMV CUBLAS_CHEMV |
|
368 |
#define CUBLAS_CHBMV CUBLAS_CHBMV |
|
369 |
#define CUBLAS_CHPMV CUBLAS_CHPMV |
|
370 |
#define CUBLAS_CTBMV CUBLAS_CTBMV |
|
371 |
#define CUBLAS_CTPMV CUBLAS_CTPMV |
|
372 |
#define CUBLAS_CTRSV CUBLAS_CTRSV |
|
373 |
#define CUBLAS_CTBSV CUBLAS_CTBSV |
|
374 |
#define CUBLAS_CTPSV CUBLAS_CTPSV |
|
375 |
#define CUBLAS_CGERC CUBLAS_CGERC |
|
376 |
#define CUBLAS_CGERU CUBLAS_CGERU |
|
377 |
#define CUBLAS_CHPR CUBLAS_CHPR |
|
378 |
|
|
379 |
|
|
380 |
// stubs for zblat2 |
|
381 |
#define CUBLAS_ZGBMV CUBLAS_ZGBMV |
|
382 |
#define CUBLAS_ZHEMV CUBLAS_ZHEMV |
|
383 |
#define CUBLAS_ZHBMV CUBLAS_ZHBMV |
|
384 |
#define CUBLAS_ZHPMV CUBLAS_ZHPMV |
|
385 |
#define CUBLAS_ZTRMV CUBLAS_ZTRMV |
|
386 |
#define CUBLAS_ZTBMV CUBLAS_ZTBMV |
|
387 |
#define CUBLAS_ZTPMV CUBLAS_ZTPMV |
|
388 |
#define CUBLAS_ZTRSV CUBLAS_ZTRSV |
|
389 |
#define CUBLAS_ZTBSV CUBLAS_ZTBSV |
|
390 |
#define CUBLAS_ZTPSV CUBLAS_ZTPSV |
|
391 |
#define CUBLAS_ZGERC CUBLAS_ZGERC |
|
392 |
#define CUBLAS_ZGERU CUBLAS_ZGERU |
|
393 |
#define CUBLAS_ZHER CUBLAS_ZHER |
|
394 |
#define CUBLAS_ZHPR CUBLAS_ZHPR |
|
395 |
#define CUBLAS_ZHER2 CUBLAS_ZHER2 |
|
396 |
#define CUBLAS_ZHPR2 CUBLAS_ZHPR2 |
|
397 |
|
|
398 |
#else |
|
399 |
#error unsupported Fortran compiler |
|
400 |
#endif |
BLAS/xGEMM/patch_thunking.h (revision 1) | ||
---|---|---|
1 |
41c41 |
|
2 |
< #define CUBLAS_FORTRAN_COMPILER CUBLAS_G95 |
|
3 |
--- |
|
4 |
> #define CUBLAS_FORTRAN_COMPILER CUBLAS_INTEL_FORTRAN |
BLAS/xGEMM/Makefile (revision 1) | ||
---|---|---|
1 |
SOURCE=xGEMM.c |
|
2 |
|
|
3 |
CC=gcc |
|
4 |
CFLAGS=-Wall -O3 |
|
5 |
LDFLAGS=-lm |
|
6 |
CUDADIR=/opt/cuda |
|
7 |
CUDASRC=$(CUDADIR)/src |
|
8 |
THUNKING=fortran_thunking.c |
|
9 |
CUDASRCINC=fortran_common.h |
|
10 |
CUDAINC=$(CUDADIR)/include |
|
11 |
CUDALIB=$(CUDADIR)/lib64 |
|
12 |
|
|
13 |
PATCHTHUNKING=patch_thunking.h |
|
14 |
|
|
15 |
GSLINC=/usr/include/gsl |
|
16 |
|
|
17 |
GOTO2=/opt/GotoBLAS2 |
|
18 |
|
|
19 |
ACML=/opt/acml |
|
20 |
ACMLINC=$(ACML)/gfortran64_mp/include |
|
21 |
ACMLLIB=$(ACML)/gfortran64_mp/lib |
|
22 |
|
|
23 |
|
|
24 |
EXECUTABLE=cblas fblas gsl cublas thunking gotoblas acml |
|
25 |
|
|
26 |
FORMAT=DOUBLE |
|
27 |
#FORMAT=FLOAT |
|
28 |
|
|
29 |
#DIRECTIVES=-D$(FORMAT) -DPRINT -DUNIT |
|
30 |
#DIRECTIVES=-D$(FORMAT) -DUNIT -DRESULTS -DQUIET |
|
31 |
DIRECTIVES=-DUNIT -DQUIET |
|
32 |
|
|
33 |
all: $(EXECUTABLE) |
|
34 |
|
|
35 |
cblas: $(SOURCE) |
|
36 |
|
|
37 |
$(CC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DCBLAS $(LDFLAGS) \ |
|
38 |
$(SOURCE) -lcblas -o $(SOURCE:.c=)_SP_$@ |
|
39 |
|
|
40 |
$(CC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DCBLAS $(LDFLAGS) \ |
|
41 |
$(SOURCE) -lcblas -o $(SOURCE:.c=)_DP_$@ |
|
42 |
|
|
43 |
gotoblas: $(SOURCE) |
|
44 |
|
|
45 |
$(CC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DCBLAS $(LDFLAGS) \ |
|
46 |
$(SOURCE) $(GOTO2)/libgoto2.a -lpthread -o $(SOURCE:.c=)_SP_$@ |
|
47 |
|
|
48 |
$(CC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DCBLAS $(LDFLAGS) \ |
|
49 |
$(SOURCE) $(GOTO2)/libgoto2.a -lpthread -o $(SOURCE:.c=)_DP_$@ |
|
50 |
|
|
51 |
acml: $(SOURCE) |
|
52 |
|
|
53 |
$(CC) -I$(ACMLINC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DACML $(LDFLAGS) \ |
|
54 |
$(SOURCE) -L$(ACMLLIB) -lacml_mp -lacml_mv \ |
|
55 |
-lgomp -lgfortran -lpthread -o $(SOURCE:.c=)_SP_$@ |
|
56 |
|
|
57 |
$(CC) -I$(ACMLINC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DACML $(LDFLAGS) \ |
|
58 |
$(SOURCE) -L$(ACMLLIB) -lacml_mp -lacml_mv \ |
|
59 |
-lgomp -lgfortran -lpthread -o $(SOURCE:.c=)_DP_$@ |
|
60 |
|
|
61 |
fblas: $(SOURCE) |
|
62 |
|
|
63 |
$(CC) $(CFLAGS) $(DIRECTIVES) -DFLOAT -DFBLAS $(LDFLAGS) \ |
|
64 |
$(SOURCE) -lf77blas -o $(SOURCE:.c=)_SP_$@ |
|
65 |
|
|
66 |
$(CC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE -DFBLAS $(LDFLAGS) \ |
|
67 |
$(SOURCE) -lf77blas -o $(SOURCE:.c=)_DP_$@ |
|
68 |
|
|
69 |
|
|
70 |
gsl: $(SOURCE) |
|
71 |
|
|
72 |
$(CC) -I$(GSLINC) $(CFLAGS) $(DIRECTIVES) -DFLOAT \ |
|
73 |
-DGSL $(LDFLAGS) \ |
|
74 |
$(SOURCE) -lgslcblas -o $(SOURCE:.c=)_SP_$@ |
|
75 |
|
|
76 |
$(CC) -I$(GSLINC) $(CFLAGS) $(DIRECTIVES) -DDOUBLE \ |
|
77 |
-DGSL $(LDFLAGS) \ |
|
78 |
$(SOURCE) -lgslcblas -o $(SOURCE:.c=)_DP_$@ |
|
79 |
|
|
80 |
cublas: $(SOURCE) |
|
81 |
|
Formats disponibles : Unified diff