root / BLAS / xGEMM / xGEMM.c @ 146
Historique | Voir | Annoter | Télécharger (21,82 ko)
1 | 1 | equemene | /*
|
---|---|---|---|
2 | 51 | equemene | Performs matrix multiply on several BLAS implementations
|
3 | 51 | equemene | Copyleft Emmanuel QUEMENER <emmanuel.quemener@gmail.com> under GPLv3
|
4 | 1 | equemene | |
5 | 51 | equemene | 2014-03-14 : Add clBLAS implementation
|
6 | 51 | equemene | |
7 | 1 | equemene | Thanks for help from aurel32@debian.org
|
8 | 1 | equemene | */
|
9 | 1 | equemene | |
10 | 1 | equemene | #include <stdio.h> |
11 | 1 | equemene | #include <math.h> |
12 | 1 | equemene | #include <stdlib.h> |
13 | 1 | equemene | #include <sys/time.h> |
14 | 1 | equemene | #include <string.h> |
15 | 1 | equemene | |
16 | 51 | equemene | #ifdef CLBLAS
|
17 | 51 | equemene | #include <clBLAS.h> |
18 | 53 | equemene | /* Precise here to avoid new specific bench function */
|
19 | 53 | equemene | int MyPlatform;
|
20 | 53 | equemene | int MyDevice;
|
21 | 51 | equemene | #elif CUBLAS
|
22 | 1 | equemene | #include <cublas.h> |
23 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_NOERR 0 |
24 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_ALLOC 1 |
25 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_SET 2 |
26 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_GET 3 |
27 | 1 | equemene | #define CUBLAS_WRAPPER_ERROR_STUB 4 |
28 | 1 | equemene | #elif THUNKING
|
29 | 1 | equemene | #include <cublas.h> |
30 | 7 | equemene | #include "fortran_common.h" |
31 | 7 | equemene | #include "fortran_thunking.h" |
32 | 1 | equemene | #elif FBLAS
|
33 | 1 | equemene | #include <cblas_f77.h> |
34 | 1 | equemene | #elif GSL
|
35 | 1 | equemene | #include <gsl_cblas.h> |
36 | 1 | equemene | #elif ACML
|
37 | 1 | equemene | #include <acml.h> |
38 | 1 | equemene | #else
|
39 | 1 | equemene | #include <cblas.h> |
40 | 146 | equemene | #include <blaswrap.h> |
41 | 1 | equemene | #endif
|
42 | 1 | equemene | |
43 | 51 | equemene | #ifdef CLBLAS
|
44 | 51 | equemene | |
45 | 1 | equemene | #ifdef DOUBLE
|
46 | 51 | equemene | #define LENGTH cl_double
|
47 | 51 | equemene | #else
|
48 | 51 | equemene | #define LENGTH cl_float
|
49 | 51 | equemene | #endif
|
50 | 51 | equemene | |
51 | 51 | equemene | #else
|
52 | 51 | equemene | |
53 | 51 | equemene | #ifdef DOUBLE
|
54 | 1 | equemene | #define LENGTH double |
55 | 1 | equemene | #else
|
56 | 1 | equemene | #define LENGTH float |
57 | 1 | equemene | #endif
|
58 | 1 | equemene | |
59 | 51 | equemene | #endif
|
60 | 51 | equemene | |
61 | 7 | equemene | #ifdef FBLAS
|
62 | 1 | equemene | |
63 | 1 | equemene | #ifdef DOUBLE
|
64 | 1 | equemene | |
65 | 1 | equemene | void dgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const double *, const double *, FINT, |
66 | 1 | equemene | const double *, FINT, const double *, double *, FINT); |
67 | 1 | equemene | |
68 | 1 | equemene | #else
|
69 | 1 | equemene | |
70 | 1 | equemene | void sgemm_(FCHAR, FCHAR, FINT, FINT, FINT, const float *, const float *, FINT, |
71 | 1 | equemene | const float *, FINT, const float *, float *, FINT); |
72 | 1 | equemene | |
73 | 1 | equemene | #endif
|
74 | 1 | equemene | #endif
|
75 | 1 | equemene | |
76 | 1 | equemene | /* Matrix with only defined triangular terms */
|
77 | 1 | equemene | /* Even if there are 0 in matrix, must be defined at all ! */
|
78 | 1 | equemene | |
79 | 1 | equemene | /* Get from fortran.c */
|
80 | 1 | equemene | |
81 | 1 | equemene | #ifdef CUBLAS
|
82 | 1 | equemene | static char *errMsg[5] = |
83 | 1 | equemene | { |
84 | 1 | equemene | "no error",
|
85 | 1 | equemene | "allocation error",
|
86 | 1 | equemene | "setVector/setMatrix error",
|
87 | 1 | equemene | "getVector/getMatrix error",
|
88 | 1 | equemene | "not implemented"
|
89 | 1 | equemene | }; |
90 | 1 | equemene | |
91 | 1 | equemene | static void wrapperError (const char *funcName, int error) |
92 | 1 | equemene | { |
93 | 1 | equemene | printf ("cublas%s wrapper: %s\n", funcName, errMsg[error]);
|
94 | 1 | equemene | fflush (stdout); |
95 | 1 | equemene | } |
96 | 1 | equemene | #endif
|
97 | 1 | equemene | |
98 | 1 | equemene | int printVector(const int dimVector,const LENGTH *dataVector, |
99 | 1 | equemene | char *nameVector,char *mesgVector) |
100 | 1 | equemene | { |
101 | 1 | equemene | #ifndef QUIET
|
102 | 1 | equemene | |
103 | 1 | equemene | int i;
|
104 | 1 | equemene | printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
105 | 1 | equemene | for (i=0;i<dimVector;i++) |
106 | 1 | equemene | { |
107 | 1 | equemene | printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
108 | 1 | equemene | } |
109 | 1 | equemene | #endif
|
110 | 1 | equemene | |
111 | 1 | equemene | return 0; |
112 | 1 | equemene | } |
113 | 1 | equemene | |
114 | 1 | equemene | int printResults(const int dimVector,const LENGTH *dataVector, |
115 | 1 | equemene | char *nameVector,char *mesgVector) |
116 | 1 | equemene | { |
117 | 1 | equemene | #ifdef RESULTS
|
118 | 1 | equemene | int i;
|
119 | 1 | equemene | |
120 | 1 | equemene | printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
121 | 1 | equemene | for (i=0;i<dimVector;i++) |
122 | 1 | equemene | { |
123 | 1 | equemene | printf("%s[%i]=%2.10e\n",nameVector,i,dataVector[i]);
|
124 | 1 | equemene | } |
125 | 1 | equemene | #endif
|
126 | 1 | equemene | return 0; |
127 | 1 | equemene | } |
128 | 1 | equemene | |
129 | 1 | equemene | #ifdef CUBLAS
|
130 | 1 | equemene | int printVectorGPU(const int dimVector,const LENGTH *dataVector, |
131 | 1 | equemene | char *nameVector,char *mesgVector) |
132 | 1 | equemene | { |
133 | 1 | equemene | #ifndef QUIET
|
134 | 1 | equemene | int i;
|
135 | 1 | equemene | cublasStatus stat; |
136 | 1 | equemene | LENGTH *P=0;
|
137 | 1 | equemene | int incx=1; |
138 | 1 | equemene | |
139 | 1 | equemene | P=malloc(dimVector*sizeof(LENGTH));
|
140 | 1 | equemene | |
141 | 1 | equemene | stat=cublasGetVector(dimVector,sizeof(P[0]),dataVector,incx,P,incx); |
142 | 1 | equemene | |
143 | 1 | equemene | if (stat != CUBLAS_STATUS_SUCCESS) {
|
144 | 1 | equemene | wrapperError ("ToGet", CUBLAS_WRAPPER_ERROR_GET);
|
145 | 1 | equemene | } |
146 | 1 | equemene | |
147 | 1 | equemene | printf("\n%s of %s, size %i:\n",mesgVector,nameVector,dimVector);
|
148 | 1 | equemene | for (i=0;i<dimVector;i++) |
149 | 1 | equemene | { |
150 | 1 | equemene | printf("%s[%i]=%2.10e\n",nameVector,i,P[i]);
|
151 | 1 | equemene | } |
152 | 1 | equemene | |
153 | 1 | equemene | free(P); |
154 | 1 | equemene | #endif
|
155 | 1 | equemene | |
156 | 1 | equemene | return 0; |
157 | 1 | equemene | } |
158 | 1 | equemene | #endif
|
159 | 1 | equemene | |
160 | 1 | equemene | int bench(int dim,int RUNS) |
161 | 1 | equemene | { |
162 | 1 | equemene | /*
|
163 | 1 | equemene | int dim=1000;
|
164 | 1 | equemene | int RUNS=100;
|
165 | 1 | equemene | int incx=1;
|
166 | 1 | equemene | */
|
167 | 1 | equemene | #ifdef PRINT
|
168 | 1 | equemene | LENGTH factor=1.;
|
169 | 1 | equemene | #endif
|
170 | 1 | equemene | |
171 | 1 | equemene | LENGTH alpha=1.,beta=0.; |
172 | 1 | equemene | LENGTH *A,*B,*C,*D; |
173 | 1 | equemene | |
174 | 1 | equemene | /* checkBefore checkAfter checks */
|
175 | 1 | equemene | LENGTH *checksA,*checksB; |
176 | 1 | equemene | |
177 | 1 | equemene | int i=0, j=0; |
178 | 1 | equemene | |
179 | 1 | equemene | double duration;
|
180 | 1 | equemene | |
181 | 1 | equemene | struct timeval tv1,tv2;
|
182 | 1 | equemene | struct timezone tz;
|
183 | 1 | equemene | |
184 | 1 | equemene | /* Create 4 Matrix of dimension dim by dim */
|
185 | 1 | equemene | |
186 | 1 | equemene | A=malloc(dim*dim*sizeof(LENGTH));
|
187 | 1 | equemene | B=malloc(dim*dim*sizeof(LENGTH));
|
188 | 1 | equemene | C=malloc(dim*dim*sizeof(LENGTH));
|
189 | 1 | equemene | D=malloc(dim*dim*sizeof(LENGTH));
|
190 | 1 | equemene | |
191 | 1 | equemene | /* Create 2 vectors for checker Before and After */
|
192 | 1 | equemene | |
193 | 1 | equemene | checksA=malloc(RUNS*sizeof(LENGTH));
|
194 | 1 | equemene | checksB=malloc(RUNS*sizeof(LENGTH));
|
195 | 1 | equemene | |
196 | 1 | equemene | /* Initialize elements with random numbers */
|
197 | 1 | equemene | /* Initialize the seed for rand() */
|
198 | 1 | equemene | /* srand(time()); */
|
199 | 1 | equemene | |
200 | 1 | equemene | for (i=0; i<dim; i++) { |
201 | 1 | equemene | for (j=0; j<dim; j++) { |
202 | 1 | equemene | A[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
203 | 1 | equemene | *(LENGTH)(i+1.)/(LENGTH)(j+1.); |
204 | 1 | equemene | B[i*dim+j]=(LENGTH)rand()/(RAND_MAX+1.)
|
205 | 1 | equemene | *(LENGTH)(i+1.)/(LENGTH)(j+1.); |
206 | 1 | equemene | C[i*dim+j]=0.;
|
207 | 1 | equemene | D[i*dim+j]=0.;
|
208 | 1 | equemene | } |
209 | 1 | equemene | } |
210 | 1 | equemene | /*
|
211 | 1 | equemene | A[0]=1;
|
212 | 1 | equemene | A[1]=2;
|
213 | 1 | equemene | A[2]=3;
|
214 | 1 | equemene | A[3]=4;
|
215 | 1 | equemene | |
216 | 1 | equemene | B[0]=5;
|
217 | 1 | equemene | B[1]=6;
|
218 | 1 | equemene | B[2]=7;
|
219 | 1 | equemene | B[3]=8;
|
220 | 1 | equemene | */
|
221 | 1 | equemene | |
222 | 1 | equemene | /* Print the matrix */
|
223 | 1 | equemene | |
224 | 1 | equemene | #ifdef QUIET
|
225 | 1 | equemene | #else
|
226 | 1 | equemene | for (i=0; i<dim; i++) { |
227 | 1 | equemene | for (j=0; j<dim; j++) printf("A[%i,%i]=%1.5f ", i,j,A[i*dim+j]); |
228 | 1 | equemene | putchar('\n');
|
229 | 1 | equemene | } |
230 | 1 | equemene | putchar('\n');
|
231 | 1 | equemene | for (i=0; i<dim; i++) { |
232 | 1 | equemene | for (j=0; j<dim; j++) printf("B[%i,%i]=%1.5f ", i,j,B[i*dim+j]); |
233 | 1 | equemene | putchar('\n');
|
234 | 1 | equemene | } |
235 | 1 | equemene | putchar('\n');
|
236 | 1 | equemene | #endif
|
237 | 1 | equemene | |
238 | 1 | equemene | /* Get first timer before launching */
|
239 | 1 | equemene | gettimeofday(&tv1, &tz); |
240 | 1 | equemene | |
241 | 51 | equemene | /* Compute with CLBLAS library */
|
242 | 51 | equemene | #ifdef CLBLAS
|
243 | 51 | equemene | |
244 | 53 | equemene | cl_uint platformCount; |
245 | 53 | equemene | cl_platform_id* platforms; |
246 | 53 | equemene | cl_uint deviceCount; |
247 | 53 | equemene | cl_device_id* devices; |
248 | 53 | equemene | |
249 | 51 | equemene | cl_int err,errA,errB,errC,errD; |
250 | 51 | equemene | cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; |
251 | 51 | equemene | cl_context ctx = 0;
|
252 | 51 | equemene | cl_command_queue queue = 0;
|
253 | 51 | equemene | cl_mem bufA, bufB, bufC, bufD; |
254 | 51 | equemene | cl_event event = NULL;
|
255 | 51 | equemene | |
256 | 53 | equemene | char* value;
|
257 | 53 | equemene | size_t valueSize; |
258 | 53 | equemene | |
259 | 53 | equemene | // tv3 Put on Device: Allocate & Write buffer
|
260 | 53 | equemene | // tv4 Compute
|
261 | 51 | equemene | struct timeval tv3,tv4;
|
262 | 51 | equemene | |
263 | 53 | equemene | printf("Using CLBLAS: %i iterations for %ix%i matrix on (%d,%d)\n",
|
264 | 53 | equemene | RUNS,dim,dim,MyPlatform,MyDevice); |
265 | 51 | equemene | |
266 | 51 | equemene | /* Setup OpenCL environment. */
|
267 | 53 | equemene | /* - get all platforms and select MyPlatform */
|
268 | 53 | equemene | /* - get all devices from MyPlatform and select MyDevice */
|
269 | 51 | equemene | |
270 | 53 | equemene | // Get all platforms
|
271 | 53 | equemene | err = clGetPlatformIDs(0, NULL, &platformCount); |
272 | 53 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
273 | 53 | equemene | err = clGetPlatformIDs(platformCount, platforms, NULL);
|
274 | 53 | equemene | |
275 | 53 | equemene | // Get Device defined
|
276 | 53 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
277 | 53 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
278 | 53 | equemene | err = clGetDeviceIDs(platforms[MyPlatform], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
279 | 53 | equemene | |
280 | 53 | equemene | // print device name
|
281 | 53 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, 0, NULL, &valueSize); |
282 | 53 | equemene | value = (char*) malloc(valueSize);
|
283 | 53 | equemene | err = clGetDeviceInfo(devices[MyDevice], CL_DEVICE_NAME, valueSize, value, NULL);
|
284 | 53 | equemene | printf("Device (%d,%d): %s\n",MyPlatform,MyDevice, value);
|
285 | 53 | equemene | free(value); |
286 | 53 | equemene | |
287 | 53 | equemene | props[1] = (cl_context_properties)platforms[MyPlatform];
|
288 | 53 | equemene | |
289 | 53 | equemene | /* Initialize Context */
|
290 | 53 | equemene | ctx = clCreateContext( props, 1, &devices[MyDevice], NULL, NULL, &err ); |
291 | 53 | equemene | queue = clCreateCommandQueue( ctx, devices[MyDevice], 0, &err );
|
292 | 53 | equemene | |
293 | 51 | equemene | /* Setup clBLAS */
|
294 | 51 | equemene | err = clblasSetup( ); |
295 | 51 | equemene | |
296 | 51 | equemene | /* Prepare OpenCL memory objects and place matrices inside them. */
|
297 | 52 | equemene | bufA = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*A),NULL,&errA ); |
298 | 52 | equemene | bufB = clCreateBuffer(ctx,CL_MEM_READ_ONLY,dim*dim*sizeof(*B),NULL,&errB ); |
299 | 52 | equemene | bufC = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*C),NULL,&errC ); |
300 | 52 | equemene | bufD = clCreateBuffer(ctx,CL_MEM_READ_WRITE,dim*dim*sizeof(*D),NULL,&errD ); |
301 | 51 | equemene | |
302 | 52 | equemene | errA = clEnqueueWriteBuffer( queue,bufA,CL_TRUE,0,
|
303 | 52 | equemene | dim*dim*sizeof(*A),A,0,NULL,NULL ); |
304 | 52 | equemene | errB = clEnqueueWriteBuffer( queue, bufB, CL_TRUE,0,
|
305 | 52 | equemene | dim*dim*sizeof(*B),B,0,NULL,NULL ); |
306 | 52 | equemene | errC = clEnqueueWriteBuffer( queue, bufC, CL_TRUE,0,
|
307 | 54 | equemene | dim*dim*sizeof(*C),C,0,NULL,NULL ); |
308 | 52 | equemene | errD = clEnqueueWriteBuffer( queue, bufD, CL_TRUE,0,
|
309 | 54 | equemene | dim*dim*sizeof(*D),D,0,NULL,NULL ); |
310 | 51 | equemene | |
311 | 51 | equemene | /* Get third timer after memory operation */
|
312 | 51 | equemene | gettimeofday(&tv3, &tz); |
313 | 51 | equemene | |
314 | 51 | equemene | #ifdef DOUBLE
|
315 | 51 | equemene | |
316 | 51 | equemene | for (i=0;i<RUNS;i++) |
317 | 51 | equemene | { |
318 | 52 | equemene | err = clblasDgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
319 | 52 | equemene | dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
320 | 52 | equemene | bufC,0,dim,1,&queue,0,NULL,&event ); |
321 | 51 | equemene | |
322 | 52 | equemene | err = clblasDgemm( clblasRowMajor,clblasTrans,clblasTrans, |
323 | 52 | equemene | dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
324 | 52 | equemene | bufD,0,dim,1,&queue,0,NULL,&event ); |
325 | 51 | equemene | |
326 | 51 | equemene | } |
327 | 51 | equemene | |
328 | 53 | equemene | if (err != CL_SUCCESS) {
|
329 | 53 | equemene | printf("clblasDgemm() failed with %d\n", err);
|
330 | 53 | equemene | } |
331 | 53 | equemene | |
332 | 51 | equemene | #else
|
333 | 51 | equemene | |
334 | 51 | equemene | for (i=0;i<RUNS;i++) |
335 | 51 | equemene | { |
336 | 51 | equemene | |
337 | 52 | equemene | err = clblasSgemm( clblasRowMajor,clblasNoTrans,clblasNoTrans, |
338 | 52 | equemene | dim,dim,dim,alpha,bufA,0,dim,bufB,0,dim,beta, |
339 | 52 | equemene | bufC,0,dim,1,&queue,0,NULL,&event ); |
340 | 51 | equemene | |
341 | 52 | equemene | err = clblasSgemm( clblasRowMajor,clblasTrans,clblasTrans, |
342 | 52 | equemene | dim,dim,dim,alpha,bufB,0,dim,bufA,0,dim,beta, |
343 | 52 | equemene | bufD,0,dim,1,&queue,0,NULL,&event ); |
344 | 51 | equemene | } |
345 | 51 | equemene | |
346 | 53 | equemene | if (err != CL_SUCCESS) {
|
347 | 53 | equemene | printf("clblasSgemm() failed with %d\n", err);
|
348 | 53 | equemene | } |
349 | 53 | equemene | |
350 | 51 | equemene | #endif
|
351 | 51 | equemene | |
352 | 51 | equemene | /* Wait for calculations to be finished. */
|
353 | 51 | equemene | err = clWaitForEvents( 1, &event );
|
354 | 51 | equemene | |
355 | 53 | equemene | /* Get fourth timer after memory free */
|
356 | 53 | equemene | gettimeofday(&tv4, &tz); |
357 | 53 | equemene | |
358 | 51 | equemene | /* Fetch results of calculations from GPU memory. */
|
359 | 53 | equemene | errC = clEnqueueReadBuffer( queue,bufC,CL_TRUE,0,dim*dim * sizeof(*C), |
360 | 52 | equemene | C,0,NULL,NULL ); |
361 | 51 | equemene | |
362 | 51 | equemene | /* Fetch results of calculations from GPU memory. */
|
363 | 53 | equemene | errD = clEnqueueReadBuffer( queue,bufD,CL_TRUE,0,dim*dim*sizeof(*D), |
364 | 52 | equemene | D,0,NULL,NULL ); |
365 | 51 | equemene | |
366 | 51 | equemene | /* Release OpenCL memory objects. */
|
367 | 51 | equemene | clReleaseMemObject( bufD ); |
368 | 51 | equemene | clReleaseMemObject( bufC ); |
369 | 51 | equemene | clReleaseMemObject( bufB ); |
370 | 51 | equemene | clReleaseMemObject( bufA ); |
371 | 51 | equemene | |
372 | 51 | equemene | /* Finalize work with clBLAS */
|
373 | 51 | equemene | clblasTeardown( ); |
374 | 51 | equemene | |
375 | 51 | equemene | /* Release OpenCL working objects. */
|
376 | 51 | equemene | clReleaseCommandQueue( queue ); |
377 | 51 | equemene | clReleaseContext( ctx ); |
378 | 51 | equemene | |
379 | 51 | equemene | |
380 | 1 | equemene | /* Compute with CuBLAS library */
|
381 | 51 | equemene | #elif CUBLAS
|
382 | 1 | equemene | LENGTH *devPtrA=0, *devPtrB=0, *devPtrC=0, *devPtrD=0; |
383 | 1 | equemene | cublasStatus stat1, stat2, stat3, stat4; |
384 | 1 | equemene | struct timeval tv3,tv4;
|
385 | 1 | equemene | |
386 | 1 | equemene | /* Order is Row */
|
387 | 1 | equemene | /* Have to swap uplo and trans */
|
388 | 1 | equemene | char transa='N',transb='T'; |
389 | 1 | equemene | |
390 | 1 | equemene | printf("Using CuBLAS: %i iterations for %ix%i matrix\n",
|
391 | 1 | equemene | RUNS,dim,dim); |
392 | 1 | equemene | |
393 | 1 | equemene | stat1=cublasAlloc(dim*dim,sizeof(devPtrA[0]),(void**)&devPtrA); |
394 | 1 | equemene | stat2=cublasAlloc(dim*dim,sizeof(devPtrB[0]),(void**)&devPtrB); |
395 | 1 | equemene | stat3=cublasAlloc(dim*dim,sizeof(devPtrC[0]),(void**)&devPtrC); |
396 | 1 | equemene | stat4=cublasAlloc(dim*dim,sizeof(devPtrD[0]),(void**)&devPtrD); |
397 | 1 | equemene | |
398 | 1 | equemene | if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
399 | 1 | equemene | (stat2 != CUBLAS_STATUS_SUCCESS) || |
400 | 1 | equemene | (stat3 != CUBLAS_STATUS_SUCCESS) || |
401 | 1 | equemene | (stat4 != CUBLAS_STATUS_SUCCESS) ) { |
402 | 5 | equemene | wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_ALLOC);
|
403 | 1 | equemene | cublasFree (devPtrA); |
404 | 1 | equemene | cublasFree (devPtrB); |
405 | 1 | equemene | cublasFree (devPtrC); |
406 | 1 | equemene | cublasFree (devPtrD); |
407 | 1 | equemene | return 1; |
408 | 1 | equemene | } |
409 | 1 | equemene | |
410 | 1 | equemene | stat1=cublasSetMatrix(dim,dim,sizeof(A[0]),A,dim,devPtrA,dim); |
411 | 1 | equemene | stat2=cublasSetMatrix(dim,dim,sizeof(B[0]),B,dim,devPtrB,dim); |
412 | 1 | equemene | stat3=cublasSetMatrix(dim,dim,sizeof(C[0]),C,dim,devPtrC,dim); |
413 | 1 | equemene | stat4=cublasSetMatrix(dim,dim,sizeof(D[0]),D,dim,devPtrD,dim); |
414 | 1 | equemene | |
415 | 1 | equemene | if ((stat1 != CUBLAS_STATUS_SUCCESS) ||
|
416 | 1 | equemene | (stat2 != CUBLAS_STATUS_SUCCESS) || |
417 | 1 | equemene | (stat3 != CUBLAS_STATUS_SUCCESS) || |
418 | 1 | equemene | (stat4 != CUBLAS_STATUS_SUCCESS) ) { |
419 | 5 | equemene | wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_SET);
|
420 | 1 | equemene | cublasFree (devPtrA); |
421 | 1 | equemene | cublasFree (devPtrB); |
422 | 1 | equemene | cublasFree (devPtrC); |
423 | 1 | equemene | cublasFree (devPtrD); |
424 | 1 | equemene | return 1; |
425 | 1 | equemene | } |
426 | 1 | equemene | |
427 | 1 | equemene | /* Get third timer after memory operation */
|
428 | 1 | equemene | gettimeofday(&tv3, &tz); |
429 | 1 | equemene | |
430 | 1 | equemene | #ifdef DOUBLE
|
431 | 1 | equemene | |
432 | 1 | equemene | for (i=0;i<RUNS;i++) |
433 | 1 | equemene | { |
434 | 1 | equemene | cublasDgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
435 | 1 | equemene | devPtrA,dim,beta,devPtrC,dim); |
436 | 1 | equemene | cublasDgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
437 | 1 | equemene | devPtrB,dim,beta,devPtrD,dim); |
438 | 1 | equemene | } |
439 | 1 | equemene | |
440 | 1 | equemene | #else
|
441 | 1 | equemene | |
442 | 1 | equemene | for (i=0;i<RUNS;i++) |
443 | 1 | equemene | { |
444 | 1 | equemene | cublasSgemm(transa,transa,dim,dim,dim,alpha,devPtrB,dim, |
445 | 1 | equemene | devPtrA,dim,beta,devPtrC,dim); |
446 | 1 | equemene | cublasSgemm(transb,transb,dim,dim,dim,alpha,devPtrA,dim, |
447 | 1 | equemene | devPtrB,dim,beta,devPtrD,dim); |
448 | 1 | equemene | } |
449 | 1 | equemene | |
450 | 1 | equemene | #endif
|
451 | 1 | equemene | |
452 | 1 | equemene | stat3=cublasGetMatrix(dim,dim,sizeof(C[0]),devPtrC,dim,C,dim); |
453 | 1 | equemene | stat4=cublasGetMatrix(dim,dim,sizeof(D[0]),devPtrD,dim,D,dim); |
454 | 1 | equemene | |
455 | 53 | equemene | /* Get fourth timer before memory free */
|
456 | 53 | equemene | gettimeofday(&tv4, &tz); |
457 | 53 | equemene | |
458 | 1 | equemene | cublasFree (devPtrA); |
459 | 1 | equemene | cublasFree (devPtrB); |
460 | 1 | equemene | cublasFree (devPtrC); |
461 | 1 | equemene | cublasFree (devPtrD); |
462 | 1 | equemene | |
463 | 1 | equemene | if ((stat1 != CUBLAS_STATUS_SUCCESS) ) {
|
464 | 5 | equemene | wrapperError ("xGEMM", CUBLAS_WRAPPER_ERROR_GET);
|
465 | 1 | equemene | } |
466 | 1 | equemene | |
467 | 1 | equemene | |
468 | 1 | equemene | #elif THUNKING
|
469 | 1 | equemene | |
470 | 1 | equemene | /* Order is Row : Have to swap uplo='U' and trans='N' */
|
471 | 1 | equemene | char transa='N',transb='T'; |
472 | 1 | equemene | printf("Using CuBLAS/Thunking: %i iterations for %ix%i matrix\n",
|
473 | 1 | equemene | RUNS,dim,dim); |
474 | 1 | equemene | |
475 | 1 | equemene | #ifdef DOUBLE
|
476 | 1 | equemene | |
477 | 1 | equemene | for (i=0;i<RUNS;i++) |
478 | 1 | equemene | { |
479 | 1 | equemene | CUBLAS_DGEMM(&transa,&transa, |
480 | 7 | equemene | &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
481 | 1 | equemene | CUBLAS_DGEMM(&transb,&transb, |
482 | 7 | equemene | &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
483 | 1 | equemene | } |
484 | 1 | equemene | |
485 | 1 | equemene | #else
|
486 | 1 | equemene | |
487 | 1 | equemene | for (i=0;i<RUNS;i++) |
488 | 1 | equemene | { |
489 | 1 | equemene | CUBLAS_SGEMM(&transa,&transa, |
490 | 7 | equemene | &dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
491 | 1 | equemene | CUBLAS_SGEMM(&transb,&transb, |
492 | 7 | equemene | &dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
493 | 1 | equemene | } |
494 | 1 | equemene | |
495 | 1 | equemene | #endif
|
496 | 1 | equemene | |
497 | 1 | equemene | #elif FBLAS
|
498 | 1 | equemene | |
499 | 1 | equemene | /* Order is Row : Have to swap uplo='U' and trans='N' */
|
500 | 1 | equemene | char transa='N',transb='T'; |
501 | 1 | equemene | |
502 | 1 | equemene | printf("Using FBLAS: %i iterations for %ix%i matrix\n",
|
503 | 1 | equemene | RUNS,dim,dim); |
504 | 1 | equemene | |
505 | 1 | equemene | #ifdef DOUBLE
|
506 | 1 | equemene | |
507 | 1 | equemene | for (i=0;i<RUNS;i++) |
508 | 1 | equemene | { |
509 | 1 | equemene | dgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
510 | 1 | equemene | dgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
511 | 1 | equemene | } |
512 | 1 | equemene | |
513 | 1 | equemene | #else
|
514 | 1 | equemene | |
515 | 1 | equemene | for (i=0;i<RUNS;i++) |
516 | 1 | equemene | { |
517 | 1 | equemene | sgemm_(&transa,&transa,&dim,&dim,&dim,&alpha,B,&dim,A,&dim,&beta,C,&dim); |
518 | 1 | equemene | sgemm_(&transb,&transb,&dim,&dim,&dim,&alpha,A,&dim,B,&dim,&beta,D,&dim); |
519 | 1 | equemene | } |
520 | 1 | equemene | |
521 | 1 | equemene | #endif
|
522 | 1 | equemene | |
523 | 1 | equemene | #elif ACML
|
524 | 1 | equemene | |
525 | 1 | equemene | /* Order is Row : Have to swap uplo='U' and trans='N' */
|
526 | 1 | equemene | char transa='N',transb='T'; |
527 | 1 | equemene | |
528 | 1 | equemene | printf("Using ACML: %i iterations for %ix%i matrix\n",
|
529 | 1 | equemene | RUNS,dim,dim); |
530 | 1 | equemene | |
531 | 1 | equemene | #ifdef DOUBLE
|
532 | 1 | equemene | |
533 | 1 | equemene | for (i=0;i<RUNS;i++) |
534 | 1 | equemene | { |
535 | 1 | equemene | dgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
536 | 1 | equemene | dgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
537 | 1 | equemene | } |
538 | 1 | equemene | |
539 | 1 | equemene | #else
|
540 | 1 | equemene | |
541 | 1 | equemene | for (i=0;i<RUNS;i++) |
542 | 1 | equemene | { |
543 | 1 | equemene | sgemm(transa,transa,dim,dim,dim,alpha,B,dim,A,dim,beta,C,dim); |
544 | 1 | equemene | sgemm(transb,transb,dim,dim,dim,alpha,A,dim,B,dim,beta,D,dim); |
545 | 1 | equemene | } |
546 | 1 | equemene | |
547 | 1 | equemene | #endif
|
548 | 1 | equemene | |
549 | 1 | equemene | #elif GSL
|
550 | 1 | equemene | |
551 | 1 | equemene | printf("Using GSL: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
552 | 1 | equemene | |
553 | 1 | equemene | /*
|
554 | 1 | equemene | RowMajor : Matrix is read row by row
|
555 | 1 | equemene | Upper : the no null elements are on top
|
556 | 1 | equemene | NoTrans : no transposition before estimation
|
557 | 1 | equemene | NonUnit : Matrix is not unit
|
558 | 1 | equemene | */
|
559 | 1 | equemene | |
560 | 1 | equemene | #ifdef DOUBLE
|
561 | 1 | equemene | |
562 | 1 | equemene | for (i=0;i<RUNS;i++) |
563 | 1 | equemene | { |
564 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
565 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
566 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
567 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
568 | 1 | equemene | } |
569 | 1 | equemene | |
570 | 1 | equemene | #else
|
571 | 1 | equemene | |
572 | 1 | equemene | for (i=0;i<RUNS;i++) |
573 | 1 | equemene | { |
574 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
575 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
576 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
577 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
578 | 1 | equemene | } |
579 | 1 | equemene | |
580 | 1 | equemene | #endif
|
581 | 1 | equemene | |
582 | 1 | equemene | #else
|
583 | 1 | equemene | |
584 | 1 | equemene | printf("Using CBLAS: %i iterations for %ix%i matrix\n",RUNS,dim,dim);
|
585 | 1 | equemene | |
586 | 1 | equemene | /*
|
587 | 1 | equemene | RowMajor : Matrix is read row bu row
|
588 | 1 | equemene | Upper : the no null elements are on top
|
589 | 1 | equemene | NoTrans : no transposition before estimation
|
590 | 1 | equemene | NonUnit : Matrix is not unit
|
591 | 1 | equemene | */
|
592 | 1 | equemene | |
593 | 1 | equemene | #ifdef DOUBLE
|
594 | 1 | equemene | |
595 | 1 | equemene | for (i=0;i<RUNS;i++) |
596 | 1 | equemene | { |
597 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
598 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
599 | 1 | equemene | cblas_dgemm(CblasRowMajor,CblasTrans,CblasTrans, |
600 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
601 | 146 | equemene | /* dgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, */
|
602 | 146 | equemene | /* dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); */
|
603 | 146 | equemene | /* dgemm(CblasRowMajor,CblasTrans,CblasTrans, */
|
604 | 146 | equemene | /* dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); */
|
605 | 1 | equemene | } |
606 | 1 | equemene | |
607 | 1 | equemene | #else
|
608 | 1 | equemene | |
609 | 1 | equemene | for (i=0;i<RUNS;i++) |
610 | 1 | equemene | { |
611 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, |
612 | 1 | equemene | dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); |
613 | 1 | equemene | cblas_sgemm(CblasRowMajor,CblasTrans,CblasTrans, |
614 | 1 | equemene | dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); |
615 | 146 | equemene | /* sgemm(CblasRowMajor,CblasNoTrans,CblasNoTrans, */
|
616 | 146 | equemene | /* dim,dim,dim,alpha,A,dim,B,dim,beta,C,dim); */
|
617 | 146 | equemene | /* sgemm(CblasRowMajor,CblasTrans,CblasTrans, */
|
618 | 146 | equemene | /* dim,dim,dim,alpha,B,dim,A,dim,beta,D,dim); */
|
619 | 1 | equemene | } |
620 | 1 | equemene | |
621 | 1 | equemene | #endif
|
622 | 1 | equemene | |
623 | 1 | equemene | #endif
|
624 | 1 | equemene | |
625 | 1 | equemene | /* Get second timer after launching */
|
626 | 1 | equemene | gettimeofday(&tv2, &tz); |
627 | 1 | equemene | |
628 | 1 | equemene | /* Store the checker of errors */
|
629 | 1 | equemene | checksA[0]=0.; |
630 | 1 | equemene | |
631 | 1 | equemene | for (i=0; i<dim; i++) { |
632 | 1 | equemene | for (j=0; j<dim; j++) { |
633 | 1 | equemene | checksA[0]=checksA[0]+fabs(D[i*dim+j]-C[j*dim+i]); |
634 | 1 | equemene | } |
635 | 1 | equemene | } |
636 | 1 | equemene | |
637 | 1 | equemene | /* Print the matrix */
|
638 | 1 | equemene | |
639 | 1 | equemene | #ifdef QUIET
|
640 | 1 | equemene | #else
|
641 | 1 | equemene | for (i=0; i<dim; i++) { |
642 | 1 | equemene | for (j=0; j<dim; j++) printf("C[%i,%i]=%1.5f ", i,j,C[i*dim+j]); |
643 | 1 | equemene | putchar('\n');
|
644 | 1 | equemene | } |
645 | 1 | equemene | putchar('\n');
|
646 | 1 | equemene | for (i=0; i<dim; i++) { |
647 | 1 | equemene | for (j=0; j<dim; j++) printf("D[%i,%i]=%1.5f ", i,j,D[i*dim+j]); |
648 | 1 | equemene | putchar('\n');
|
649 | 1 | equemene | } |
650 | 1 | equemene | putchar('\n');
|
651 | 1 | equemene | #endif
|
652 | 1 | equemene | |
653 | 1 | equemene | /* Free 1 Matrix and 2 Vectors of dimension dim */
|
654 | 1 | equemene | |
655 | 1 | equemene | free(A); |
656 | 1 | equemene | free(B); |
657 | 1 | equemene | free(C); |
658 | 1 | equemene | free(D); |
659 | 1 | equemene | |
660 | 1 | equemene | putchar('\n');
|
661 | 1 | equemene | |
662 | 53 | equemene | #ifdef CLBLAS
|
663 | 1 | equemene | double memoryIn,memoryOut;
|
664 | 1 | equemene | |
665 | 1 | equemene | memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
666 | 1 | equemene | (tv3.tv_usec-tv1.tv_usec))/1000000.; |
667 | 1 | equemene | |
668 | 1 | equemene | memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
669 | 1 | equemene | (tv2.tv_usec-tv4.tv_usec))/1000000.; |
670 | 1 | equemene | |
671 | 1 | equemene | duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
672 | 1 | equemene | (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
673 | 1 | equemene | |
674 | 1 | equemene | printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
675 | 1 | equemene | printf("Duration of memory free : %2.10f s\n",memoryOut);
|
676 | 53 | equemene | #elif CUBLAS
|
677 | 53 | equemene | double memoryIn,memoryOut;
|
678 | 53 | equemene | |
679 | 53 | equemene | memoryIn=(double)((tv3.tv_sec-tv1.tv_sec) * 1000000L + \ |
680 | 53 | equemene | (tv3.tv_usec-tv1.tv_usec))/1000000.; |
681 | 53 | equemene | |
682 | 53 | equemene | memoryOut=(double)((tv2.tv_sec-tv4.tv_sec) * 1000000L + \ |
683 | 53 | equemene | (tv2.tv_usec-tv4.tv_usec))/1000000.; |
684 | 53 | equemene | |
685 | 53 | equemene | duration=(double)((tv4.tv_sec-tv3.tv_sec) * 1000000L + \ |
686 | 53 | equemene | (tv4.tv_usec-tv3.tv_usec))/1000000./RUNS; |
687 | 53 | equemene | |
688 | 53 | equemene | printf("Duration of memory allocation : %2.10f s\n",memoryIn);
|
689 | 53 | equemene | printf("Duration of memory free : %2.10f s\n",memoryOut);
|
690 | 1 | equemene | #else
|
691 | 1 | equemene | duration=(double)((tv2.tv_sec-tv1.tv_sec) * 1000000L + \ |
692 | 1 | equemene | (tv2.tv_usec-tv1.tv_usec))/1000000./RUNS; |
693 | 1 | equemene | |
694 | 1 | equemene | #endif
|
695 | 1 | equemene | |
696 | 1 | equemene | printf("Duration of each cycle : %2.10f s\n",duration);
|
697 | 1 | equemene | |
698 | 1 | equemene | printf("Number of GFlops : %2.3f \n",
|
699 | 1 | equemene | dim*dim*2.*(2.*dim-1)/duration/1000000000.); |
700 | 1 | equemene | |
701 | 1 | equemene | printf("Error %1.10f\n",checksA[0]); |
702 | 1 | equemene | printResults(RUNS,checksA,"C","Errors cumulated"); |
703 | 1 | equemene | |
704 | 1 | equemene | putchar('\n');
|
705 | 1 | equemene | |
706 | 1 | equemene | /* Free 2 vectors for checker Before and After */
|
707 | 1 | equemene | |
708 | 1 | equemene | free(checksA); |
709 | 1 | equemene | free(checksB); |
710 | 1 | equemene | |
711 | 1 | equemene | return 0; |
712 | 1 | equemene | } |
713 | 1 | equemene | |
714 | 53 | equemene | #ifdef CLBLAS
|
715 | 53 | equemene | |
716 | 53 | equemene | int DelectOpenCLDevices()
|
717 | 53 | equemene | { |
718 | 53 | equemene | /* */
|
719 | 53 | equemene | /* Not needed to import CL.h, already done in CLBLAS.h */
|
720 | 53 | equemene | |
721 | 53 | equemene | int i, j;
|
722 | 53 | equemene | char* value;
|
723 | 53 | equemene | size_t valueSize; |
724 | 53 | equemene | cl_uint platformCount; |
725 | 53 | equemene | cl_platform_id* platforms; |
726 | 53 | equemene | cl_uint deviceCount; |
727 | 53 | equemene | cl_device_id* devices; |
728 | 53 | equemene | cl_uint maxComputeUnits; |
729 | 53 | equemene | cl_int maxWorkGroupSize; |
730 | 53 | equemene | cl_int maxWorkItemSizes; |
731 | 53 | equemene | cl_device_type dev_type; |
732 | 53 | equemene | |
733 | 53 | equemene | // get all platforms
|
734 | 53 | equemene | clGetPlatformIDs(0, NULL, &platformCount); |
735 | 53 | equemene | platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount);
|
736 | 53 | equemene | clGetPlatformIDs(platformCount, platforms, NULL);
|
737 | 53 | equemene | |
738 | 53 | equemene | |
739 | 53 | equemene | printf("OpenCL statistics: %d platform(s) detected\n\n",platformCount);
|
740 | 53 | equemene | |
741 | 53 | equemene | for (i = 0; i < platformCount; i++) { |
742 | 53 | equemene | |
743 | 53 | equemene | // get all devices
|
744 | 53 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &deviceCount); |
745 | 53 | equemene | devices = (cl_device_id*) malloc(sizeof(cl_device_id) * deviceCount);
|
746 | 53 | equemene | clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, deviceCount, devices, NULL);
|
747 | 53 | equemene | |
748 | 53 | equemene | // for each device print critical attributes
|
749 | 53 | equemene | for (j = 0; j < deviceCount; j++) { |
750 | 53 | equemene | |
751 | 53 | equemene | // print device name
|
752 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, 0, NULL, &valueSize); |
753 | 53 | equemene | value = (char*) malloc(valueSize);
|
754 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_NAME, valueSize, value, NULL);
|
755 | 53 | equemene | printf("Device (%d,%d): %s\n",i, j, value);
|
756 | 53 | equemene | free(value); |
757 | 53 | equemene | |
758 | 53 | equemene | // print type device CPU/GPU/ACCELERATOR
|
759 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL); |
760 | 53 | equemene | printf("\tDevice Type: ");
|
761 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_GPU)
|
762 | 53 | equemene | printf("CL_DEVICE_TYPE_GPU ");
|
763 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_CPU)
|
764 | 53 | equemene | printf("CL_DEVICE_TYPE_CPU ");
|
765 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_ACCELERATOR)
|
766 | 53 | equemene | printf("CL_DEVICE_TYPE_ACCELERATOR ");
|
767 | 53 | equemene | if(dev_type & CL_DEVICE_TYPE_DEFAULT)
|
768 | 53 | equemene | printf("CL_DEVICE_TYPE_DEFAULT ");
|
769 | 53 | equemene | printf("\n");
|
770 | 53 | equemene | |
771 | 53 | equemene | // print device vendor
|
772 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, 0, NULL, &valueSize); |
773 | 53 | equemene | value = (char*) malloc(valueSize);
|
774 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, valueSize, value, NULL);
|
775 | 53 | equemene | printf("\tDevice vendor: %s\n", value);
|
776 | 53 | equemene | free(value); |
777 | 53 | equemene | |
778 | 53 | equemene | // print hardware device version
|
779 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, 0, NULL, &valueSize); |
780 | 53 | equemene | value = (char*) malloc(valueSize);
|
781 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_VERSION, valueSize, value, NULL);
|
782 | 53 | equemene | printf("\tHardware version: %s\n", value);
|
783 | 53 | equemene | free(value); |
784 | 53 | equemene | |
785 | 53 | equemene | // print software driver version
|
786 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, 0, NULL, &valueSize); |
787 | 53 | equemene | value = (char*) malloc(valueSize);
|
788 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DRIVER_VERSION, valueSize, value, NULL);
|
789 | 53 | equemene | printf("\tSoftware version: %s\n", value);
|
790 | 53 | equemene | free(value); |
791 | 53 | equemene | |
792 | 53 | equemene | // print c version supported by compiler for device
|
793 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &valueSize); |
794 | 53 | equemene | value = (char*) malloc(valueSize);
|
795 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_OPENCL_C_VERSION, valueSize, value, NULL);
|
796 | 53 | equemene | printf("\tOpenCL C version: %s\n", value);
|
797 | 53 | equemene | free(value); |
798 | 53 | equemene | |
799 | 53 | equemene | // print parallel compute units
|
800 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, |
801 | 53 | equemene | sizeof(maxComputeUnits), &maxComputeUnits, NULL); |
802 | 53 | equemene | printf("\tParallel compute units: %d\n", maxComputeUnits);
|
803 | 53 | equemene | |
804 | 53 | equemene | // print max work group size
|
805 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, |
806 | 53 | equemene | sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); |
807 | 53 | equemene | printf("\tMaximum Work Group Size: %d\n", maxWorkGroupSize);
|
808 | 53 | equemene | |
809 | 53 | equemene | // print max work items size
|
810 | 53 | equemene | clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, |
811 | 53 | equemene | sizeof(maxWorkItemSizes), &maxWorkItemSizes, NULL); |
812 | 53 | equemene | printf("\tMaximum Work Item Sizes: %d\n", maxWorkItemSizes);
|
813 | 53 | equemene | |
814 | 53 | equemene | } |
815 | 53 | equemene | printf("\n");
|
816 | 53 | equemene | free(devices); |
817 | 53 | equemene | } |
818 | 53 | equemene | |
819 | 53 | equemene | free(platforms); |
820 | 53 | equemene | return 0; |
821 | 53 | equemene | |
822 | 53 | equemene | } |
823 | 53 | equemene | #endif
|
824 | 53 | equemene | |
825 | 1 | equemene | int main(int argc,char **argv) |
826 | 1 | equemene | { |
827 | 1 | equemene | if ((argc==1)|| |
828 | 1 | equemene | (strcmp(argv[1],"-h")==0)|| |
829 | 1 | equemene | (strcmp(argv[1],"--help")==0)) |
830 | 1 | equemene | { |
831 | 53 | equemene | #ifdef CLBLAS
|
832 | 1 | equemene | printf("\nPerforms a bench using BLAS library implementation:\n\n"
|
833 | 1 | equemene | "\t#1 Size of square matrices \n"
|
834 | 53 | equemene | "\t#2 Number of iterations \n"
|
835 | 53 | equemene | "\t#3 OpenCL Plateform ID\n"
|
836 | 53 | equemene | "\t#4 OpenCL Device ID\n\n");
|
837 | 53 | equemene | DelectOpenCLDevices(); |
838 | 53 | equemene | #else
|
839 | 53 | equemene | printf("\nPerforms a bench using BLAS library implementation:\n\n"
|
840 | 53 | equemene | "\t#1 Size of square matrices \n"
|
841 | 1 | equemene | "\t#2 Number of iterations\n\n");
|
842 | 53 | equemene | #endif
|
843 | 1 | equemene | } |
844 | 1 | equemene | else if ((atoi(argv[1])>=2)&& |
845 | 1 | equemene | (atoi(argv[2])>=1)) |
846 | 1 | equemene | { |
847 | 53 | equemene | #ifdef CLBLAS
|
848 | 53 | equemene | MyPlatform=atoi(argv[3]);
|
849 | 53 | equemene | MyDevice=atoi(argv[4]);
|
850 | 53 | equemene | #endif
|
851 | 1 | equemene | bench(atoi(argv[1]),atoi(argv[2])); |
852 | 1 | equemene | } |
853 | 1 | equemene | |
854 | 1 | equemene | return 0; |
855 | 1 | equemene | } |