root / src / blas / HPL_dgemm.c @ 9
Historique | Voir | Annoter | Télécharger (20,06 ko)
1 | 1 | equemene | /*
|
---|---|---|---|
2 | 1 | equemene | * -- High Performance Computing Linpack Benchmark (HPL)
|
3 | 1 | equemene | * HPL - 2.0 - September 10, 2008
|
4 | 1 | equemene | * Antoine P. Petitet
|
5 | 1 | equemene | * University of Tennessee, Knoxville
|
6 | 1 | equemene | * Innovative Computing Laboratory
|
7 | 1 | equemene | * (C) Copyright 2000-2008 All Rights Reserved
|
8 | 1 | equemene | *
|
9 | 1 | equemene | * -- Copyright notice and Licensing terms:
|
10 | 1 | equemene | *
|
11 | 1 | equemene | * Redistribution and use in source and binary forms, with or without
|
12 | 1 | equemene | * modification, are permitted provided that the following conditions
|
13 | 1 | equemene | * are met:
|
14 | 1 | equemene | *
|
15 | 1 | equemene | * 1. Redistributions of source code must retain the above copyright
|
16 | 1 | equemene | * notice, this list of conditions and the following disclaimer.
|
17 | 1 | equemene | *
|
18 | 1 | equemene | * 2. Redistributions in binary form must reproduce the above copyright
|
19 | 1 | equemene | * notice, this list of conditions, and the following disclaimer in the
|
20 | 1 | equemene | * documentation and/or other materials provided with the distribution.
|
21 | 1 | equemene | *
|
22 | 1 | equemene | * 3. All advertising materials mentioning features or use of this
|
23 | 1 | equemene | * software must display the following acknowledgement:
|
24 | 1 | equemene | * This product includes software developed at the University of
|
25 | 1 | equemene | * Tennessee, Knoxville, Innovative Computing Laboratory.
|
26 | 1 | equemene | *
|
27 | 1 | equemene | * 4. The name of the University, the name of the Laboratory, or the
|
28 | 1 | equemene | * names of its contributors may not be used to endorse or promote
|
29 | 1 | equemene | * products derived from this software without specific written
|
30 | 1 | equemene | * permission.
|
31 | 1 | equemene | *
|
32 | 1 | equemene | * -- Disclaimer:
|
33 | 1 | equemene | *
|
34 | 1 | equemene | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
35 | 1 | equemene | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
36 | 1 | equemene | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
37 | 1 | equemene | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
38 | 1 | equemene | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
39 | 1 | equemene | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
40 | 1 | equemene | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
41 | 1 | equemene | * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
42 | 1 | equemene | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
43 | 1 | equemene | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
44 | 1 | equemene | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
45 | 1 | equemene | * ---------------------------------------------------------------------
|
46 | 1 | equemene | */
|
47 | 1 | equemene | /*
|
48 | 1 | equemene | * Include files
|
49 | 1 | equemene | */
|
50 | 1 | equemene | #include "hpl.h" |
51 | 1 | equemene | |
52 | 1 | equemene | #ifndef HPL_dgemm
|
53 | 1 | equemene | |
54 | 1 | equemene | #ifdef HPL_CALL_VSIPL
|
55 | 1 | equemene | |
56 | 1 | equemene | #ifdef STDC_HEADERS
|
57 | 1 | equemene | static void HPL_dgemmNN |
58 | 1 | equemene | ( |
59 | 1 | equemene | const int M, |
60 | 1 | equemene | const int N, |
61 | 1 | equemene | const int K, |
62 | 1 | equemene | const double ALPHA, |
63 | 1 | equemene | const double * A, |
64 | 1 | equemene | const int LDA, |
65 | 1 | equemene | const double * B, |
66 | 1 | equemene | const int LDB, |
67 | 1 | equemene | const double BETA, |
68 | 1 | equemene | double * C,
|
69 | 1 | equemene | const int LDC |
70 | 1 | equemene | ) |
71 | 1 | equemene | #else
|
72 | 1 | equemene | static void HPL_dgemmNN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ) |
73 | 1 | equemene | const int K, LDA, LDB, LDC, M, N; |
74 | 1 | equemene | const double ALPHA, BETA; |
75 | 1 | equemene | const double * A, * B; |
76 | 1 | equemene | double * C;
|
77 | 1 | equemene | #endif
|
78 | 1 | equemene | { |
79 | 1 | equemene | register double t0; |
80 | 1 | equemene | int i, iail, iblj, icij, j, jal, jbj, jcj, l;
|
81 | 1 | equemene | |
82 | 1 | equemene | for( j = 0, jbj = 0, jcj = 0; j < N; j++, jbj += LDB, jcj += LDC ) |
83 | 1 | equemene | { |
84 | 1 | equemene | HPL_dscal( M, BETA, C+jcj, 1 );
|
85 | 1 | equemene | for( l = 0, jal = 0, iblj = jbj; l < K; l++, jal += LDA, iblj += 1 ) |
86 | 1 | equemene | { |
87 | 1 | equemene | t0 = ALPHA * B[iblj]; |
88 | 1 | equemene | for( i = 0, iail = jal, icij = jcj; i < M; i++, iail += 1, icij += 1 ) |
89 | 1 | equemene | { C[icij] += A[iail] * t0; } |
90 | 1 | equemene | } |
91 | 1 | equemene | } |
92 | 1 | equemene | } |
93 | 1 | equemene | |
94 | 1 | equemene | #ifdef STDC_HEADERS
|
95 | 1 | equemene | static void HPL_dgemmNT |
96 | 1 | equemene | ( |
97 | 1 | equemene | const int M, |
98 | 1 | equemene | const int N, |
99 | 1 | equemene | const int K, |
100 | 1 | equemene | const double ALPHA, |
101 | 1 | equemene | const double * A, |
102 | 1 | equemene | const int LDA, |
103 | 1 | equemene | const double * B, |
104 | 1 | equemene | const int LDB, |
105 | 1 | equemene | const double BETA, |
106 | 1 | equemene | double * C,
|
107 | 1 | equemene | const int LDC |
108 | 1 | equemene | ) |
109 | 1 | equemene | #else
|
110 | 1 | equemene | static void HPL_dgemmNT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ) |
111 | 1 | equemene | const int K, LDA, LDB, LDC, M, N; |
112 | 1 | equemene | const double ALPHA, BETA; |
113 | 1 | equemene | const double * A, * B; |
114 | 1 | equemene | double * C;
|
115 | 1 | equemene | #endif
|
116 | 1 | equemene | { |
117 | 1 | equemene | register double t0; |
118 | 1 | equemene | int i, iail, ibj, ibjl, icij, j, jal, jcj, l;
|
119 | 1 | equemene | |
120 | 1 | equemene | for( j = 0, ibj = 0, jcj = 0; j < N; j++, ibj += 1, jcj += LDC ) |
121 | 1 | equemene | { |
122 | 1 | equemene | HPL_dscal( M, BETA, C+jcj, 1 );
|
123 | 1 | equemene | for( l = 0, jal = 0, ibjl = ibj; l < K; l++, jal += LDA, ibjl += LDB ) |
124 | 1 | equemene | { |
125 | 1 | equemene | t0 = ALPHA * B[ibjl]; |
126 | 1 | equemene | for( i = 0, iail = jal, icij = jcj; i < M; i++, iail += 1, icij += 1 ) |
127 | 1 | equemene | { C[icij] += A[iail] * t0; } |
128 | 1 | equemene | } |
129 | 1 | equemene | } |
130 | 1 | equemene | } |
131 | 1 | equemene | |
132 | 1 | equemene | #ifdef STDC_HEADERS
|
133 | 1 | equemene | static void HPL_dgemmTN |
134 | 1 | equemene | ( |
135 | 1 | equemene | const int M, |
136 | 1 | equemene | const int N, |
137 | 1 | equemene | const int K, |
138 | 1 | equemene | const double ALPHA, |
139 | 1 | equemene | const double * A, |
140 | 1 | equemene | const int LDA, |
141 | 1 | equemene | const double * B, |
142 | 1 | equemene | const int LDB, |
143 | 1 | equemene | const double BETA, |
144 | 1 | equemene | double * C,
|
145 | 1 | equemene | const int LDC |
146 | 1 | equemene | ) |
147 | 1 | equemene | #else
|
148 | 1 | equemene | static void HPL_dgemmTN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ) |
149 | 1 | equemene | const int K, LDA, LDB, LDC, M, N; |
150 | 1 | equemene | const double ALPHA, BETA; |
151 | 1 | equemene | const double * A, * B; |
152 | 1 | equemene | double * C;
|
153 | 1 | equemene | #endif
|
154 | 1 | equemene | { |
155 | 1 | equemene | register double t0; |
156 | 1 | equemene | int i, iai, iail, iblj, icij, j, jbj, jcj, l;
|
157 | 1 | equemene | |
158 | 1 | equemene | for( j = 0, jbj = 0, jcj = 0; j < N; j++, jbj += LDB, jcj += LDC ) |
159 | 1 | equemene | { |
160 | 1 | equemene | for( i = 0, icij = jcj, iai = 0; i < M; i++, icij += 1, iai += LDA ) |
161 | 1 | equemene | { |
162 | 1 | equemene | t0 = HPL_rzero; |
163 | 1 | equemene | for( l = 0, iail = iai, iblj = jbj; l < K; l++, iail += 1, iblj += 1 ) |
164 | 1 | equemene | { t0 += A[iail] * B[iblj]; } |
165 | 1 | equemene | if( BETA == HPL_rzero ) C[icij] = HPL_rzero;
|
166 | 1 | equemene | else C[icij] *= BETA;
|
167 | 1 | equemene | C[icij] += ALPHA * t0; |
168 | 1 | equemene | } |
169 | 1 | equemene | } |
170 | 1 | equemene | } |
171 | 1 | equemene | |
172 | 1 | equemene | #ifdef STDC_HEADERS
|
173 | 1 | equemene | static void HPL_dgemmTT |
174 | 1 | equemene | ( |
175 | 1 | equemene | const int M, |
176 | 1 | equemene | const int N, |
177 | 1 | equemene | const int K, |
178 | 1 | equemene | const double ALPHA, |
179 | 1 | equemene | const double * A, |
180 | 1 | equemene | const int LDA, |
181 | 1 | equemene | const double * B, |
182 | 1 | equemene | const int LDB, |
183 | 1 | equemene | const double BETA, |
184 | 1 | equemene | double * C,
|
185 | 1 | equemene | const int LDC |
186 | 1 | equemene | ) |
187 | 1 | equemene | #else
|
188 | 1 | equemene | static void HPL_dgemmTT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ) |
189 | 1 | equemene | const int K, LDA, LDB, LDC, M, N; |
190 | 1 | equemene | const double ALPHA, BETA; |
191 | 1 | equemene | const double * A, * B; |
192 | 1 | equemene | double * C;
|
193 | 1 | equemene | #endif
|
194 | 1 | equemene | { |
195 | 1 | equemene | register double t0; |
196 | 1 | equemene | int i, iali, ibj, ibjl, icij, j, jai, jcj, l;
|
197 | 1 | equemene | |
198 | 1 | equemene | for( j = 0, ibj = 0, jcj = 0; j < N; j++, ibj += 1, jcj += LDC ) |
199 | 1 | equemene | { |
200 | 1 | equemene | for( i = 0, icij = jcj, jai = 0; i < M; i++, icij += 1, jai += LDA ) |
201 | 1 | equemene | { |
202 | 1 | equemene | t0 = HPL_rzero; |
203 | 1 | equemene | for( l = 0, iali = jai, ibjl = ibj; |
204 | 1 | equemene | l < K; l++, iali += 1, ibjl += LDB ) t0 += A[iali] * B[ibjl];
|
205 | 1 | equemene | if( BETA == HPL_rzero ) C[icij] = HPL_rzero;
|
206 | 1 | equemene | else C[icij] *= BETA;
|
207 | 1 | equemene | C[icij] += ALPHA * t0; |
208 | 1 | equemene | } |
209 | 1 | equemene | } |
210 | 1 | equemene | } |
211 | 1 | equemene | |
212 | 1 | equemene | #ifdef STDC_HEADERS
|
213 | 1 | equemene | static void HPL_dgemm0 |
214 | 1 | equemene | ( |
215 | 1 | equemene | const enum HPL_TRANS TRANSA, |
216 | 1 | equemene | const enum HPL_TRANS TRANSB, |
217 | 1 | equemene | const int M, |
218 | 1 | equemene | const int N, |
219 | 1 | equemene | const int K, |
220 | 1 | equemene | const double ALPHA, |
221 | 1 | equemene | const double * A, |
222 | 1 | equemene | const int LDA, |
223 | 1 | equemene | const double * B, |
224 | 1 | equemene | const int LDB, |
225 | 1 | equemene | const double BETA, |
226 | 1 | equemene | double * C,
|
227 | 1 | equemene | const int LDC |
228 | 1 | equemene | ) |
229 | 1 | equemene | #else
|
230 | 1 | equemene | static void HPL_dgemm0( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, |
231 | 1 | equemene | BETA, C, LDC ) |
232 | 1 | equemene | const enum HPL_TRANS TRANSA, TRANSB; |
233 | 1 | equemene | const int K, LDA, LDB, LDC, M, N; |
234 | 1 | equemene | const double ALPHA, BETA; |
235 | 1 | equemene | const double * A, * B; |
236 | 1 | equemene | double * C;
|
237 | 1 | equemene | #endif
|
238 | 1 | equemene | { |
239 | 1 | equemene | int i, j;
|
240 | 1 | equemene | |
241 | 1 | equemene | if( ( M == 0 ) || ( N == 0 ) || |
242 | 1 | equemene | ( ( ( ALPHA == HPL_rzero ) || ( K == 0 ) ) &&
|
243 | 1 | equemene | ( BETA == HPL_rone ) ) ) return;
|
244 | 1 | equemene | |
245 | 1 | equemene | if( ALPHA == HPL_rzero )
|
246 | 1 | equemene | { |
247 | 1 | equemene | for( j = 0; j < N; j++ ) |
248 | 1 | equemene | { for( i = 0; i < M; i++ ) *(C+i+j*LDC) = HPL_rzero; } |
249 | 1 | equemene | return;
|
250 | 1 | equemene | } |
251 | 1 | equemene | |
252 | 1 | equemene | if( TRANSB == HplNoTrans )
|
253 | 1 | equemene | { |
254 | 1 | equemene | if( TRANSA == HplNoTrans )
|
255 | 1 | equemene | { HPL_dgemmNN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); } |
256 | 1 | equemene | else
|
257 | 1 | equemene | { HPL_dgemmTN( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); } |
258 | 1 | equemene | } |
259 | 1 | equemene | else
|
260 | 1 | equemene | { |
261 | 1 | equemene | if( TRANSA == HplNoTrans )
|
262 | 1 | equemene | { HPL_dgemmNT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); } |
263 | 1 | equemene | else
|
264 | 1 | equemene | { HPL_dgemmTT( M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ); } |
265 | 1 | equemene | } |
266 | 1 | equemene | } |
267 | 1 | equemene | |
268 | 1 | equemene | #endif
|
269 | 1 | equemene | |
270 | 1 | equemene | #ifdef STDC_HEADERS
|
271 | 1 | equemene | void HPL_dgemm
|
272 | 1 | equemene | ( |
273 | 1 | equemene | const enum HPL_ORDER ORDER, |
274 | 1 | equemene | const enum HPL_TRANS TRANSA, |
275 | 1 | equemene | const enum HPL_TRANS TRANSB, |
276 | 1 | equemene | const int M, |
277 | 1 | equemene | const int N, |
278 | 1 | equemene | const int K, |
279 | 1 | equemene | const double ALPHA, |
280 | 1 | equemene | const double * A, |
281 | 1 | equemene | const int LDA, |
282 | 1 | equemene | const double * B, |
283 | 1 | equemene | const int LDB, |
284 | 1 | equemene | const double BETA, |
285 | 1 | equemene | double * C,
|
286 | 1 | equemene | const int LDC |
287 | 1 | equemene | ) |
288 | 1 | equemene | #else
|
289 | 1 | equemene | void HPL_dgemm
|
290 | 1 | equemene | ( ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC ) |
291 | 1 | equemene | const enum HPL_ORDER ORDER; |
292 | 1 | equemene | const enum HPL_TRANS TRANSA; |
293 | 1 | equemene | const enum HPL_TRANS TRANSB; |
294 | 1 | equemene | const int M; |
295 | 1 | equemene | const int N; |
296 | 1 | equemene | const int K; |
297 | 1 | equemene | const double ALPHA; |
298 | 1 | equemene | const double * A; |
299 | 1 | equemene | const int LDA; |
300 | 1 | equemene | const double * B; |
301 | 1 | equemene | const int LDB; |
302 | 1 | equemene | const double BETA; |
303 | 1 | equemene | double * C;
|
304 | 1 | equemene | const int LDC; |
305 | 1 | equemene | #endif
|
306 | 1 | equemene | { |
307 | 1 | equemene | /*
|
308 | 1 | equemene | * Purpose
|
309 | 1 | equemene | * =======
|
310 | 1 | equemene | *
|
311 | 1 | equemene | * HPL_dgemm performs one of the matrix-matrix operations
|
312 | 1 | equemene | *
|
313 | 1 | equemene | * C := alpha * op( A ) * op( B ) + beta * C
|
314 | 1 | equemene | *
|
315 | 1 | equemene | * where op( X ) is one of
|
316 | 1 | equemene | *
|
317 | 1 | equemene | * op( X ) = X or op( X ) = X^T.
|
318 | 1 | equemene | *
|
319 | 1 | equemene | * Alpha and beta are scalars, and A, B and C are matrices, with op(A)
|
320 | 1 | equemene | * an m by k matrix, op(B) a k by n matrix and C an m by n matrix.
|
321 | 1 | equemene | *
|
322 | 1 | equemene | * Arguments
|
323 | 1 | equemene | * =========
|
324 | 1 | equemene | *
|
325 | 1 | equemene | * ORDER (local input) const enum HPL_ORDER
|
326 | 1 | equemene | * On entry, ORDER specifies the storage format of the operands
|
327 | 1 | equemene | * as follows:
|
328 | 1 | equemene | * ORDER = HplRowMajor,
|
329 | 1 | equemene | * ORDER = HplColumnMajor.
|
330 | 1 | equemene | *
|
331 | 1 | equemene | * TRANSA (local input) const enum HPL_TRANS
|
332 | 1 | equemene | * On entry, TRANSA specifies the form of op(A) to be used in
|
333 | 1 | equemene | * the matrix-matrix operation follows:
|
334 | 1 | equemene | * TRANSA==HplNoTrans : op( A ) = A,
|
335 | 1 | equemene | * TRANSA==HplTrans : op( A ) = A^T,
|
336 | 1 | equemene | * TRANSA==HplConjTrans : op( A ) = A^T.
|
337 | 1 | equemene | *
|
338 | 1 | equemene | * TRANSB (local input) const enum HPL_TRANS
|
339 | 1 | equemene | * On entry, TRANSB specifies the form of op(B) to be used in
|
340 | 1 | equemene | * the matrix-matrix operation follows:
|
341 | 1 | equemene | * TRANSB==HplNoTrans : op( B ) = B,
|
342 | 1 | equemene | * TRANSB==HplTrans : op( B ) = B^T,
|
343 | 1 | equemene | * TRANSB==HplConjTrans : op( B ) = B^T.
|
344 | 1 | equemene | *
|
345 | 1 | equemene | * M (local input) const int
|
346 | 1 | equemene | * On entry, M specifies the number of rows of the matrix
|
347 | 1 | equemene | * op(A) and of the matrix C. M must be at least zero.
|
348 | 1 | equemene | *
|
349 | 1 | equemene | * N (local input) const int
|
350 | 1 | equemene | * On entry, N specifies the number of columns of the matrix
|
351 | 1 | equemene | * op(B) and the number of columns of the matrix C. N must be
|
352 | 1 | equemene | * at least zero.
|
353 | 1 | equemene | *
|
354 | 1 | equemene | * K (local input) const int
|
355 | 1 | equemene | * On entry, K specifies the number of columns of the matrix
|
356 | 1 | equemene | * op(A) and the number of rows of the matrix op(B). K must be
|
357 | 1 | equemene | * be at least zero.
|
358 | 1 | equemene | *
|
359 | 1 | equemene | * ALPHA (local input) const double
|
360 | 1 | equemene | * On entry, ALPHA specifies the scalar alpha. When ALPHA is
|
361 | 1 | equemene | * supplied as zero then the elements of the matrices A and B
|
362 | 1 | equemene | * need not be set on input.
|
363 | 1 | equemene | *
|
364 | 1 | equemene | * A (local input) const double *
|
365 | 1 | equemene | * On entry, A is an array of dimension (LDA,ka), where ka is
|
366 | 1 | equemene | * k when TRANSA==HplNoTrans, and is m otherwise. Before
|
367 | 1 | equemene | * entry with TRANSA==HplNoTrans, the leading m by k part of
|
368 | 1 | equemene | * the array A must contain the matrix A, otherwise the leading
|
369 | 1 | equemene | * k by m part of the array A must contain the matrix A.
|
370 | 1 | equemene | *
|
371 | 1 | equemene | * LDA (local input) const int
|
372 | 1 | equemene | * On entry, LDA specifies the first dimension of A as declared
|
373 | 1 | equemene | * in the calling (sub) program. When TRANSA==HplNoTrans then
|
374 | 1 | equemene | * LDA must be at least max(1,m), otherwise LDA must be at least
|
375 | 1 | equemene | * max(1,k).
|
376 | 1 | equemene | *
|
377 | 1 | equemene | * B (local input) const double *
|
378 | 1 | equemene | * On entry, B is an array of dimension (LDB,kb), where kb is
|
379 | 1 | equemene | * n when TRANSB==HplNoTrans, and is k otherwise. Before
|
380 | 1 | equemene | * entry with TRANSB==HplNoTrans, the leading k by n part of
|
381 | 1 | equemene | * the array B must contain the matrix B, otherwise the leading
|
382 | 1 | equemene | * n by k part of the array B must contain the matrix B.
|
383 | 1 | equemene | *
|
384 | 1 | equemene | * LDB (local input) const int
|
385 | 1 | equemene | * On entry, LDB specifies the first dimension of B as declared
|
386 | 1 | equemene | * in the calling (sub) program. When TRANSB==HplNoTrans then
|
387 | 1 | equemene | * LDB must be at least max(1,k), otherwise LDB must be at least
|
388 | 1 | equemene | * max(1,n).
|
389 | 1 | equemene | *
|
390 | 1 | equemene | * BETA (local input) const double
|
391 | 1 | equemene | * On entry, BETA specifies the scalar beta. When BETA is
|
392 | 1 | equemene | * supplied as zero then the elements of the matrix C need
|
393 | 1 | equemene | * not be set on input.
|
394 | 1 | equemene | *
|
395 | 1 | equemene | * C (local input/output) double *
|
396 | 1 | equemene | * On entry, C is an array of dimension (LDC,n). Before entry,
|
397 | 1 | equemene | * the leading m by n part of the array C must contain the
|
398 | 1 | equemene | * matrix C, except when beta is zero, in which case C need not
|
399 | 1 | equemene | * be set on entry. On exit, the array C is overwritten by the
|
400 | 1 | equemene | * m by n matrix ( alpha*op( A )*op( B ) + beta*C ).
|
401 | 1 | equemene | *
|
402 | 1 | equemene | * LDC (local input) const int
|
403 | 1 | equemene | * On entry, LDC specifies the first dimension of C as declared
|
404 | 1 | equemene | * in the calling (sub) program. LDC must be at least
|
405 | 1 | equemene | * max(1,m).
|
406 | 1 | equemene | *
|
407 | 1 | equemene | * ---------------------------------------------------------------------
|
408 | 1 | equemene | */
|
409 | 1 | equemene | #ifdef HPL_CALL_CBLAS
|
410 | 1 | equemene | cblas_dgemm( ORDER, TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, |
411 | 1 | equemene | BETA, C, LDC ); |
412 | 1 | equemene | #endif
|
413 | 1 | equemene | #ifdef HPL_CALL_VSIPL
|
414 | 1 | equemene | if( ORDER == HplColumnMajor )
|
415 | 1 | equemene | { |
416 | 1 | equemene | HPL_dgemm0( TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, |
417 | 1 | equemene | C, LDC ); |
418 | 1 | equemene | } |
419 | 1 | equemene | else
|
420 | 1 | equemene | { |
421 | 1 | equemene | HPL_dgemm0( TRANSB, TRANSA, N, M, K, ALPHA, B, LDB, A, LDA, BETA, |
422 | 1 | equemene | C, LDC ); |
423 | 1 | equemene | } |
424 | 1 | equemene | #endif
|
425 | 1 | equemene | #ifdef HPL_CALL_FBLAS
|
426 | 1 | equemene | double alpha = ALPHA, beta = BETA;
|
427 | 1 | equemene | #ifdef StringSunStyle
|
428 | 1 | equemene | #ifdef HPL_USE_F77_INTEGER_DEF
|
429 | 1 | equemene | F77_INTEGER IONE = 1;
|
430 | 1 | equemene | #else
|
431 | 1 | equemene | int IONE = 1; |
432 | 1 | equemene | #endif
|
433 | 1 | equemene | #endif
|
434 | 1 | equemene | #ifdef StringStructVal
|
435 | 1 | equemene | F77_CHAR ftransa; |
436 | 1 | equemene | F77_CHAR ftransb; |
437 | 1 | equemene | #endif
|
438 | 1 | equemene | #ifdef StringStructPtr
|
439 | 1 | equemene | F77_CHAR ftransa; |
440 | 1 | equemene | F77_CHAR ftransb; |
441 | 1 | equemene | #endif
|
442 | 1 | equemene | #ifdef StringCrayStyle
|
443 | 1 | equemene | F77_CHAR ftransa; |
444 | 1 | equemene | F77_CHAR ftransb; |
445 | 1 | equemene | #endif
|
446 | 1 | equemene | #ifdef HPL_USE_F77_INTEGER_DEF
|
447 | 1 | equemene | const F77_INTEGER F77M = M, F77N = N, F77K = K,
|
448 | 1 | equemene | F77lda = LDA, F77ldb = LDB, F77ldc = LDC; |
449 | 1 | equemene | #else
|
450 | 1 | equemene | #define F77M M
|
451 | 1 | equemene | #define F77N N
|
452 | 1 | equemene | #define F77K K
|
453 | 1 | equemene | #define F77lda LDA
|
454 | 1 | equemene | #define F77ldb LDB
|
455 | 1 | equemene | #define F77ldc LDC
|
456 | 1 | equemene | #endif
|
457 | 1 | equemene | char ctransa, ctransb;
|
458 | 1 | equemene | |
459 | 1 | equemene | if( TRANSA == HplNoTrans ) ctransa = 'N'; |
460 | 1 | equemene | else if( TRANSA == HplTrans ) ctransa = 'T'; |
461 | 1 | equemene | else ctransa = 'C'; |
462 | 1 | equemene | |
463 | 1 | equemene | if( TRANSB == HplNoTrans ) ctransb = 'N'; |
464 | 1 | equemene | else if( TRANSB == HplTrans ) ctransb = 'T'; |
465 | 1 | equemene | else ctransb = 'C'; |
466 | 1 | equemene | |
467 | 1 | equemene | if( ORDER == HplColumnMajor )
|
468 | 1 | equemene | { |
469 | 1 | equemene | #ifdef StringSunStyle
|
470 | 1 | equemene | F77dgemm( &ctransa, &ctransb, &F77M, &F77N, &F77K, &alpha, A, &F77lda, |
471 | 1 | equemene | B, &F77ldb, &beta, C, &F77ldc, IONE, IONE ); |
472 | 1 | equemene | #endif
|
473 | 1 | equemene | #ifdef StringCrayStyle
|
474 | 1 | equemene | ftransa = HPL_C2F_CHAR( ctransa ); ftransb = HPL_C2F_CHAR( ctransb ); |
475 | 1 | equemene | F77dgemm( ftransa, ftransb, &F77M, &F77N, &F77K, &alpha, A, &F77lda, |
476 | 1 | equemene | B, &F77ldb, &beta, C, &F77ldc ); |
477 | 1 | equemene | #endif
|
478 | 1 | equemene | #ifdef StringStructVal
|
479 | 1 | equemene | ftransa.len = 1; ftransa.cp = &ctransa;
|
480 | 1 | equemene | ftransb.len = 1; ftransb.cp = &ctransb;
|
481 | 1 | equemene | F77dgemm( ftransa, ftransb, &F77M, &F77N, &F77K, &alpha, A, &F77lda, |
482 | 1 | equemene | B, &F77ldb, &beta, C, &F77ldc ); |
483 | 1 | equemene | #endif
|
484 | 1 | equemene | #ifdef StringStructPtr
|
485 | 1 | equemene | ftransa.len = 1; ftransa.cp = &ctransa;
|
486 | 1 | equemene | ftransb.len = 1; ftransb.cp = &ctransb;
|
487 | 1 | equemene | F77dgemm( &ftransa, &ftransb, &F77M, &F77N, &F77K, &alpha, A, &F77lda, |
488 | 1 | equemene | B, &F77ldb, &beta, C, &F77ldc ); |
489 | 1 | equemene | #endif
|
490 | 1 | equemene | } |
491 | 1 | equemene | else
|
492 | 1 | equemene | { |
493 | 1 | equemene | #ifdef StringSunStyle
|
494 | 1 | equemene | F77dgemm( &ctransb, &ctransa, &F77N, &F77M, &F77K, &alpha, B, &F77ldb, |
495 | 1 | equemene | A, &F77lda, &beta, C, &F77ldc, IONE, IONE ); |
496 | 1 | equemene | #endif
|
497 | 1 | equemene | #ifdef StringCrayStyle
|
498 | 1 | equemene | ftransa = HPL_C2F_CHAR( ctransa ); ftransb = HPL_C2F_CHAR( ctransb ); |
499 | 1 | equemene | F77dgemm( ftransb, ftransa, &F77N, &F77M, &F77K, &alpha, B, &F77ldb, |
500 | 1 | equemene | A, &F77lda, &beta, C, &F77ldc ); |
501 | 1 | equemene | #endif
|
502 | 1 | equemene | #ifdef StringStructVal
|
503 | 1 | equemene | ftransa.len = 1; ftransa.cp = &ctransa;
|
504 | 1 | equemene | ftransb.len = 1; ftransb.cp = &ctransb;
|
505 | 1 | equemene | F77dgemm( ftransb, ftransa, &F77N, &F77M, &F77K, &alpha, B, &F77ldb, |
506 | 1 | equemene | A, &F77lda, &beta, C, &F77ldc ); |
507 | 1 | equemene | #endif
|
508 | 1 | equemene | #ifdef StringStructPtr
|
509 | 1 | equemene | ftransa.len = 1; ftransa.cp = &ctransa;
|
510 | 1 | equemene | ftransb.len = 1; ftransb.cp = &ctransb;
|
511 | 1 | equemene | F77dgemm( &ftransb, &ftransa, &F77N, &F77M, &F77K, &alpha, B, &F77ldb, |
512 | 1 | equemene | A, &F77lda, &beta, C, &F77ldc ); |
513 | 1 | equemene | #endif
|
514 | 1 | equemene | } |
515 | 1 | equemene | #endif
|
516 | 9 | equemene | |
517 | 9 | equemene | #ifdef HPL_CALL_CUBLAS
|
518 | 9 | equemene | double alpha = ALPHA, beta = BETA;
|
519 | 9 | equemene | |
520 | 9 | equemene | int IONE = 1; |
521 | 9 | equemene | |
522 | 9 | equemene | #define CUBLASM M
|
523 | 9 | equemene | #define CUBLASN N
|
524 | 9 | equemene | #define CUBLASK K
|
525 | 9 | equemene | #define CUBLASlda LDA
|
526 | 9 | equemene | #define CUBLASldb LDB
|
527 | 9 | equemene | #define CUBLASldc LDC
|
528 | 9 | equemene | |
529 | 9 | equemene | char ctransa, ctransb;
|
530 | 9 | equemene | |
531 | 9 | equemene | if( TRANSA == HplNoTrans ) ctransa = 'N'; |
532 | 9 | equemene | else if( TRANSA == HplTrans ) ctransa = 'T'; |
533 | 9 | equemene | else ctransa = 'C'; |
534 | 9 | equemene | |
535 | 9 | equemene | if( TRANSB == HplNoTrans ) ctransb = 'N'; |
536 | 9 | equemene | else if( TRANSB == HplTrans ) ctransb = 'T'; |
537 | 9 | equemene | else ctransb = 'C'; |
538 | 9 | equemene | |
539 | 9 | equemene | if( ORDER == HplColumnMajor )
|
540 | 9 | equemene | { |
541 | 9 | equemene | CUBLAS_DGEMM( &ctransa, &ctransb, &CUBLASM, &CUBLASN, &CUBLASK, |
542 | 9 | equemene | &alpha, A, &CUBLASlda, B, &CUBLASldb, &beta, C, &CUBLASldc, |
543 | 9 | equemene | &IONE, &IONE ); |
544 | 9 | equemene | } |
545 | 9 | equemene | else
|
546 | 9 | equemene | { |
547 | 9 | equemene | CUBLAS_DGEMM( &ctransb, &ctransa, &CUBLASN, &CUBLASM, &CUBLASK, |
548 | 9 | equemene | &alpha, B, &CUBLASldb, A, &CUBLASlda, &beta, C, &CUBLASldc, |
549 | 9 | equemene | &IONE, &IONE ); |
550 | 9 | equemene | } |
551 | 9 | equemene | #endif
|
552 | 1 | equemene | /*
|
553 | 1 | equemene | * End of HPL_dgemm
|
554 | 1 | equemene | */
|
555 | 1 | equemene | } |
556 | 1 | equemene | |
557 | 1 | equemene | #endif |