Statistiques
| Révision :

root / src / auxil / HPL_dlatcpy.c @ 1

Historique | Voir | Annoter | Télécharger (16,67 ko)

1
/* 
2
 * -- High Performance Computing Linpack Benchmark (HPL)                
3
 *    HPL - 2.0 - September 10, 2008                          
4
 *    Antoine P. Petitet                                                
5
 *    University of Tennessee, Knoxville                                
6
 *    Innovative Computing Laboratory                                 
7
 *    (C) Copyright 2000-2008 All Rights Reserved                       
8
 *                                                                      
9
 * -- Copyright notice and Licensing terms:                             
10
 *                                                                      
11
 * Redistribution  and  use in  source and binary forms, with or without
12
 * modification, are  permitted provided  that the following  conditions
13
 * are met:                                                             
14
 *                                                                      
15
 * 1. Redistributions  of  source  code  must retain the above copyright
16
 * notice, this list of conditions and the following disclaimer.        
17
 *                                                                      
18
 * 2. Redistributions in binary form must reproduce  the above copyright
19
 * notice, this list of conditions,  and the following disclaimer in the
20
 * documentation and/or other materials provided with the distribution. 
21
 *                                                                      
22
 * 3. All  advertising  materials  mentioning  features  or  use of this
23
 * software must display the following acknowledgement:                 
24
 * This  product  includes  software  developed  at  the  University  of
25
 * Tennessee, Knoxville, Innovative Computing Laboratory.             
26
 *                                                                      
27
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28
 * names  of  its  contributors  may  not  be used to endorse or promote
29
 * products  derived   from   this  software  without  specific  written
30
 * permission.                                                          
31
 *                                                                      
32
 * -- Disclaimer:                                                       
33
 *                                                                      
34
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
45
 * ---------------------------------------------------------------------
46
 */ 
47
/*
48
 * Include files
49
 */
50
#include "hpl.h"
51
/*
 * Define default value for unrolling factors.
 *
 * The defaults that are actually compiled are the ones defined right after
 * this comment. The larger setting below is kept for reference only and is
 * NOT active:
 *
 * #ifndef HPL_LATCPY_M_DEPTH
 * #define    HPL_LATCPY_M_DEPTH      32
 * #define    HPL_LATCPY_LOG2_M_DEPTH  5
 * #endif
 * #ifndef HPL_LATCPY_N_DEPTH
 * #define    HPL_LATCPY_N_DEPTH       4
 * #define    HPL_LATCPY_LOG2_N_DEPTH  2
 * #endif
 */
62
#ifndef HPL_LATCPY_M_DEPTH
#define    HPL_LATCPY_M_DEPTH       4
#define    HPL_LATCPY_LOG2_M_DEPTH  2
#endif
#ifndef HPL_LATCPY_N_DEPTH
#define    HPL_LATCPY_N_DEPTH       2
#define    HPL_LATCPY_LOG2_N_DEPTH  1
#endif

#ifndef HPL_LATCPY_USE_COPY
/*
 * The unrolled kernel only exists for row depths 1, 2, 4, 8, 16, 32 and
 * column depths 1, 2, 4, and each LOG2 macro must match its depth. Any
 * other setting would either fail to compile with an obscure message or
 * silently copy the wrong elements, so it is rejected here.
 */
#if ( HPL_LATCPY_M_DEPTH < 1 ) || ( HPL_LATCPY_M_DEPTH > 32 ) || \
    ( ( 1 << HPL_LATCPY_LOG2_M_DEPTH ) != HPL_LATCPY_M_DEPTH )
#error "HPL_LATCPY_M_DEPTH must be 1, 2, 4, 8, 16 or 32 and equal to 1 << HPL_LATCPY_LOG2_M_DEPTH"
#endif
#if ( HPL_LATCPY_N_DEPTH < 1 ) || ( HPL_LATCPY_N_DEPTH > 4 ) || \
    ( ( 1 << HPL_LATCPY_LOG2_N_DEPTH ) != HPL_LATCPY_N_DEPTH )
#error "HPL_LATCPY_N_DEPTH must be 1, 2 or 4 and equal to 1 << HPL_LATCPY_LOG2_N_DEPTH"
#endif
/*
 * Local helpers (all #undef'd after the function).
 *
 * HPL_LATCPY_ROW1( k )  copies one element of the column tracked by A0/B0
 *                       into B0[k] and moves A0 to the next column of A.
 * HPL_LATCPY_ROW( k )   does the same for the HPL_LATCPY_N_DEPTH columns of
 *                       B processed together (pointers A0.., B0..).
 * HPL_LATCPY_ASTEP( d ) / HPL_LATCPY_BSTEP( d )
 *                       advance all active A / B pointers by d elements.
 */
#define HPL_LATCPY_ROW1( k_ )  ( B0[k_] = *A0, A0 += LDA )
#if   ( HPL_LATCPY_N_DEPTH == 1 )
#define HPL_LATCPY_ROW( k_ )   ( B0[k_] = *A0, A0 += LDA )
#define HPL_LATCPY_ASTEP( d_ ) ( A0 += (d_) )
#define HPL_LATCPY_BSTEP( d_ ) ( B0 += (d_) )
#elif ( HPL_LATCPY_N_DEPTH == 2 )
#define HPL_LATCPY_ROW( k_ )   ( B0[k_] = *A0, A0 += LDA, \
                                 B1[k_] = *A1, A1 += LDA )
#define HPL_LATCPY_ASTEP( d_ ) ( A0 += (d_), A1 += (d_) )
#define HPL_LATCPY_BSTEP( d_ ) ( B0 += (d_), B1 += (d_) )
#elif ( HPL_LATCPY_N_DEPTH == 4 )
#define HPL_LATCPY_ROW( k_ )   ( B0[k_] = *A0, A0 += LDA, \
                                 B1[k_] = *A1, A1 += LDA, \
                                 B2[k_] = *A2, A2 += LDA, \
                                 B3[k_] = *A3, A3 += LDA )
#define HPL_LATCPY_ASTEP( d_ ) ( A0 += (d_), A1 += (d_), \
                                 A2 += (d_), A3 += (d_) )
#define HPL_LATCPY_BSTEP( d_ ) ( B0 += (d_), B1 += (d_), \
                                 B2 += (d_), B3 += (d_) )
#endif
/*
 * HPL_LATCPY_Xn( R, k ) expands to the n statements R( k ); ... R( k+n-1 );
 * HPL_LATCPY_UNROLL( R ) applies R to the indices 0 .. HPL_LATCPY_M_DEPTH-1.
 */
#define HPL_LATCPY_X1(  R_, k_ ) R_( k_ );
#define HPL_LATCPY_X2(  R_, k_ ) HPL_LATCPY_X1(  R_, k_ ) HPL_LATCPY_X1(  R_, (k_) +  1 )
#define HPL_LATCPY_X4(  R_, k_ ) HPL_LATCPY_X2(  R_, k_ ) HPL_LATCPY_X2(  R_, (k_) +  2 )
#define HPL_LATCPY_X8(  R_, k_ ) HPL_LATCPY_X4(  R_, k_ ) HPL_LATCPY_X4(  R_, (k_) +  4 )
#define HPL_LATCPY_X16( R_, k_ ) HPL_LATCPY_X8(  R_, k_ ) HPL_LATCPY_X8(  R_, (k_) +  8 )
#define HPL_LATCPY_X32( R_, k_ ) HPL_LATCPY_X16( R_, k_ ) HPL_LATCPY_X16( R_, (k_) + 16 )
#if   ( HPL_LATCPY_M_DEPTH ==  1 )
#define HPL_LATCPY_UNROLL( R_ )  HPL_LATCPY_X1(  R_, 0 )
#elif ( HPL_LATCPY_M_DEPTH ==  2 )
#define HPL_LATCPY_UNROLL( R_ )  HPL_LATCPY_X2(  R_, 0 )
#elif ( HPL_LATCPY_M_DEPTH ==  4 )
#define HPL_LATCPY_UNROLL( R_ )  HPL_LATCPY_X4(  R_, 0 )
#elif ( HPL_LATCPY_M_DEPTH ==  8 )
#define HPL_LATCPY_UNROLL( R_ )  HPL_LATCPY_X8(  R_, 0 )
#elif ( HPL_LATCPY_M_DEPTH == 16 )
#define HPL_LATCPY_UNROLL( R_ )  HPL_LATCPY_X16( R_, 0 )
#elif ( HPL_LATCPY_M_DEPTH == 32 )
#define HPL_LATCPY_UNROLL( R_ )  HPL_LATCPY_X32( R_, 0 )
#endif
#endif /* !HPL_LATCPY_USE_COPY */

#ifdef STDC_HEADERS
void HPL_dlatcpy
(
   const int                        M,
   const int                        N,
   const double *                   A,
   const int                        LDA,
   double *                         B,
   const int                        LDB
)
#else
void HPL_dlatcpy
( M, N, A, LDA, B, LDB )
   const int                        M;
   const int                        N;
   const double *                   A;
   const int                        LDA;
   double *                         B;
   const int                        LDB;
#endif
{
/* 
 * Purpose
 * =======
 *
 * HPL_dlatcpy copies the transpose of an array A into an array B, i.e.
 * B( i, j ) = A( j, i ) for 0 <= i < M and 0 <= j < N, both arrays being
 * addressed with a leading dimension (element ( r, c ) of A is stored at
 * A[ r + c * LDA ], and likewise for B with LDB).
 *
 * When HPL_LATCPY_USE_COPY is defined, each column of B is filled by one
 * strided HPL_dcopy call. Otherwise a hand-unrolled kernel is used that
 * handles HPL_LATCPY_N_DEPTH columns of B and HPL_LATCPY_M_DEPTH rows of
 * B per step, followed by clean-up loops for the leftover rows/columns.
 *
 * Arguments
 * =========
 *
 * M       (local input)                 const int
 *         On entry,  M specifies the number of  rows of the array B and
 *         the number of columns of A. M must be at least zero.
 *
 * N       (local input)                 const int
 *         On entry,  N specifies the number of  rows of the array A and
 *         the number of columns of B. N must be at least zero.
 *
 * A       (local input)                 const double *
 *         On entry, A points to an array of dimension (LDA,M).
 *
 * LDA     (local input)                 const int
 *         On entry, LDA specifies the leading dimension of the array A.
 *         LDA must be at least MAX(1,N).
 *
 * B       (local output)                double *
 *         On entry, B points to an array of dimension (LDB,N). On exit,
 *         B is overwritten with the transpose of A.
 *
 * LDB     (local input)                 const int
 *         On entry, LDB specifies the leading dimension of the array B.
 *         LDB must be at least MAX(1,M).
 *
 * ---------------------------------------------------------------------
 */ 
/*
 * .. Local Variables ..
 */
#ifdef HPL_LATCPY_USE_COPY
/*
 * B0 walks the columns of B. It used to be referenced without being
 * declared in this configuration, which made the build fail.
 */
   double                     * B0 = B;
   int                        j;
#else
#if   ( HPL_LATCPY_N_DEPTH == 1 )
   const double               * A0 = A;
   double                     * B0 = B;
#elif ( HPL_LATCPY_N_DEPTH == 2 )
   const double               * A0 = A,              * A1 = A + 1;
   double                     * B0 = B,              * B1 = B +     LDB;
#elif ( HPL_LATCPY_N_DEPTH == 4 )
   const double               * A0 = A,              * A1 = A + 1,
                              * A2 = A + 2,          * A3 = A + 3;
   double                     * B0 = B,              * B1 = B +     LDB,
                              * B2 = B + (LDB << 1), * B3 = B + 3 * LDB;
#endif
/*
 * After a full sweep over the M rows, the A pointers have moved forward
 * by M * LDA and the B pointers by M. incA / incB rewind them and step to
 * the next group of HPL_LATCPY_N_DEPTH columns of B; incA0 / incB0 do the
 * same for the one-column-at-a-time clean-up loop.
 */
   const int                  incA = -M * LDA + (1 << HPL_LATCPY_LOG2_N_DEPTH),
                              incB = ( (unsigned int)(LDB) <<
                                       HPL_LATCPY_LOG2_N_DEPTH ) - M,
                              incA0 = -M * LDA + 1, incB0 = LDB - M;
   int                        mu, nu;
   int                        i, j;
#endif
/* ..
 * .. Executable Statements ..
 */
   if( ( M <= 0 ) || ( N <= 0 ) ) return;

#ifdef HPL_LATCPY_USE_COPY
/*
 * Column j of B is row j of A: M elements starting at A + j, LDA apart.
 */
   for( j = 0; j < N; j++, B0 += LDB ) HPL_dcopy( M, A + j, LDA, B0, 1 );
#else
/*
 * mu / nu: M and N rounded down to a multiple of the unrolling depths.
 */
   mu = (int)( ( (unsigned int)(M) >> HPL_LATCPY_LOG2_M_DEPTH ) <<
                                      HPL_LATCPY_LOG2_M_DEPTH );
   nu = (int)( ( (unsigned int)(N) >> HPL_LATCPY_LOG2_N_DEPTH ) <<
                                      HPL_LATCPY_LOG2_N_DEPTH );
/*
 * Main part: HPL_LATCPY_N_DEPTH columns of B at a time.
 */
   for( j = 0; j < nu; j += HPL_LATCPY_N_DEPTH )
   {
      for( i = 0; i < mu; i += HPL_LATCPY_M_DEPTH )
      {
         HPL_LATCPY_UNROLL( HPL_LATCPY_ROW )
         HPL_LATCPY_BSTEP( HPL_LATCPY_M_DEPTH );
      }
/*
 * Leftover rows of this column group, one row at a time.
 */
      for( i = mu; i < M; i++ )
      {
         HPL_LATCPY_ROW( 0 ); HPL_LATCPY_BSTEP( 1 );
      }

      HPL_LATCPY_ASTEP( incA ); HPL_LATCPY_BSTEP( incB );
   }
/*
 * Leftover columns of B, one at a time; only A0 / B0 are used from here.
 */
   for( j = nu; j < N; j++, B0 += incB0, A0 += incA0 )
   {
      for( i = 0; i < mu; i += HPL_LATCPY_M_DEPTH, B0 += HPL_LATCPY_M_DEPTH )
      {
         HPL_LATCPY_UNROLL( HPL_LATCPY_ROW1 )
      }

      for( i = mu; i < M; i++, B0++, A0 += LDA ) { *B0 = *A0; }
   }
#endif
/*
 * End of HPL_dlatcpy
 */
}

#ifndef HPL_LATCPY_USE_COPY
#undef HPL_LATCPY_UNROLL
#undef HPL_LATCPY_X32
#undef HPL_LATCPY_X16
#undef HPL_LATCPY_X8
#undef HPL_LATCPY_X4
#undef HPL_LATCPY_X2
#undef HPL_LATCPY_X1
#undef HPL_LATCPY_BSTEP
#undef HPL_LATCPY_ASTEP
#undef HPL_LATCPY_ROW
#undef HPL_LATCPY_ROW1
#endif