Statistiques
| Révision :

root / src / pfact / HPL_dlocswpT.c @ 1

Historique | Voir | Annoter | Télécharger (15,94 ko)

1
/* 
2
 * -- High Performance Computing Linpack Benchmark (HPL)                
3
 *    HPL - 2.0 - September 10, 2008                          
4
 *    Antoine P. Petitet                                                
5
 *    University of Tennessee, Knoxville                                
6
 *    Innovative Computing Laboratory                                 
7
 *    (C) Copyright 2000-2008 All Rights Reserved                       
8
 *                                                                      
9
 * -- Copyright notice and Licensing terms:                             
10
 *                                                                      
11
 * Redistribution  and  use in  source and binary forms, with or without
12
 * modification, are  permitted provided  that the following  conditions
13
 * are met:                                                             
14
 *                                                                      
15
 * 1. Redistributions  of  source  code  must retain the above copyright
16
 * notice, this list of conditions and the following disclaimer.        
17
 *                                                                      
18
 * 2. Redistributions in binary form must reproduce  the above copyright
19
 * notice, this list of conditions,  and the following disclaimer in the
20
 * documentation and/or other materials provided with the distribution. 
21
 *                                                                      
22
 * 3. All  advertising  materials  mentioning  features  or  use of this
23
 * software must display the following acknowledgement:                 
24
 * This  product  includes  software  developed  at  the  University  of
25
 * Tennessee, Knoxville, Innovative Computing Laboratory.             
26
 *                                                                      
27
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28
 * names  of  its  contributors  may  not  be used to endorse or promote
29
 * products  derived   from   this  software  without  specific  written
30
 * permission.                                                          
31
 *                                                                      
32
 * -- Disclaimer:                                                       
33
 *                                                                      
34
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
45
 * ---------------------------------------------------------------------
46
 */ 
47
/*
48
 * Include files
49
 */
50
#include "hpl.h"
51
/*
 * Define default value for unrolling factor
 *
 * HPL_LOCSWP_DEPTH is the number of elements handled per iteration of
 * the unrolled loops of HPL_dlocswpT;  HPL_LOCSWP_LOG2_DEPTH is used to
 * round the panel width down to the length covered by those loops. Both
 * may be overridden at compile time, but they must be overridden as a
 * pair.
 */
#ifndef HPL_LOCSWP_DEPTH
#define    HPL_LOCSWP_DEPTH        32
#define    HPL_LOCSWP_LOG2_DEPTH    5
#endif
/*
 * Reject settings  the unrolled loops cannot honour.  The loop bodies
 * only exist for depths 1, 2, 4, 8, 16 and 32,  and the  unrolled part
 * has a length that is a multiple of 2^HPL_LOCSWP_LOG2_DEPTH: unless
 * that length is also a multiple of HPL_LOCSWP_DEPTH, the unrolled loop
 * would run past the end of the row.
 */
#ifndef HPL_LOCSWP_LOG2_DEPTH
#error "HPL_LOCSWP_LOG2_DEPTH must be defined together with HPL_LOCSWP_DEPTH"
#elif ( HPL_LOCSWP_DEPTH !=  1 ) && ( HPL_LOCSWP_DEPTH !=  2 ) && \
      ( HPL_LOCSWP_DEPTH !=  4 ) && ( HPL_LOCSWP_DEPTH !=  8 ) && \
      ( HPL_LOCSWP_DEPTH != 16 ) && ( HPL_LOCSWP_DEPTH != 32 )
#error "HPL_LOCSWP_DEPTH must be one of 1, 2, 4, 8, 16 or 32"
#elif ( HPL_LOCSWP_LOG2_DEPTH < 0 ) || ( HPL_LOCSWP_LOG2_DEPTH > 31 )
#error "HPL_LOCSWP_LOG2_DEPTH must lie in the range 0 .. 31"
#elif ( ( 1UL << HPL_LOCSWP_LOG2_DEPTH ) % HPL_LOCSWP_DEPTH ) != 0
#error "2^HPL_LOCSWP_LOG2_DEPTH must be a multiple of HPL_LOCSWP_DEPTH"
#endif
58

    
59
#ifdef STDC_HEADERS
60
void HPL_dlocswpT
61
(
62
   HPL_T_panel *                    PANEL,
63
   const int                        II,
64
   const int                        JJ,
65
   double *                         WORK
66
)
67
#else
68
void HPL_dlocswpT
69
( PANEL, II, JJ, WORK )
70
   HPL_T_panel *                    PANEL;
71
   const int                        II;
72
   const int                        JJ;
73
   double *                         WORK;
74
#endif
75
{
76
/* 
77
 * Purpose
78
 * =======
79
 *
80
 * HPL_dlocswpT performs  the local swapping operations  within a panel.
81
 * The lower triangular  N0-by-N0  upper block of the panel is stored in
82
 * transpose form.
83
 *
84
 * Arguments
85
 * =========
86
 *
87
 * PANEL   (local input/output)          HPL_T_panel *
88
 *         On entry,  PANEL  points to the data structure containing the
89
 *         panel information.
90
 *
91
 * II      (local input)                 const int
92
 *         On entry, II  specifies the row offset where the column to be
93
 *         operated on starts with respect to the panel.
94
 *
95
 * JJ      (local input)                 const int
96
 *         On entry, JJ  specifies the column offset where the column to
97
 *         be operated on starts with respect to the panel.
98
 *
99
 * WORK    (local workspace)             double *
100
 *         On entry, WORK  is a workarray of size at least 2 * (4+2*N0).
101
 *         WORK[0] contains  the  local  maximum  absolute value scalar,
102
 *         WORK[1] contains  the corresponding local row index,  WORK[2]
103
 *         contains the corresponding global row index, and  WORK[3]  is
104
 *         the coordinate of process owning this max.  The N0 length max
105
 *         row is stored in WORK[4:4+N0-1];  Note  that this is also the
106
 *         JJth row  (or column) of L1. The remaining part of this array
107
 *         is used as workspace.
108
 *
109
 * ---------------------------------------------------------------------
110
 */ 
111
/*
112
 * .. Local Variables ..
113
 */
114
   double                     gmax;
115
   double                     * A1, * A2, * L, * Wr0, * Wmx;
116
   int                        ilindx, lda, myrow, n0, nr, nu;
117
   register int               i;
118
/* ..
119
 * .. Executable Statements ..
120
 */
121
   myrow = PANEL->grid->myrow; n0 = PANEL->jb; lda = PANEL->lda;
122

    
123
   Wr0   = ( Wmx = WORK + 4 ) + n0; Wmx[JJ] = gmax = WORK[0];
124
   nu    = (int)( ( (unsigned int)(n0) >> HPL_LOCSWP_LOG2_DEPTH ) 
125
                  << HPL_LOCSWP_LOG2_DEPTH );
126
   nr    = n0 - nu;
127
/*
128
 * Replicated swap and copy of the current (new) row of A into L1
129
 */
130
   L  = Mptr( PANEL->L1, 0, JJ, n0  );
131
/*
132
 * If the pivot is non-zero ...
133
 */
134
   if( gmax != HPL_rzero )
135
   {
136
/*
137
 * and if I own the current row of A ...
138
 */
139
      if( myrow == PANEL->prow )
140
      {
141
/*
142
 * and if I also own the row to be swapped with the current row of A ...
143
 */
144
         if( myrow == (int)(WORK[3]) )
145
         {
146
/*
147
 * and if the current row of A is not to swapped with itself ...
148
 */
149
            if( ( ilindx = (int)(WORK[1]) ) != 0 )
150
            {
151
/*
152
 * then copy the max row into L1 and locally swap the 2 rows of A.
153
 */
154
               A1 = Mptr( PANEL->A, II,     0, lda );
155
               A2 = Mptr( A1,       ilindx, 0, lda );
156

    
157
               for( i = 0; i < nu; i += HPL_LOCSWP_DEPTH,
158
                    Wmx += HPL_LOCSWP_DEPTH, Wr0 += HPL_LOCSWP_DEPTH,
159
                    L   += HPL_LOCSWP_DEPTH )
160
               {
161
                  L[ 0]=*A1=Wmx[ 0]; *A2=Wr0[ 0]; A1+=lda; A2+=lda;
162
#if ( HPL_LOCSWP_DEPTH >  1 )
163
                  L[ 1]=*A1=Wmx[ 1]; *A2=Wr0[ 1]; A1+=lda; A2+=lda;
164
#endif
165
#if ( HPL_LOCSWP_DEPTH >  2 )
166
                  L[ 2]=*A1=Wmx[ 2]; *A2=Wr0[ 2]; A1+=lda; A2+=lda;
167
                  L[ 3]=*A1=Wmx[ 3]; *A2=Wr0[ 3]; A1+=lda; A2+=lda;
168
#endif
169
#if ( HPL_LOCSWP_DEPTH >  4 )
170
                  L[ 4]=*A1=Wmx[ 4]; *A2=Wr0[ 4]; A1+=lda; A2+=lda;
171
                  L[ 5]=*A1=Wmx[ 5]; *A2=Wr0[ 5]; A1+=lda; A2+=lda;
172
                  L[ 6]=*A1=Wmx[ 6]; *A2=Wr0[ 6]; A1+=lda; A2+=lda;
173
                  L[ 7]=*A1=Wmx[ 7]; *A2=Wr0[ 7]; A1+=lda; A2+=lda;
174
#endif
175
#if ( HPL_LOCSWP_DEPTH >  8 )
176
                  L[ 8]=*A1=Wmx[ 8]; *A2=Wr0[ 8]; A1+=lda; A2+=lda;
177
                  L[ 9]=*A1=Wmx[ 9]; *A2=Wr0[ 9]; A1+=lda; A2+=lda;
178
                  L[10]=*A1=Wmx[10]; *A2=Wr0[10]; A1+=lda; A2+=lda;
179
                  L[11]=*A1=Wmx[11]; *A2=Wr0[11]; A1+=lda; A2+=lda;
180
                  L[12]=*A1=Wmx[12]; *A2=Wr0[12]; A1+=lda; A2+=lda;
181
                  L[13]=*A1=Wmx[13]; *A2=Wr0[13]; A1+=lda; A2+=lda;
182
                  L[14]=*A1=Wmx[14]; *A2=Wr0[14]; A1+=lda; A2+=lda;
183
                  L[15]=*A1=Wmx[15]; *A2=Wr0[15]; A1+=lda; A2+=lda;
184
#endif
185
#if ( HPL_LOCSWP_DEPTH > 16 )
186
                  L[16]=*A1=Wmx[16]; *A2=Wr0[16]; A1+=lda; A2+=lda;
187
                  L[17]=*A1=Wmx[17]; *A2=Wr0[17]; A1+=lda; A2+=lda;
188
                  L[18]=*A1=Wmx[18]; *A2=Wr0[18]; A1+=lda; A2+=lda;
189
                  L[19]=*A1=Wmx[19]; *A2=Wr0[19]; A1+=lda; A2+=lda;
190
                  L[20]=*A1=Wmx[20]; *A2=Wr0[20]; A1+=lda; A2+=lda;
191
                  L[21]=*A1=Wmx[21]; *A2=Wr0[21]; A1+=lda; A2+=lda;
192
                  L[22]=*A1=Wmx[22]; *A2=Wr0[22]; A1+=lda; A2+=lda;
193
                  L[23]=*A1=Wmx[23]; *A2=Wr0[23]; A1+=lda; A2+=lda;
194
                  L[24]=*A1=Wmx[24]; *A2=Wr0[24]; A1+=lda; A2+=lda;
195
                  L[25]=*A1=Wmx[25]; *A2=Wr0[25]; A1+=lda; A2+=lda;
196
                  L[26]=*A1=Wmx[26]; *A2=Wr0[26]; A1+=lda; A2+=lda;
197
                  L[27]=*A1=Wmx[27]; *A2=Wr0[27]; A1+=lda; A2+=lda;
198
                  L[28]=*A1=Wmx[28]; *A2=Wr0[28]; A1+=lda; A2+=lda;
199
                  L[29]=*A1=Wmx[29]; *A2=Wr0[29]; A1+=lda; A2+=lda;
200
                  L[30]=*A1=Wmx[30]; *A2=Wr0[30]; A1+=lda; A2+=lda;
201
                  L[31]=*A1=Wmx[31]; *A2=Wr0[31]; A1+=lda; A2+=lda;
202
#endif
203
               }
204

    
205
               for( i = 0; i < nr; i++, A1 += lda, A2 += lda )
206
               { L[i] = *A1 = Wmx[i]; *A2 = Wr0[i]; }
207
            }
208
            else
209
            {
210
/*
211
 * otherwise the current row of  A  is swapped with itself, so just copy
212
 * the current of A into L1.
213
 */
214
               *Mptr( PANEL->A, II, JJ, lda ) = gmax;
215

    
216
               for( i = 0; i < nu; i += HPL_LOCSWP_DEPTH,
217
                    Wmx += HPL_LOCSWP_DEPTH, L += HPL_LOCSWP_DEPTH )
218
               {
219
                  L[ 0]=Wmx[ 0];
220
#if ( HPL_LOCSWP_DEPTH >  1 )
221
                  L[ 1]=Wmx[ 1];
222
#endif
223
#if ( HPL_LOCSWP_DEPTH >  2 )
224
                  L[ 2]=Wmx[ 2]; L[ 3]=Wmx[ 3];
225
#endif
226
#if ( HPL_LOCSWP_DEPTH >  4 )
227
                  L[ 4]=Wmx[ 4]; L[ 5]=Wmx[ 5];
228
                  L[ 6]=Wmx[ 6]; L[ 7]=Wmx[ 7];
229
#endif
230
#if ( HPL_LOCSWP_DEPTH >  8 )
231
                  L[ 8]=Wmx[ 8]; L[12]=Wmx[12];
232
                  L[ 9]=Wmx[ 9]; L[13]=Wmx[13];
233
                  L[10]=Wmx[10]; L[14]=Wmx[14];
234
                  L[11]=Wmx[11]; L[15]=Wmx[15];
235
#endif
236
#if ( HPL_LOCSWP_DEPTH > 16 )
237
                  L[16]=Wmx[16]; L[20]=Wmx[20];
238
                  L[17]=Wmx[17]; L[21]=Wmx[21];
239
                  L[18]=Wmx[18]; L[22]=Wmx[22];
240
                  L[19]=Wmx[19]; L[23]=Wmx[23];
241
                  L[24]=Wmx[24]; L[28]=Wmx[28];
242
                  L[25]=Wmx[25]; L[29]=Wmx[29];
243
                  L[26]=Wmx[26]; L[30]=Wmx[30];
244
                  L[27]=Wmx[27]; L[31]=Wmx[31];
245
#endif
246
               }
247
               for( i = 0; i < nr; i++ ) { L[i] = Wmx[i]; }
248
            }
249
         }
250
         else
251
         {
252
/*
253
 * otherwise, the row to be swapped with the current row of A is in Wmx,
254
 * so copy Wmx into L1 and A.
255
 */
256
            A1 = Mptr( PANEL->A, II, 0, lda );
257

    
258
            for( i = 0; i < nu; i += HPL_LOCSWP_DEPTH,
259
                 Wmx += HPL_LOCSWP_DEPTH, L += HPL_LOCSWP_DEPTH )
260
            {
261
               L[ 0]=*A1=Wmx[ 0]; A1+=lda;
262
#if ( HPL_LOCSWP_DEPTH >  1 )
263
               L[ 1]=*A1=Wmx[ 1]; A1+=lda;
264
#endif
265
#if ( HPL_LOCSWP_DEPTH >  2 )
266
               L[ 2]=*A1=Wmx[ 2]; A1+=lda; L[ 3]=*A1=Wmx[ 3]; A1+=lda;
267
#endif
268
#if ( HPL_LOCSWP_DEPTH >  4 )
269
               L[ 4]=*A1=Wmx[ 4]; A1+=lda; L[ 5]=*A1=Wmx[ 5]; A1+=lda;
270
               L[ 6]=*A1=Wmx[ 6]; A1+=lda; L[ 7]=*A1=Wmx[ 7]; A1+=lda;
271
#endif
272
#if ( HPL_LOCSWP_DEPTH >  8 )
273
               L[ 8]=*A1=Wmx[ 8]; A1+=lda; L[ 9]=*A1=Wmx[ 9]; A1+=lda;
274
               L[10]=*A1=Wmx[10]; A1+=lda; L[11]=*A1=Wmx[11]; A1+=lda;
275
               L[12]=*A1=Wmx[12]; A1+=lda; L[13]=*A1=Wmx[13]; A1+=lda;
276
               L[14]=*A1=Wmx[14]; A1+=lda; L[15]=*A1=Wmx[15]; A1+=lda;
277
#endif
278
#if ( HPL_LOCSWP_DEPTH > 16 )
279
               L[16]=*A1=Wmx[16]; A1+=lda; L[17]=*A1=Wmx[17]; A1+=lda;
280
               L[18]=*A1=Wmx[18]; A1+=lda; L[19]=*A1=Wmx[19]; A1+=lda;
281
               L[20]=*A1=Wmx[20]; A1+=lda; L[21]=*A1=Wmx[21]; A1+=lda;
282
               L[22]=*A1=Wmx[22]; A1+=lda; L[23]=*A1=Wmx[23]; A1+=lda;
283
               L[24]=*A1=Wmx[24]; A1+=lda; L[25]=*A1=Wmx[25]; A1+=lda;
284
               L[26]=*A1=Wmx[26]; A1+=lda; L[27]=*A1=Wmx[27]; A1+=lda;
285
               L[28]=*A1=Wmx[28]; A1+=lda; L[29]=*A1=Wmx[29]; A1+=lda;
286
               L[30]=*A1=Wmx[30]; A1+=lda; L[31]=*A1=Wmx[31]; A1+=lda;
287
#endif
288
            }
289

    
290
            for( i = 0; i < nr; i++, A1 += lda ) { L[i]=*A1=Wmx[i]; } 
291
         }
292
      }
293
      else
294
      {
295
/*
296
 * otherwise I do not own the current row of A, so copy the max row  Wmx
297
 * into L1.
298
 */
299
         for( i = 0; i < nu; i += HPL_LOCSWP_DEPTH,
300
              Wmx += HPL_LOCSWP_DEPTH, L += HPL_LOCSWP_DEPTH )
301
         {
302
            L[ 0]=Wmx[ 0];
303
#if ( HPL_LOCSWP_DEPTH >  1 )
304
            L[ 1]=Wmx[ 1];
305
#endif
306
#if ( HPL_LOCSWP_DEPTH >  2 )
307
            L[ 2]=Wmx[ 2]; L[ 3]=Wmx[ 3];
308
#endif
309
#if ( HPL_LOCSWP_DEPTH >  4 )
310
            L[ 4]=Wmx[ 4]; L[ 5]=Wmx[ 5]; L[ 6]=Wmx[ 6]; L[ 7]=Wmx[ 7];
311
#endif
312
#if ( HPL_LOCSWP_DEPTH >  8 )
313
            L[ 8]=Wmx[ 8]; L[ 9]=Wmx[ 9]; L[10]=Wmx[10]; L[11]=Wmx[11];
314
            L[12]=Wmx[12]; L[13]=Wmx[13]; L[14]=Wmx[14]; L[15]=Wmx[15];
315
#endif
316
#if ( HPL_LOCSWP_DEPTH > 16 )
317
            L[16]=Wmx[16]; L[17]=Wmx[17]; L[18]=Wmx[18]; L[19]=Wmx[19];
318
            L[20]=Wmx[20]; L[21]=Wmx[21]; L[22]=Wmx[22]; L[23]=Wmx[23];
319
            L[24]=Wmx[24]; L[25]=Wmx[25]; L[26]=Wmx[26]; L[27]=Wmx[27];
320
            L[28]=Wmx[28]; L[29]=Wmx[29]; L[30]=Wmx[30]; L[31]=Wmx[31];
321
#endif
322
         }
323
         for( i = 0; i < nr; i++ ) { L[i] = Wmx[i]; }
324
/*
325
 * and if I own the max row, overwrite it with the current row Wr0.
326
 */
327
         if( myrow == (int)(WORK[3]) )
328
         {
329
            A2 = Mptr( PANEL->A, II + (size_t)(WORK[1]), 0, lda );
330

    
331
            for( i = 0; i < nu; i += HPL_LOCSWP_DEPTH,
332
                 Wr0 += HPL_LOCSWP_DEPTH )
333
            {
334
               *A2 = Wr0[ 0]; A2+=lda;
335
#if ( HPL_LOCSWP_DEPTH >  1 )
336
               *A2 = Wr0[ 1]; A2+=lda;
337
#endif
338
#if ( HPL_LOCSWP_DEPTH >  2 )
339
               *A2 = Wr0[ 2]; A2+=lda; *A2 = Wr0[ 3]; A2+=lda;
340
#endif
341
#if ( HPL_LOCSWP_DEPTH >  4 )
342
               *A2 = Wr0[ 4]; A2+=lda; *A2 = Wr0[ 5]; A2+=lda;
343
               *A2 = Wr0[ 6]; A2+=lda; *A2 = Wr0[ 7]; A2+=lda;
344
#endif
345
#if ( HPL_LOCSWP_DEPTH >  8 )
346
               *A2 = Wr0[ 8]; A2+=lda; *A2 = Wr0[ 9]; A2+=lda;
347
               *A2 = Wr0[10]; A2+=lda; *A2 = Wr0[11]; A2+=lda;
348
               *A2 = Wr0[12]; A2+=lda; *A2 = Wr0[13]; A2+=lda;
349
               *A2 = Wr0[14]; A2+=lda; *A2 = Wr0[15]; A2+=lda;
350
#endif
351
#if ( HPL_LOCSWP_DEPTH > 16 )
352
               *A2 = Wr0[16]; A2+=lda; *A2 = Wr0[17]; A2+=lda;
353
               *A2 = Wr0[18]; A2+=lda; *A2 = Wr0[19]; A2+=lda;
354
               *A2 = Wr0[20]; A2+=lda; *A2 = Wr0[21]; A2+=lda;
355
               *A2 = Wr0[22]; A2+=lda; *A2 = Wr0[23]; A2+=lda;
356
               *A2 = Wr0[24]; A2+=lda; *A2 = Wr0[25]; A2+=lda;
357
               *A2 = Wr0[26]; A2+=lda; *A2 = Wr0[27]; A2+=lda;
358
               *A2 = Wr0[28]; A2+=lda; *A2 = Wr0[29]; A2+=lda;
359
               *A2 = Wr0[30]; A2+=lda; *A2 = Wr0[31]; A2+=lda;
360
#endif
361
            }
362
            for( i = 0; i < nr; i++, A2 += lda ) { *A2 = Wr0[i]; }
363
         }
364
      }
365
   }
366
   else
367
   {
368
/*
369
 * Otherwise the max element in the current column is zero,  simply copy
370
 * the current row Wr0 into L1. The matrix is singular.
371
 */
372
      for( i = 0; i < nu; i += HPL_LOCSWP_DEPTH,
373
           Wr0 += HPL_LOCSWP_DEPTH, L += HPL_LOCSWP_DEPTH )
374
      {
375
         L[ 0]=Wr0[ 0];
376
#if ( HPL_LOCSWP_DEPTH >  1 )
377
         L[ 1]=Wr0[ 1];
378
#endif
379
#if ( HPL_LOCSWP_DEPTH >  2 )
380
         L[ 2]=Wr0[ 2]; L[ 3]=Wr0[ 3];
381
#endif
382
#if ( HPL_LOCSWP_DEPTH >  4 )
383
         L[ 4]=Wr0[ 4]; L[ 5]=Wr0[ 5]; L[ 6]=Wr0[ 6]; L[ 7]=Wr0[ 7];
384
#endif
385
#if ( HPL_LOCSWP_DEPTH >  8 )
386
         L[ 8]=Wr0[ 8]; L[12]=Wr0[12]; L[ 9]=Wr0[ 9]; L[13]=Wr0[13];
387
         L[10]=Wr0[10]; L[14]=Wr0[14]; L[11]=Wr0[11]; L[15]=Wr0[15];
388
#endif
389
#if ( HPL_LOCSWP_DEPTH > 16 )
390
         L[16]=Wr0[16]; L[20]=Wr0[20]; L[17]=Wr0[17]; L[21]=Wr0[21];
391
         L[18]=Wr0[18]; L[22]=Wr0[22]; L[19]=Wr0[19]; L[23]=Wr0[23];
392
         L[24]=Wr0[24]; L[28]=Wr0[28]; L[25]=Wr0[25]; L[29]=Wr0[29];
393
         L[26]=Wr0[26]; L[30]=Wr0[30]; L[27]=Wr0[27]; L[31]=Wr0[31];
394
#endif
395
      }
396
      for( i = 0; i < nr; i++ ) { L[i] = Wr0[i]; }
397
/*
398
 * Set INFO.
399
 */
400
      if( *(PANEL->DINFO) == 0.0 )
401
         *(PANEL->DINFO) = (double)(PANEL->ia + JJ + 1);
402
   }
403
/*
404
 * End of HPL_dlocswpT
405
 */
406
}