Statistiques
| Révision :

root / src / pfact / HPL_pdpanllT.c @ 9

Historique | Voir | Annoter | Télécharger (9,84 ko)

1
/* 
2
 * -- High Performance Computing Linpack Benchmark (HPL)                
3
 *    HPL - 2.0 - September 10, 2008                          
4
 *    Antoine P. Petitet                                                
5
 *    University of Tennessee, Knoxville                                
6
 *    Innovative Computing Laboratory                                 
7
 *    (C) Copyright 2000-2008 All Rights Reserved                       
8
 *                                                                      
9
 * -- Copyright notice and Licensing terms:                             
10
 *                                                                      
11
 * Redistribution  and  use in  source and binary forms, with or without
12
 * modification, are  permitted provided  that the following  conditions
13
 * are met:                                                             
14
 *                                                                      
15
 * 1. Redistributions  of  source  code  must retain the above copyright
16
 * notice, this list of conditions and the following disclaimer.        
17
 *                                                                      
18
 * 2. Redistributions in binary form must reproduce  the above copyright
19
 * notice, this list of conditions,  and the following disclaimer in the
20
 * documentation and/or other materials provided with the distribution. 
21
 *                                                                      
22
 * 3. All  advertising  materials  mentioning  features  or  use of this
23
 * software must display the following acknowledgement:                 
24
 * This  product  includes  software  developed  at  the  University  of
25
 * Tennessee, Knoxville, Innovative Computing Laboratory.             
26
 *                                                                      
27
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28
 * names  of  its  contributors  may  not  be used to endorse or promote
29
 * products  derived   from   this  software  without  specific  written
30
 * permission.                                                          
31
 *                                                                      
32
 * -- Disclaimer:                                                       
33
 *                                                                      
34
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
45
 * ---------------------------------------------------------------------
46
 */ 
47
/*
48
 * Include files
49
 */
50
#include "hpl.h"
51

    
52
#ifdef STDC_HEADERS
53
void HPL_pdpanllT
54
(
55
   HPL_T_panel *                    PANEL,
56
   const int                        M,
57
   const int                        N,
58
   const int                        ICOFF,
59
   double *                         WORK
60
)
61
#else
62
void HPL_pdpanllT
63
( PANEL, M, N, ICOFF, WORK )
64
   HPL_T_panel *                    PANEL;
65
   const int                        M;
66
   const int                        N;
67
   const int                        ICOFF;
68
   double *                         WORK;
69
#endif
70
{
71
/* 
72
 * Purpose
73
 * =======
74
 *
75
 * HPL_pdpanllT factorizes  a panel of columns that is a sub-array of a
76
 * larger one-dimensional panel A  using the Left-looking variant of the
77
 * usual one-dimensional algorithm.  The lower triangular N0-by-N0 upper
78
 * block of the panel is stored in transpose form.
79
 *  
80
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
81
 * operations  at once  for one column in the panel.  This  results in a
82
 * lower number of slightly larger  messages than usual.  On P processes
83
 * and assuming bi-directional links,  the running time of this function
84
 * can be approximated by (when N is equal to N0):
85
 *  
86
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
87
 *    N0^2 * ( M - N0/3 ) * gam2-3
88
 *  
89
 * where M is the local number of rows of  the panel, lat and bdwth  are
90
 * the latency and bandwidth of the network for  double  precision  real
91
 * words, and   gam2-3  is an estimate of the  Level 2 and Level 3  BLAS
92
 * rate of execution. The  recursive  algorithm  allows indeed to almost
93
 * achieve  Level 3 BLAS  performance  in the panel factorization.  On a
94
 * large  number of modern machines,  this  operation is however latency
95
 * bound,  meaning  that its cost can  be estimated  by only the latency
96
 * portion N0 * log_2(P) * lat.  Mono-directional links will double this
97
 * communication cost.
98
 *  
99
 * Note that  one  iteration of the the main loop is unrolled. The local
100
 * computation of the absolute value max of the next column is performed
101
 * just after its update by the current column. This allows to bring the
102
 * current column only  once through  cache at each  step.  The  current
103
 * implementation  does not perform  any blocking  for  this sequence of
104
 * BLAS operations, however the design allows for plugging in an optimal
105
 * (machine-specific) specialized  BLAS-like kernel.  This idea has been
106
 * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center.
107
 *
108
 * Arguments
109
 * =========
110
 *
111
 * PANEL   (local input/output)          HPL_T_panel *
112
 *         On entry,  PANEL  points to the data structure containing the
113
 *         panel information.
114
 *
115
 * M       (local input)                 const int
116
 *         On entry,  M specifies the local number of rows of sub(A).
117
 *
118
 * N       (local input)                 const int
119
 *         On entry,  N specifies the local number of columns of sub(A).
120
 *
121
 * ICOFF   (global input)                const int
122
 *         On entry, ICOFF specifies the row and column offset of sub(A)
123
 *         in A.
124
 *
125
 * WORK    (local workspace)             double *
126
 *         On entry, WORK  is a workarray of size at least 2*(4+2*N0).
127
 *
128
 * ---------------------------------------------------------------------
129
 */ 
130
/*
131
 * .. Local Variables ..
132
 */
133
   double                     * A, * L1, * L1ptr;
134
#ifdef HPL_CALL_VSIPL
135
   vsip_mview_d               * Av0, * Av1, * Yv1, * Xv0, * Xv1;
136
#endif
137
   int                        Mm1, Nm1, curr, ii, iip1, jj, kk, lda,
138
                              m=M, n0;
139
/* ..
140
 * .. Executable Statements ..
141
 */
142
#ifdef HPL_DETAILED_TIMING
143
   HPL_ptimer( HPL_TIMING_PFACT );
144
#endif
145
   A    = PANEL->A;   lda = PANEL->lda;
146
   L1   = PANEL->L1;  n0  = PANEL->jb;
147
   curr = (int)( PANEL->grid->myrow == PANEL->prow );
148

    
149
   Nm1 = N - 1; jj = ICOFF;
150
   if( curr != 0 ) { ii = ICOFF; iip1 = ii+1; Mm1 = m-1; }
151
   else            { ii = 0;     iip1 = ii;   Mm1 = m;   }
152
#ifdef HPL_CALL_VSIPL
153
/*
154
 * Admit the blocks
155
 */
156
   (void) vsip_blockadmit_d( PANEL->Ablock,  VSIP_TRUE );
157
   (void) vsip_blockadmit_d( PANEL->L1block, VSIP_TRUE );
158
/*
159
 * Create the matrix views
160
 */
161
   Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda,       lda, PANEL->pmat->nq );
162
   Xv0 = vsip_mbind_d( PANEL->L1block, 0, 1, PANEL->jb, PANEL->jb, PANEL->jb );
163
#endif
164
/*
165
 * Find local absolute value max in first column and initialize WORK[0:3]
166
 */
167
   HPL_dlocmax( PANEL, m, ii, jj, WORK );
168

    
169
   while( Nm1 > 0 )
170
   {
171
/*
172
 * Swap and broadcast the current row
173
 */
174
      HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
175
      HPL_dlocswpT( PANEL,    ii, jj, WORK );
176

    
177
      L1ptr = Mptr( L1, jj+1, ICOFF, n0 ); kk = jj + 1 - ICOFF;
178
      HPL_dtrsv( HplColumnMajor, HplUpper, HplTrans,   HplUnit, kk,
179
                 Mptr( L1, ICOFF, ICOFF, n0 ), n0, L1ptr, n0 );
180
/*
181
 * Scale  current column by its absolute value max entry  -  Update  and 
182
 * find local  absolute value max  in next column (Only one pass through 
183
 * cache for each next column).  This sequence of operations could bene-
184
 * fit from a specialized  blocked implementation.
185
 */ 
186
      if( WORK[0] != HPL_rzero )
187
         HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );
188
#ifdef HPL_CALL_VSIPL
189
/*
190
 * Create the matrix subviews
191
 */
192
      Av1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+ICOFF, Mm1, kk );
193
      Xv1 = vsip_msubview_d( Xv0, jj+1,         ICOFF,             1,   kk );
194
      Yv1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+jj+1,  Mm1,  1 );
195

    
196
      vsip_gemp_d( -HPL_rone, Av1, VSIP_MAT_NTRANS, Xv1, VSIP_MAT_TRANS,
197
                   HPL_rone, Yv1 );
198
/*
199
 * Destroy the matrix subviews
200
 */
201
      (void) vsip_mdestroy_d( Yv1 );
202
      (void) vsip_mdestroy_d( Xv1 );
203
      (void) vsip_mdestroy_d( Av1 );
204
#else
205
      HPL_dgemv( HplColumnMajor, HplNoTrans, Mm1, kk,  -HPL_rone,
206
                 Mptr( A, iip1, ICOFF, lda ), lda, L1ptr, n0,
207
                 HPL_rone, Mptr( A, iip1, jj+1, lda ),  1 );
208
#endif
209
      HPL_dlocmax( PANEL, Mm1, iip1, jj+1, WORK );
210
      if( curr != 0 )
211
      {
212
         HPL_dcopy( kk, L1ptr, n0, Mptr( A, ICOFF, jj+1, lda ), 1 );
213
         ii = iip1; iip1++; m = Mm1; Mm1--;
214
      }
215
      Nm1--; jj++;
216
   }
217
/*
218
 * Swap and broadcast last row - Scale last column by its absolute value
219
 * max entry
220
 */ 
221
   HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
222
   HPL_dlocswpT( PANEL,    ii, jj, WORK );
223
   if( WORK[0] != HPL_rzero )
224
      HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );
225

    
226
#ifdef HPL_CALL_VSIPL
227
/*
228
 * Release the blocks
229
 */
230
   (void) vsip_blockrelease_d( vsip_mgetblock_d( Xv0 ), VSIP_TRUE );
231
   (void) vsip_blockrelease_d( vsip_mgetblock_d( Av0 ), VSIP_TRUE );
232
/*
233
 * Destroy the matrix views
234
 */
235
   (void) vsip_mdestroy_d( Xv0 );
236
   (void) vsip_mdestroy_d( Av0 );
237
#endif
238
#ifdef HPL_DETAILED_TIMING
239
   HPL_ptimer( HPL_TIMING_PFACT );
240
#endif
241
/*
242
 * End of HPL_pdpanllT
243
 */
244
}