Statistiques
| Révision :

root / src / pfact / HPL_pdpanrlN.c @ 9

Historique | Voir | Annoter | Télécharger (9,98 ko)

1 1 equemene
/*
2 1 equemene
 * -- High Performance Computing Linpack Benchmark (HPL)
3 1 equemene
 *    HPL - 2.0 - September 10, 2008
4 1 equemene
 *    Antoine P. Petitet
5 1 equemene
 *    University of Tennessee, Knoxville
6 1 equemene
 *    Innovative Computing Laboratory
7 1 equemene
 *    (C) Copyright 2000-2008 All Rights Reserved
8 1 equemene
 *
9 1 equemene
 * -- Copyright notice and Licensing terms:
10 1 equemene
 *
11 1 equemene
 * Redistribution  and  use in  source and binary forms, with or without
12 1 equemene
 * modification, are  permitted provided  that the following  conditions
13 1 equemene
 * are met:
14 1 equemene
 *
15 1 equemene
 * 1. Redistributions  of  source  code  must retain the above copyright
16 1 equemene
 * notice, this list of conditions and the following disclaimer.
17 1 equemene
 *
18 1 equemene
 * 2. Redistributions in binary form must reproduce  the above copyright
19 1 equemene
 * notice, this list of conditions,  and the following disclaimer in the
20 1 equemene
 * documentation and/or other materials provided with the distribution.
21 1 equemene
 *
22 1 equemene
 * 3. All  advertising  materials  mentioning  features  or  use of this
23 1 equemene
 * software must display the following acknowledgement:
24 1 equemene
 * This  product  includes  software  developed  at  the  University  of
25 1 equemene
 * Tennessee, Knoxville, Innovative Computing Laboratory.
26 1 equemene
 *
27 1 equemene
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28 1 equemene
 * names  of  its  contributors  may  not  be used to endorse or promote
29 1 equemene
 * products  derived   from   this  software  without  specific  written
30 1 equemene
 * permission.
31 1 equemene
 *
32 1 equemene
 * -- Disclaimer:
33 1 equemene
 *
34 1 equemene
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35 1 equemene
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36 1 equemene
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37 1 equemene
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38 1 equemene
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39 1 equemene
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40 1 equemene
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41 1 equemene
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42 1 equemene
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43 1 equemene
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44 1 equemene
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45 1 equemene
 * ---------------------------------------------------------------------
46 1 equemene
 */
47 1 equemene
/*
48 1 equemene
 * Include files
49 1 equemene
 */
50 1 equemene
#include "hpl.h"
51 1 equemene
52 1 equemene
#ifdef STDC_HEADERS
53 1 equemene
void HPL_pdpanrlN
54 1 equemene
(
55 1 equemene
   HPL_T_panel *                    PANEL,
56 1 equemene
   const int                        M,
57 1 equemene
   const int                        N,
58 1 equemene
   const int                        ICOFF,
59 1 equemene
   double *                         WORK
60 1 equemene
)
61 1 equemene
#else
62 1 equemene
void HPL_pdpanrlN
63 1 equemene
( PANEL, M, N, ICOFF, WORK )
64 1 equemene
   HPL_T_panel *                    PANEL;
65 1 equemene
   const int                        M;
66 1 equemene
   const int                        N;
67 1 equemene
   const int                        ICOFF;
68 1 equemene
   double *                         WORK;
69 1 equemene
#endif
70 1 equemene
{
71 1 equemene
/*
72 1 equemene
 * Purpose
73 1 equemene
 * =======
74 1 equemene
 *
75 1 equemene
 * HPL_pdpanrlN factorizes  a panel of columns  that is a sub-array of a
76 1 equemene
 * larger one-dimensional panel A using the Right-looking variant of the
77 1 equemene
 * usual one-dimensional algorithm.  The lower triangular N0-by-N0 upper
78 1 equemene
 * block of the panel is stored in no-transpose form (i.e. just like the
79 1 equemene
 * input matrix itself).
80 1 equemene
 *
81 1 equemene
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
82 1 equemene
 * operations  at once  for one column in the panel.  This  results in a
83 1 equemene
 * lower number of slightly larger  messages than usual.  On P processes
84 1 equemene
 * and assuming bi-directional links,  the running time of this function
85 1 equemene
 * can be approximated by (when N is equal to N0):
86 1 equemene
 *
87 1 equemene
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
88 1 equemene
 *    N0^2 * ( M - N0/3 ) * gam2-3
89 1 equemene
 *
90 1 equemene
 * where M is the local number of rows of  the panel, lat and bdwth  are
91 1 equemene
 * the latency and bandwidth of the network for  double  precision  real
92 1 equemene
 * words, and  gam2-3  is  an estimate of the  Level 2 and Level 3  BLAS
93 1 equemene
 * rate of execution. The  recursive  algorithm  allows indeed to almost
94 1 equemene
 * achieve  Level 3 BLAS  performance  in the panel factorization.  On a
95 1 equemene
 * large  number of modern machines,  this  operation is however latency
96 1 equemene
 * bound,  meaning  that its cost can  be estimated  by only the latency
97 1 equemene
 * portion N0 * log_2(P) * lat.  Mono-directional links will double this
98 1 equemene
 * communication cost.
99 1 equemene
 *
100 1 equemene
 * Note that  one  iteration of the the main loop is unrolled. The local
101 1 equemene
 * computation of the absolute value max of the next column is performed
102 1 equemene
 * just after its update by the current column. This allows to bring the
103 1 equemene
 * current column only  once through  cache at each  step.  The  current
104 1 equemene
 * implementation  does not perform  any blocking  for  this sequence of
105 1 equemene
 * BLAS operations, however the design allows for plugging in an optimal
106 1 equemene
 * (machine-specific) specialized  BLAS-like kernel.  This idea has been
107 1 equemene
 * suggested to us by Fred Gustavson, IBM T.J. Watson Research Center.
108 1 equemene
 *
109 1 equemene
 * Arguments
110 1 equemene
 * =========
111 1 equemene
 *
112 1 equemene
 * PANEL   (local input/output)          HPL_T_panel *
113 1 equemene
 *         On entry,  PANEL  points to the data structure containing the
114 1 equemene
 *         panel information.
115 1 equemene
 *
116 1 equemene
 * M       (local input)                 const int
117 1 equemene
 *         On entry,  M specifies the local number of rows of sub(A).
118 1 equemene
 *
119 1 equemene
 * N       (local input)                 const int
120 1 equemene
 *         On entry,  N specifies the local number of columns of sub(A).
121 1 equemene
 *
122 1 equemene
 * ICOFF   (global input)                const int
123 1 equemene
 *         On entry, ICOFF specifies the row and column offset of sub(A)
124 1 equemene
 *         in A.
125 1 equemene
 *
126 1 equemene
 * WORK    (local workspace)             double *
127 1 equemene
 *         On entry, WORK  is a workarray of size at least 2*(4+2*N0).
128 1 equemene
 *
129 1 equemene
 * ---------------------------------------------------------------------
130 1 equemene
 */
131 1 equemene
/*
132 1 equemene
 * .. Local Variables ..
133 1 equemene
 */
134 1 equemene
   double                     * A, * Acur, * Anxt;
135 1 equemene
#ifdef HPL_CALL_VSIPL
136 1 equemene
   vsip_mview_d               * Av0, * Av1, * Xv1, * Yv0, * Yv1;
137 1 equemene
#endif
138 1 equemene
   int                        Mm1, Nm1, curr, ii, iip1, jj, lda, m=M;
139 1 equemene
/* ..
140 1 equemene
 * .. Executable Statements ..
141 1 equemene
 */
142 1 equemene
#ifdef HPL_DETAILED_TIMING
143 1 equemene
   HPL_ptimer( HPL_TIMING_PFACT );
144 1 equemene
#endif
145 1 equemene
   A    = PANEL->A;   lda = PANEL->lda;
146 1 equemene
   curr = (int)( PANEL->grid->myrow == PANEL->prow );
147 1 equemene
148 1 equemene
   Nm1  = N - 1; jj = ICOFF;
149 1 equemene
   if( curr != 0 ) { ii = ICOFF; iip1 = ii+1; Mm1 = m-1; }
150 1 equemene
   else            { ii = 0;     iip1 = ii;   Mm1 = m;   }
151 1 equemene
#ifdef HPL_CALL_VSIPL
152 1 equemene
/*
153 1 equemene
 * Admit the blocks
154 1 equemene
 */
155 1 equemene
   (void) vsip_blockadmit_d(  PANEL->Ablock,  VSIP_TRUE );
156 1 equemene
   (void) vsip_blockadmit_d(  PANEL->L1block, VSIP_TRUE );
157 1 equemene
/*
158 1 equemene
 * Create the matrix views
159 1 equemene
 */
160 1 equemene
   Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda,       lda, PANEL->pmat->nq );
161 1 equemene
   Yv0 = vsip_mbind_d( PANEL->L1block, 0, 1, PANEL->jb, PANEL->jb, PANEL->jb );
162 1 equemene
#endif
163 1 equemene
/*
164 1 equemene
 * Find local absolute value max in first column - initialize WORK[0:3]
165 1 equemene
 */
166 1 equemene
   HPL_dlocmax( PANEL, m, ii, jj, WORK );
167 1 equemene
168 1 equemene
   while( Nm1 >= 1 )
169 1 equemene
   {
170 1 equemene
      Acur = Mptr( A, iip1, jj, lda ); Anxt = Mptr( Acur, 0, 1, lda );
171 1 equemene
/*
172 1 equemene
 * Swap and broadcast the current row
173 1 equemene
 */
174 1 equemene
      HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
175 1 equemene
      HPL_dlocswpN( PANEL,    ii, jj, WORK );
176 1 equemene
/*
177 1 equemene
 * Scale current column by its absolute value max entry  -  Update trai-
178 1 equemene
 * ling sub-matrix and find local absolute value max in next column (On-
179 1 equemene
 * ly one pass through cache for each current column).  This sequence of
180 1 equemene
 * operations could benefit from a specialized blocked implementation.
181 1 equemene
 */
182 1 equemene
      if( WORK[0] != HPL_rzero )
183 1 equemene
         HPL_dscal( Mm1, HPL_rone / WORK[0], Acur, 1 );
184 1 equemene
      HPL_daxpy( Mm1, -WORK[4+jj+1], Acur, 1, Anxt, 1 );
185 1 equemene
      HPL_dlocmax( PANEL, Mm1, iip1, jj+1, WORK );
186 1 equemene
#ifdef HPL_CALL_VSIPL
187 1 equemene
      if( Nm1 > 1 )
188 1 equemene
      {
189 1 equemene
/*
190 1 equemene
 * Create the matrix subviews
191 1 equemene
 */
192 1 equemene
         Av1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+jj+2,
193 1 equemene
                                Mm1, Nm1-1 );
194 1 equemene
         Xv1 = vsip_msubview_d( Av0, PANEL->ii+iip1, PANEL->jj+jj,
195 1 equemene
                                Mm1, 1   );
196 1 equemene
         Yv1 = vsip_msubview_d( Yv0, jj, jj+2, 1, Nm1-1 );
197 1 equemene
198 1 equemene
         vsip_gemp_d( -HPL_rone, Xv1, VSIP_MAT_NTRANS, Yv1, VSIP_MAT_NTRANS,
199 1 equemene
                      HPL_rone, Av1 );
200 1 equemene
/*
201 1 equemene
 * Destroy the matrix subviews
202 1 equemene
 */
203 1 equemene
         (void) vsip_mdestroy_d( Yv1 );
204 1 equemene
         (void) vsip_mdestroy_d( Xv1 );
205 1 equemene
         (void) vsip_mdestroy_d( Av1 );
206 1 equemene
      }
207 1 equemene
#else
208 1 equemene
      if( Nm1 > 1 )
209 1 equemene
         HPL_dger( HplColumnMajor, Mm1, Nm1-1, -HPL_rone, Acur, 1,
210 1 equemene
                   WORK+4+jj+2, 1, Mptr( Anxt, 0, 1, lda ), lda );
211 1 equemene
#endif
212 1 equemene
/*
213 1 equemene
 * Same thing as above but with worse data access on y (A += x * y^T)
214 1 equemene
 *
215 1 equemene
 *    if( Nm1 > 1 ) )
216 1 equemene
 *       HPL_dger( HplColumnMajor, Mm1, Nm1-1, -HPL_rone, Acur, 1,
217 1 equemene
 *                 Mptr( L1, jj, jj+2, n0 ), n0, Mptr( Anxt, 0, 1, lda ),
218 1 equemene
 *                 lda );
219 1 equemene
 */
220 1 equemene
      if( curr != 0 ) { ii = iip1; iip1++; m = Mm1; Mm1--; }
221 1 equemene
222 1 equemene
      Nm1--; jj++;
223 1 equemene
   }
224 1 equemene
/*
225 1 equemene
 * Swap and broadcast last row - Scale last column by its absolute value
226 1 equemene
 * max entry
227 1 equemene
 */
228 1 equemene
   HPL_pdmxswp(  PANEL, m, ii, jj, WORK );
229 1 equemene
   HPL_dlocswpN( PANEL,    ii, jj, WORK );
230 1 equemene
   if( WORK[0] != HPL_rzero )
231 1 equemene
      HPL_dscal( Mm1, HPL_rone / WORK[0], Mptr( A, iip1, jj, lda ), 1 );
232 1 equemene
#ifdef HPL_CALL_VSIPL
233 1 equemene
/*
234 1 equemene
 * Release the blocks
235 1 equemene
 */
236 1 equemene
   (void) vsip_blockrelease_d( vsip_mgetblock_d( Yv0 ), VSIP_TRUE );
237 1 equemene
   (void) vsip_blockrelease_d( vsip_mgetblock_d( Av0 ), VSIP_TRUE );
238 1 equemene
/*
239 1 equemene
 * Destroy the matrix views
240 1 equemene
 */
241 1 equemene
   (void) vsip_mdestroy_d( Yv0 );
242 1 equemene
   (void) vsip_mdestroy_d( Av0 );
243 1 equemene
#endif
244 1 equemene
#ifdef HPL_DETAILED_TIMING
245 1 equemene
   HPL_ptimer( HPL_TIMING_PFACT );
246 1 equemene
#endif
247 1 equemene
/*
248 1 equemene
 * End of HPL_pdpanrlN
249 1 equemene
 */
250 1 equemene
}