Statistiques
| Révision :

root / src / pgesv / HPL_pdtrsv.c @ 9

Historique | Voir | Annoter | Télécharger (11,45 ko)

1 1 equemene
/*
2 1 equemene
 * -- High Performance Computing Linpack Benchmark (HPL)
3 1 equemene
 *    HPL - 2.0 - September 10, 2008
4 1 equemene
 *    Antoine P. Petitet
5 1 equemene
 *    University of Tennessee, Knoxville
6 1 equemene
 *    Innovative Computing Laboratory
7 1 equemene
 *    (C) Copyright 2000-2008 All Rights Reserved
8 1 equemene
 *
9 1 equemene
 * -- Copyright notice and Licensing terms:
10 1 equemene
 *
11 1 equemene
 * Redistribution  and  use in  source and binary forms, with or without
12 1 equemene
 * modification, are  permitted provided  that the following  conditions
13 1 equemene
 * are met:
14 1 equemene
 *
15 1 equemene
 * 1. Redistributions  of  source  code  must retain the above copyright
16 1 equemene
 * notice, this list of conditions and the following disclaimer.
17 1 equemene
 *
18 1 equemene
 * 2. Redistributions in binary form must reproduce  the above copyright
19 1 equemene
 * notice, this list of conditions,  and the following disclaimer in the
20 1 equemene
 * documentation and/or other materials provided with the distribution.
21 1 equemene
 *
22 1 equemene
 * 3. All  advertising  materials  mentioning  features  or  use of this
23 1 equemene
 * software must display the following acknowledgement:
24 1 equemene
 * This  product  includes  software  developed  at  the  University  of
25 1 equemene
 * Tennessee, Knoxville, Innovative Computing Laboratory.
26 1 equemene
 *
27 1 equemene
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28 1 equemene
 * names  of  its  contributors  may  not  be used to endorse or promote
29 1 equemene
 * products  derived   from   this  software  without  specific  written
30 1 equemene
 * permission.
31 1 equemene
 *
32 1 equemene
 * -- Disclaimer:
33 1 equemene
 *
34 1 equemene
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35 1 equemene
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36 1 equemene
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37 1 equemene
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38 1 equemene
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39 1 equemene
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40 1 equemene
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41 1 equemene
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42 1 equemene
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43 1 equemene
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44 1 equemene
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45 1 equemene
 * ---------------------------------------------------------------------
46 1 equemene
 */
47 1 equemene
/*
48 1 equemene
 * Include files
49 1 equemene
 */
50 1 equemene
#include "hpl.h"
51 1 equemene
52 1 equemene
#ifdef STDC_HEADERS
53 1 equemene
void HPL_pdtrsv
54 1 equemene
(
55 1 equemene
   HPL_T_grid *                     GRID,
56 1 equemene
   HPL_T_pmat *                     AMAT
57 1 equemene
)
58 1 equemene
#else
59 1 equemene
void HPL_pdtrsv
60 1 equemene
( GRID, AMAT )
61 1 equemene
   HPL_T_grid *                     GRID;
62 1 equemene
   HPL_T_pmat *                     AMAT;
63 1 equemene
#endif
64 1 equemene
{
65 1 equemene
/*
66 1 equemene
 * Purpose
67 1 equemene
 * =======
68 1 equemene
 *
69 1 equemene
 * HPL_pdtrsv solves an upper triangular system of linear equations.
70 1 equemene
 *
71 1 equemene
 * The rhs is the last column of the N by N+1 matrix A. The solve starts
72 1 equemene
 * in the process  column owning the  Nth  column of A, so the rhs b may
73 1 equemene
 * need to be moved one process column to the left at the beginning. The
74 1 equemene
 * routine therefore needs  a column  vector in every process column but
75 1 equemene
 * the one owning  b. The result is  replicated in all process rows, and
76 1 equemene
 * returned in XR, i.e. XR is of size nq = LOCq( N ) in all processes.
77 1 equemene
 *
78 1 equemene
 * The algorithm uses decreasing one-ring broadcast in process rows  and
79 1 equemene
 * columns  implemented  in terms of  synchronous communication point to
80 1 equemene
 * point primitives.  The  lookahead of depth 1 is used to minimize  the
81 1 equemene
 * critical path. This entire operation is essentially ``latency'' bound
82 1 equemene
 * and an estimate of its running time is given by:
83 1 equemene
 *
84 1 equemene
 *    (move rhs) lat + N / ( P bdwth ) +
85 1 equemene
 *    (solve)    ((N / NB)-1) 2 (lat + NB / bdwth) +
86 1 equemene
 *               gam2 N^2 / ( P Q ),
87 1 equemene
 *
88 1 equemene
 * where  gam2   is an estimate of the   Level 2 BLAS rate of execution.
89 1 equemene
 * There are  N / NB  diagonal blocks. One must exchange  2  messages of
90 1 equemene
 * length NB to compute the next  NB  entries of the vector solution, as
91 1 equemene
 * well as performing a total of N^2 floating point operations.
92 1 equemene
 *
93 1 equemene
 * Arguments
94 1 equemene
 * =========
95 1 equemene
 *
96 1 equemene
 * GRID    (local input)                 HPL_T_grid *
97 1 equemene
 *         On entry,  GRID  points  to the data structure containing the
98 1 equemene
 *         process grid information.
99 1 equemene
 *
100 1 equemene
 * AMAT    (local input/output)          HPL_T_pmat *
101 1 equemene
 *         On entry,  AMAT  points  to the data structure containing the
102 1 equemene
 *         local array information.
103 1 equemene
 *
104 1 equemene
 * ---------------------------------------------------------------------
105 1 equemene
 */
106 1 equemene
/*
107 1 equemene
 * .. Local Variables ..
108 1 equemene
 */
109 1 equemene
   MPI_Comm                   Ccomm, Rcomm;
110 1 equemene
   double                     * A=NULL, * Aprev=NULL, * Aptr, * XC=NULL,
111 1 equemene
                              * XR=NULL, * Xd=NULL, * Xdprev=NULL,
112 1 equemene
                              * W=NULL;
113 1 equemene
   int                        Alcol, Alrow, Anpprev, Anp, Anq, Bcol,
114 1 equemene
                              Cmsgid, GridIsNotPx1, GridIsNot1xQ, Rmsgid,
115 1 equemene
                              Wfr=0, colprev, kb, kbprev, lda, mycol,
116 1 equemene
                              myrow, n, n1, n1p, n1pprev=0, nb, npcol,
117 1 equemene
                              nprow, rowprev, tmp1, tmp2;
118 1 equemene
/* ..
119 1 equemene
 * .. Executable Statements ..
120 1 equemene
 */
121 1 equemene
#ifdef HPL_DETAILED_TIMING
122 1 equemene
   HPL_ptimer( HPL_TIMING_PTRSV );
123 1 equemene
#endif
124 1 equemene
   if( ( n = AMAT->n ) <= 0 ) return;
125 1 equemene
   nb = AMAT->nb; lda = AMAT->ld; A = AMAT->A; XR = AMAT->X;
126 1 equemene
127 1 equemene
   (void) HPL_grid_info( GRID, &nprow, &npcol, &myrow, &mycol );
128 1 equemene
   Rcomm = GRID->row_comm; Rmsgid = MSGID_BEGIN_PTRSV;
129 1 equemene
   Ccomm = GRID->col_comm; Cmsgid = MSGID_BEGIN_PTRSV + 1;
130 1 equemene
   GridIsNot1xQ = ( nprow > 1 ); GridIsNotPx1 = ( npcol > 1 );
131 1 equemene
/*
132 1 equemene
 * Move the rhs in the process column owning the last column of A.
133 1 equemene
 */
134 1 equemene
   Mnumroc( Anp, n, nb, nb, myrow, 0, nprow );
135 1 equemene
   Mnumroc( Anq, n, nb, nb, mycol, 0, npcol );
136 1 equemene
137 1 equemene
   tmp1  = ( n - 1 ) / nb;
138 1 equemene
   Alrow = tmp1 - ( tmp1 / nprow ) * nprow;
139 1 equemene
   Alcol = tmp1 - ( tmp1 / npcol ) * npcol;
140 1 equemene
   kb    = n    - tmp1 * nb;
141 1 equemene
142 1 equemene
   Aptr = (double *)(A); XC = Mptr( Aptr, 0, Anq, lda );
143 1 equemene
   Mindxg2p( n, nb, nb, Bcol, 0, npcol );
144 1 equemene
145 1 equemene
   if( ( Anp > 0 ) && ( Alcol != Bcol ) )
146 1 equemene
   {
147 1 equemene
      if( mycol == Bcol  )
148 1 equemene
      { (void) HPL_send( XC, Anp, Alcol, Rmsgid, Rcomm ); }
149 1 equemene
      else if( mycol == Alcol )
150 1 equemene
      { (void) HPL_recv( XC, Anp, Bcol,  Rmsgid, Rcomm ); }
151 1 equemene
   }
152 1 equemene
   Rmsgid = ( Rmsgid + 2 >
153 1 equemene
              MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid + 2 );
154 1 equemene
   if( mycol != Alcol )
155 1 equemene
   { for( tmp1=0; tmp1 < Anp; tmp1++ ) XC[tmp1] = HPL_rzero; }
156 1 equemene
/*
157 1 equemene
 * Set up lookahead
158 1 equemene
 */
159 1 equemene
   n1 = ( npcol - 1 ) * nb; n1 = Mmax( n1, nb );
160 1 equemene
   if( Anp > 0 )
161 1 equemene
   {
162 1 equemene
      W = (double*)malloc( (size_t)(Mmin( n1, Anp )) * sizeof( double ) );
163 1 equemene
      if( W == NULL )
164 1 equemene
      { HPL_pabort( __LINE__, "HPL_pdtrsv", "Memory allocation failed" ); }
165 1 equemene
      Wfr = 1;
166 1 equemene
   }
167 1 equemene
168 1 equemene
   Anpprev = Anp; Xdprev = XR; Aprev = Aptr = Mptr( Aptr, 0, Anq, lda );
169 1 equemene
   tmp1    = n - kb; tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) );
170 1 equemene
   MnumrocI( n1pprev, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow );
171 1 equemene
172 1 equemene
   if( myrow == Alrow ) { Anpprev = ( Anp -= kb ); }
173 1 equemene
   if( mycol == Alcol )
174 1 equemene
   {
175 1 equemene
      Aprev = ( Aptr -= lda * kb ); Anq -= kb; Xdprev = ( Xd = XR + Anq );
176 1 equemene
      if( myrow == Alrow )
177 1 equemene
      {
178 1 equemene
         HPL_dtrsv( HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit,
179 1 equemene
                    kb, Aptr+Anp, lda, XC+Anp, 1 );
180 1 equemene
         HPL_dcopy( kb, XC+Anp, 1, Xd, 1 );
181 1 equemene
      }
182 1 equemene
   }
183 1 equemene
184 1 equemene
   rowprev = Alrow; Alrow = MModSub1( Alrow, nprow );
185 1 equemene
   colprev = Alcol; Alcol = MModSub1( Alcol, npcol );
186 1 equemene
   kbprev  = kb; n -= kb;
187 1 equemene
   tmp1    = n - ( kb = nb ); tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) );
188 1 equemene
   MnumrocI( n1p, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow );
189 1 equemene
/*
190 1 equemene
 * Start the operations
191 1 equemene
 */
192 1 equemene
   while( n > 0 )
193 1 equemene
   {
194 1 equemene
      if( mycol == Alcol ) { Aptr -= lda * kb; Anq -= kb; Xd = XR + Anq; }
195 1 equemene
      if( myrow == Alrow ) { Anp -= kb; }
196 1 equemene
/*
197 1 equemene
 * Broadcast  (decreasing-ring)  of  previous solution block in previous
198 1 equemene
 * process column,  compute  partial update of current block and send it
199 1 equemene
 * to current process column.
200 1 equemene
 */
201 1 equemene
      if( mycol == colprev )
202 1 equemene
      {
203 1 equemene
/*
204 1 equemene
 * Send previous solution block in process row above
205 1 equemene
 */
206 1 equemene
         if( myrow == rowprev )
207 1 equemene
         {
208 1 equemene
            if( GridIsNot1xQ )
209 1 equemene
               (void) HPL_send( Xdprev, kbprev, MModSub1( myrow, nprow ),
210 1 equemene
                                Cmsgid, Ccomm );
211 1 equemene
         }
212 1 equemene
         else
213 1 equemene
         {
214 1 equemene
            (void) HPL_recv( Xdprev, kbprev, MModAdd1( myrow, nprow ),
215 1 equemene
                             Cmsgid, Ccomm );
216 1 equemene
         }
217 1 equemene
/*
218 1 equemene
 * Compute partial update of previous solution block and send it to cur-
219 1 equemene
 * rent column
220 1 equemene
 */
221 1 equemene
         if( n1pprev > 0 )
222 1 equemene
         {
223 1 equemene
            tmp1 = Anpprev - n1pprev;
224 1 equemene
            HPL_dgemv( HplColumnMajor, HplNoTrans, n1pprev, kbprev,
225 1 equemene
                       -HPL_rone, Aprev+tmp1, lda, Xdprev, 1, HPL_rone,
226 1 equemene
                       XC+tmp1, 1 );
227 1 equemene
            if( GridIsNotPx1 )
228 1 equemene
               (void) HPL_send( XC+tmp1, n1pprev, Alcol, Rmsgid, Rcomm );
229 1 equemene
         }
230 1 equemene
/*
231 1 equemene
 * Finish  the (decreasing-ring) broadcast of the solution block in pre-
232 1 equemene
 * vious process column
233 1 equemene
 */
234 1 equemene
         if( ( myrow != rowprev ) &&
235 1 equemene
             ( myrow != MModAdd1( rowprev, nprow ) ) )
236 1 equemene
            (void) HPL_send( Xdprev, kbprev, MModSub1( myrow, nprow ),
237 1 equemene
                             Cmsgid, Ccomm );
238 1 equemene
      }
239 1 equemene
      else if( mycol == Alcol )
240 1 equemene
      {
241 1 equemene
/*
242 1 equemene
 * Current  column  receives  and accumulates partial update of previous
243 1 equemene
 * solution block
244 1 equemene
 */
245 1 equemene
         if( n1pprev > 0 )
246 1 equemene
         {
247 1 equemene
            (void) HPL_recv( W, n1pprev, colprev, Rmsgid, Rcomm );
248 1 equemene
            HPL_daxpy( n1pprev, HPL_rone, W, 1, XC+Anpprev-n1pprev, 1 );
249 1 equemene
         }
250 1 equemene
      }
251 1 equemene
/*
252 1 equemene
 * Solve current diagonal block
253 1 equemene
 */
254 1 equemene
      if( ( mycol == Alcol ) && ( myrow == Alrow ) )
255 1 equemene
      {
256 1 equemene
         HPL_dtrsv( HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit,
257 1 equemene
                    kb, Aptr+Anp, lda, XC+Anp, 1 );
258 1 equemene
         HPL_dcopy( kb, XC+Anp, 1, XR+Anq, 1 );
259 1 equemene
      }
260 1 equemene
/*
261 1 equemene
*  Finish previous update
262 1 equemene
*/
263 1 equemene
      if( ( mycol == colprev ) && ( ( tmp1 = Anpprev - n1pprev ) > 0 ) )
264 1 equemene
         HPL_dgemv( HplColumnMajor, HplNoTrans, tmp1, kbprev, -HPL_rone,
265 1 equemene
                    Aprev, lda, Xdprev, 1, HPL_rone, XC, 1 );
266 1 equemene
/*
267 1 equemene
*  Save info of current step and update info for the next step
268 1 equemene
*/
269 1 equemene
      if( mycol == Alcol ) { Xdprev   = Xd; Aprev = Aptr; }
270 1 equemene
      if( myrow == Alrow ) { Anpprev -= kb; }
271 1 equemene
      rowprev = Alrow; colprev = Alcol;
272 1 equemene
      n1pprev = n1p;   kbprev  = kb; n -= kb;
273 1 equemene
      Alrow = MModSub1( Alrow, nprow ); Alcol = MModSub1( Alcol, npcol );
274 1 equemene
      tmp1  = n - ( kb = nb ); tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) );
275 1 equemene
      MnumrocI( n1p, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow );
276 1 equemene
277 1 equemene
      Rmsgid = ( Rmsgid+2 > MSGID_END_PTRSV ?
278 1 equemene
                 MSGID_BEGIN_PTRSV   : Rmsgid+2 );
279 1 equemene
      Cmsgid = ( Cmsgid+2 > MSGID_END_PTRSV ?
280 1 equemene
                 MSGID_BEGIN_PTRSV+1 : Cmsgid+2 );
281 1 equemene
   }
282 1 equemene
/*
283 1 equemene
 * Replicate last solution block
284 1 equemene
 */
285 1 equemene
   if( mycol == colprev )
286 1 equemene
      (void) HPL_broadcast( (void *)(XR), kbprev, HPL_DOUBLE, rowprev,
287 1 equemene
                            Ccomm );
288 1 equemene
289 1 equemene
   if( Wfr  ) free( W  );
290 1 equemene
#ifdef HPL_DETAILED_TIMING
291 1 equemene
   HPL_ptimer( HPL_TIMING_PTRSV );
292 1 equemene
#endif
293 1 equemene
/*
294 1 equemene
 * End of HPL_pdtrsv
295 1 equemene
 */
296 1 equemene
}