root / src / pgesv / HPL_pdtrsv.c
Historique | Voir | Annoter | Télécharger (11,45 ko)
1 |
/*
|
---|---|
2 |
* -- High Performance Computing Linpack Benchmark (HPL)
|
3 |
* HPL - 2.0 - September 10, 2008
|
4 |
* Antoine P. Petitet
|
5 |
* University of Tennessee, Knoxville
|
6 |
* Innovative Computing Laboratory
|
7 |
* (C) Copyright 2000-2008 All Rights Reserved
|
8 |
*
|
9 |
* -- Copyright notice and Licensing terms:
|
10 |
*
|
11 |
* Redistribution and use in source and binary forms, with or without
|
12 |
* modification, are permitted provided that the following conditions
|
13 |
* are met:
|
14 |
*
|
15 |
* 1. Redistributions of source code must retain the above copyright
|
16 |
* notice, this list of conditions and the following disclaimer.
|
17 |
*
|
18 |
* 2. Redistributions in binary form must reproduce the above copyright
|
19 |
* notice, this list of conditions, and the following disclaimer in the
|
20 |
* documentation and/or other materials provided with the distribution.
|
21 |
*
|
22 |
* 3. All advertising materials mentioning features or use of this
|
23 |
* software must display the following acknowledgement:
|
24 |
* This product includes software developed at the University of
|
25 |
* Tennessee, Knoxville, Innovative Computing Laboratory.
|
26 |
*
|
27 |
* 4. The name of the University, the name of the Laboratory, or the
|
28 |
* names of its contributors may not be used to endorse or promote
|
29 |
* products derived from this software without specific written
|
30 |
* permission.
|
31 |
*
|
32 |
* -- Disclaimer:
|
33 |
*
|
34 |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
35 |
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
36 |
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
37 |
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
38 |
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
39 |
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
40 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
41 |
* DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
42 |
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
43 |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
44 |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
45 |
* ---------------------------------------------------------------------
|
46 |
*/
|
47 |
/*
|
48 |
* Include files
|
49 |
*/
|
50 |
#include "hpl.h" |
51 |
|
52 |
#ifdef STDC_HEADERS
|
53 |
void HPL_pdtrsv
|
54 |
( |
55 |
HPL_T_grid * GRID, |
56 |
HPL_T_pmat * AMAT |
57 |
) |
58 |
#else
|
59 |
void HPL_pdtrsv
|
60 |
( GRID, AMAT ) |
61 |
HPL_T_grid * GRID; |
62 |
HPL_T_pmat * AMAT; |
63 |
#endif
|
64 |
{ |
65 |
/*
|
66 |
* Purpose
|
67 |
* =======
|
68 |
*
|
69 |
* HPL_pdtrsv solves an upper triangular system of linear equations.
|
70 |
*
|
71 |
* The rhs is the last column of the N by N+1 matrix A. The solve starts
|
72 |
* in the process column owning the Nth column of A, so the rhs b may
|
73 |
* need to be moved one process column to the left at the beginning. The
|
74 |
* routine therefore needs a column vector in every process column but
|
75 |
* the one owning b. The result is replicated in all process rows, and
|
76 |
* returned in XR, i.e. XR is of size nq = LOCq( N ) in all processes.
|
77 |
*
|
78 |
* The algorithm uses decreasing one-ring broadcast in process rows and
|
79 |
* columns implemented in terms of synchronous communication point to
|
80 |
* point primitives. The lookahead of depth 1 is used to minimize the
|
81 |
* critical path. This entire operation is essentially ``latency'' bound
|
82 |
* and an estimate of its running time is given by:
|
83 |
*
|
84 |
* (move rhs) lat + N / ( P bdwth ) +
|
85 |
* (solve) ((N / NB)-1) 2 (lat + NB / bdwth) +
|
86 |
* gam2 N^2 / ( P Q ),
|
87 |
*
|
88 |
* where gam2 is an estimate of the Level 2 BLAS rate of execution.
|
89 |
* There are N / NB diagonal blocks. One must exchange 2 messages of
|
90 |
* length NB to compute the next NB entries of the vector solution, as
|
91 |
* well as performing a total of N^2 floating point operations.
|
92 |
*
|
93 |
* Arguments
|
94 |
* =========
|
95 |
*
|
96 |
* GRID (local input) HPL_T_grid *
|
97 |
* On entry, GRID points to the data structure containing the
|
98 |
* process grid information.
|
99 |
*
|
100 |
* AMAT (local input/output) HPL_T_pmat *
|
101 |
* On entry, AMAT points to the data structure containing the
|
102 |
* local array information.
|
103 |
*
|
104 |
* ---------------------------------------------------------------------
|
105 |
*/
|
106 |
/*
|
107 |
* .. Local Variables ..
|
108 |
*/
|
109 |
MPI_Comm Ccomm, Rcomm; |
110 |
double * A=NULL, * Aprev=NULL, * Aptr, * XC=NULL, |
111 |
* XR=NULL, * Xd=NULL, * Xdprev=NULL, |
112 |
* W=NULL;
|
113 |
int Alcol, Alrow, Anpprev, Anp, Anq, Bcol,
|
114 |
Cmsgid, GridIsNotPx1, GridIsNot1xQ, Rmsgid, |
115 |
Wfr=0, colprev, kb, kbprev, lda, mycol,
|
116 |
myrow, n, n1, n1p, n1pprev=0, nb, npcol,
|
117 |
nprow, rowprev, tmp1, tmp2; |
118 |
/* ..
|
119 |
* .. Executable Statements ..
|
120 |
*/
|
121 |
#ifdef HPL_DETAILED_TIMING
|
122 |
HPL_ptimer( HPL_TIMING_PTRSV ); |
123 |
#endif
|
124 |
if( ( n = AMAT->n ) <= 0 ) return; |
125 |
nb = AMAT->nb; lda = AMAT->ld; A = AMAT->A; XR = AMAT->X; |
126 |
|
127 |
(void) HPL_grid_info( GRID, &nprow, &npcol, &myrow, &mycol );
|
128 |
Rcomm = GRID->row_comm; Rmsgid = MSGID_BEGIN_PTRSV; |
129 |
Ccomm = GRID->col_comm; Cmsgid = MSGID_BEGIN_PTRSV + 1;
|
130 |
GridIsNot1xQ = ( nprow > 1 ); GridIsNotPx1 = ( npcol > 1 ); |
131 |
/*
|
132 |
* Move the rhs in the process column owning the last column of A.
|
133 |
*/
|
134 |
Mnumroc( Anp, n, nb, nb, myrow, 0, nprow );
|
135 |
Mnumroc( Anq, n, nb, nb, mycol, 0, npcol );
|
136 |
|
137 |
tmp1 = ( n - 1 ) / nb;
|
138 |
Alrow = tmp1 - ( tmp1 / nprow ) * nprow; |
139 |
Alcol = tmp1 - ( tmp1 / npcol ) * npcol; |
140 |
kb = n - tmp1 * nb; |
141 |
|
142 |
Aptr = (double *)(A); XC = Mptr( Aptr, 0, Anq, lda ); |
143 |
Mindxg2p( n, nb, nb, Bcol, 0, npcol );
|
144 |
|
145 |
if( ( Anp > 0 ) && ( Alcol != Bcol ) ) |
146 |
{ |
147 |
if( mycol == Bcol )
|
148 |
{ (void) HPL_send( XC, Anp, Alcol, Rmsgid, Rcomm ); }
|
149 |
else if( mycol == Alcol ) |
150 |
{ (void) HPL_recv( XC, Anp, Bcol, Rmsgid, Rcomm ); }
|
151 |
} |
152 |
Rmsgid = ( Rmsgid + 2 >
|
153 |
MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid + 2 );
|
154 |
if( mycol != Alcol )
|
155 |
{ for( tmp1=0; tmp1 < Anp; tmp1++ ) XC[tmp1] = HPL_rzero; } |
156 |
/*
|
157 |
* Set up lookahead
|
158 |
*/
|
159 |
n1 = ( npcol - 1 ) * nb; n1 = Mmax( n1, nb );
|
160 |
if( Anp > 0 ) |
161 |
{ |
162 |
W = (double*)malloc( (size_t)(Mmin( n1, Anp )) * sizeof( double ) ); |
163 |
if( W == NULL ) |
164 |
{ HPL_pabort( __LINE__, "HPL_pdtrsv", "Memory allocation failed" ); } |
165 |
Wfr = 1;
|
166 |
} |
167 |
|
168 |
Anpprev = Anp; Xdprev = XR; Aprev = Aptr = Mptr( Aptr, 0, Anq, lda );
|
169 |
tmp1 = n - kb; tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) ); |
170 |
MnumrocI( n1pprev, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow ); |
171 |
|
172 |
if( myrow == Alrow ) { Anpprev = ( Anp -= kb ); }
|
173 |
if( mycol == Alcol )
|
174 |
{ |
175 |
Aprev = ( Aptr -= lda * kb ); Anq -= kb; Xdprev = ( Xd = XR + Anq ); |
176 |
if( myrow == Alrow )
|
177 |
{ |
178 |
HPL_dtrsv( HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, |
179 |
kb, Aptr+Anp, lda, XC+Anp, 1 );
|
180 |
HPL_dcopy( kb, XC+Anp, 1, Xd, 1 ); |
181 |
} |
182 |
} |
183 |
|
184 |
rowprev = Alrow; Alrow = MModSub1( Alrow, nprow ); |
185 |
colprev = Alcol; Alcol = MModSub1( Alcol, npcol ); |
186 |
kbprev = kb; n -= kb; |
187 |
tmp1 = n - ( kb = nb ); tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) ); |
188 |
MnumrocI( n1p, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow ); |
189 |
/*
|
190 |
* Start the operations
|
191 |
*/
|
192 |
while( n > 0 ) |
193 |
{ |
194 |
if( mycol == Alcol ) { Aptr -= lda * kb; Anq -= kb; Xd = XR + Anq; }
|
195 |
if( myrow == Alrow ) { Anp -= kb; }
|
196 |
/*
|
197 |
* Broadcast (decreasing-ring) of previous solution block in previous
|
198 |
* process column, compute partial update of current block and send it
|
199 |
* to current process column.
|
200 |
*/
|
201 |
if( mycol == colprev )
|
202 |
{ |
203 |
/*
|
204 |
* Send previous solution block in process row above
|
205 |
*/
|
206 |
if( myrow == rowprev )
|
207 |
{ |
208 |
if( GridIsNot1xQ )
|
209 |
(void) HPL_send( Xdprev, kbprev, MModSub1( myrow, nprow ),
|
210 |
Cmsgid, Ccomm ); |
211 |
} |
212 |
else
|
213 |
{ |
214 |
(void) HPL_recv( Xdprev, kbprev, MModAdd1( myrow, nprow ),
|
215 |
Cmsgid, Ccomm ); |
216 |
} |
217 |
/*
|
218 |
* Compute partial update of previous solution block and send it to cur-
|
219 |
* rent column
|
220 |
*/
|
221 |
if( n1pprev > 0 ) |
222 |
{ |
223 |
tmp1 = Anpprev - n1pprev; |
224 |
HPL_dgemv( HplColumnMajor, HplNoTrans, n1pprev, kbprev, |
225 |
-HPL_rone, Aprev+tmp1, lda, Xdprev, 1, HPL_rone,
|
226 |
XC+tmp1, 1 );
|
227 |
if( GridIsNotPx1 )
|
228 |
(void) HPL_send( XC+tmp1, n1pprev, Alcol, Rmsgid, Rcomm );
|
229 |
} |
230 |
/*
|
231 |
* Finish the (decreasing-ring) broadcast of the solution block in pre-
|
232 |
* vious process column
|
233 |
*/
|
234 |
if( ( myrow != rowprev ) &&
|
235 |
( myrow != MModAdd1( rowprev, nprow ) ) ) |
236 |
(void) HPL_send( Xdprev, kbprev, MModSub1( myrow, nprow ),
|
237 |
Cmsgid, Ccomm ); |
238 |
} |
239 |
else if( mycol == Alcol ) |
240 |
{ |
241 |
/*
|
242 |
* Current column receives and accumulates partial update of previous
|
243 |
* solution block
|
244 |
*/
|
245 |
if( n1pprev > 0 ) |
246 |
{ |
247 |
(void) HPL_recv( W, n1pprev, colprev, Rmsgid, Rcomm );
|
248 |
HPL_daxpy( n1pprev, HPL_rone, W, 1, XC+Anpprev-n1pprev, 1 ); |
249 |
} |
250 |
} |
251 |
/*
|
252 |
* Solve current diagonal block
|
253 |
*/
|
254 |
if( ( mycol == Alcol ) && ( myrow == Alrow ) )
|
255 |
{ |
256 |
HPL_dtrsv( HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, |
257 |
kb, Aptr+Anp, lda, XC+Anp, 1 );
|
258 |
HPL_dcopy( kb, XC+Anp, 1, XR+Anq, 1 ); |
259 |
} |
260 |
/*
|
261 |
* Finish previous update
|
262 |
*/
|
263 |
if( ( mycol == colprev ) && ( ( tmp1 = Anpprev - n1pprev ) > 0 ) ) |
264 |
HPL_dgemv( HplColumnMajor, HplNoTrans, tmp1, kbprev, -HPL_rone, |
265 |
Aprev, lda, Xdprev, 1, HPL_rone, XC, 1 ); |
266 |
/*
|
267 |
* Save info of current step and update info for the next step
|
268 |
*/
|
269 |
if( mycol == Alcol ) { Xdprev = Xd; Aprev = Aptr; }
|
270 |
if( myrow == Alrow ) { Anpprev -= kb; }
|
271 |
rowprev = Alrow; colprev = Alcol; |
272 |
n1pprev = n1p; kbprev = kb; n -= kb; |
273 |
Alrow = MModSub1( Alrow, nprow ); Alcol = MModSub1( Alcol, npcol ); |
274 |
tmp1 = n - ( kb = nb ); tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) ); |
275 |
MnumrocI( n1p, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow ); |
276 |
|
277 |
Rmsgid = ( Rmsgid+2 > MSGID_END_PTRSV ?
|
278 |
MSGID_BEGIN_PTRSV : Rmsgid+2 );
|
279 |
Cmsgid = ( Cmsgid+2 > MSGID_END_PTRSV ?
|
280 |
MSGID_BEGIN_PTRSV+1 : Cmsgid+2 ); |
281 |
} |
282 |
/*
|
283 |
* Replicate last solution block
|
284 |
*/
|
285 |
if( mycol == colprev )
|
286 |
(void) HPL_broadcast( (void *)(XR), kbprev, HPL_DOUBLE, rowprev, |
287 |
Ccomm ); |
288 |
|
289 |
if( Wfr ) free( W );
|
290 |
#ifdef HPL_DETAILED_TIMING
|
291 |
HPL_ptimer( HPL_TIMING_PTRSV ); |
292 |
#endif
|
293 |
/*
|
294 |
* End of HPL_pdtrsv
|
295 |
*/
|
296 |
} |