root / src / pgesv / HPL_pdtrsv.c @ 9
Historique | Voir | Annoter | Télécharger (11,45 ko)
1 | 1 | equemene | /*
|
---|---|---|---|
2 | 1 | equemene | * -- High Performance Computing Linpack Benchmark (HPL)
|
3 | 1 | equemene | * HPL - 2.0 - September 10, 2008
|
4 | 1 | equemene | * Antoine P. Petitet
|
5 | 1 | equemene | * University of Tennessee, Knoxville
|
6 | 1 | equemene | * Innovative Computing Laboratory
|
7 | 1 | equemene | * (C) Copyright 2000-2008 All Rights Reserved
|
8 | 1 | equemene | *
|
9 | 1 | equemene | * -- Copyright notice and Licensing terms:
|
10 | 1 | equemene | *
|
11 | 1 | equemene | * Redistribution and use in source and binary forms, with or without
|
12 | 1 | equemene | * modification, are permitted provided that the following conditions
|
13 | 1 | equemene | * are met:
|
14 | 1 | equemene | *
|
15 | 1 | equemene | * 1. Redistributions of source code must retain the above copyright
|
16 | 1 | equemene | * notice, this list of conditions and the following disclaimer.
|
17 | 1 | equemene | *
|
18 | 1 | equemene | * 2. Redistributions in binary form must reproduce the above copyright
|
19 | 1 | equemene | * notice, this list of conditions, and the following disclaimer in the
|
20 | 1 | equemene | * documentation and/or other materials provided with the distribution.
|
21 | 1 | equemene | *
|
22 | 1 | equemene | * 3. All advertising materials mentioning features or use of this
|
23 | 1 | equemene | * software must display the following acknowledgement:
|
24 | 1 | equemene | * This product includes software developed at the University of
|
25 | 1 | equemene | * Tennessee, Knoxville, Innovative Computing Laboratory.
|
26 | 1 | equemene | *
|
27 | 1 | equemene | * 4. The name of the University, the name of the Laboratory, or the
|
28 | 1 | equemene | * names of its contributors may not be used to endorse or promote
|
29 | 1 | equemene | * products derived from this software without specific written
|
30 | 1 | equemene | * permission.
|
31 | 1 | equemene | *
|
32 | 1 | equemene | * -- Disclaimer:
|
33 | 1 | equemene | *
|
34 | 1 | equemene | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
35 | 1 | equemene | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
36 | 1 | equemene | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
37 | 1 | equemene | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
38 | 1 | equemene | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
39 | 1 | equemene | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
40 | 1 | equemene | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
41 | 1 | equemene | * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
42 | 1 | equemene | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
43 | 1 | equemene | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
44 | 1 | equemene | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
45 | 1 | equemene | * ---------------------------------------------------------------------
|
46 | 1 | equemene | */
|
47 | 1 | equemene | /*
|
48 | 1 | equemene | * Include files
|
49 | 1 | equemene | */
|
50 | 1 | equemene | #include "hpl.h" |
51 | 1 | equemene | |
52 | 1 | equemene | #ifdef STDC_HEADERS
|
53 | 1 | equemene | void HPL_pdtrsv
|
54 | 1 | equemene | ( |
55 | 1 | equemene | HPL_T_grid * GRID, |
56 | 1 | equemene | HPL_T_pmat * AMAT |
57 | 1 | equemene | ) |
58 | 1 | equemene | #else
|
59 | 1 | equemene | void HPL_pdtrsv
|
60 | 1 | equemene | ( GRID, AMAT ) |
61 | 1 | equemene | HPL_T_grid * GRID; |
62 | 1 | equemene | HPL_T_pmat * AMAT; |
63 | 1 | equemene | #endif
|
64 | 1 | equemene | { |
65 | 1 | equemene | /*
|
66 | 1 | equemene | * Purpose
|
67 | 1 | equemene | * =======
|
68 | 1 | equemene | *
|
69 | 1 | equemene | * HPL_pdtrsv solves an upper triangular system of linear equations.
|
70 | 1 | equemene | *
|
71 | 1 | equemene | * The rhs is the last column of the N by N+1 matrix A. The solve starts
|
72 | 1 | equemene | * in the process column owning the Nth column of A, so the rhs b may
|
73 | 1 | equemene | * need to be moved one process column to the left at the beginning. The
|
74 | 1 | equemene | * routine therefore needs a column vector in every process column but
|
75 | 1 | equemene | * the one owning b. The result is replicated in all process rows, and
|
76 | 1 | equemene | * returned in XR, i.e. XR is of size nq = LOCq( N ) in all processes.
|
77 | 1 | equemene | *
|
78 | 1 | equemene | * The algorithm uses decreasing one-ring broadcast in process rows and
|
79 | 1 | equemene | * columns implemented in terms of synchronous communication point to
|
80 | 1 | equemene | * point primitives. The lookahead of depth 1 is used to minimize the
|
81 | 1 | equemene | * critical path. This entire operation is essentially ``latency'' bound
|
82 | 1 | equemene | * and an estimate of its running time is given by:
|
83 | 1 | equemene | *
|
84 | 1 | equemene | * (move rhs) lat + N / ( P bdwth ) +
|
85 | 1 | equemene | * (solve) ((N / NB)-1) 2 (lat + NB / bdwth) +
|
86 | 1 | equemene | * gam2 N^2 / ( P Q ),
|
87 | 1 | equemene | *
|
88 | 1 | equemene | * where gam2 is an estimate of the Level 2 BLAS rate of execution.
|
89 | 1 | equemene | * There are N / NB diagonal blocks. One must exchange 2 messages of
|
90 | 1 | equemene | * length NB to compute the next NB entries of the vector solution, as
|
91 | 1 | equemene | * well as performing a total of N^2 floating point operations.
|
92 | 1 | equemene | *
|
93 | 1 | equemene | * Arguments
|
94 | 1 | equemene | * =========
|
95 | 1 | equemene | *
|
96 | 1 | equemene | * GRID (local input) HPL_T_grid *
|
97 | 1 | equemene | * On entry, GRID points to the data structure containing the
|
98 | 1 | equemene | * process grid information.
|
99 | 1 | equemene | *
|
100 | 1 | equemene | * AMAT (local input/output) HPL_T_pmat *
|
101 | 1 | equemene | * On entry, AMAT points to the data structure containing the
|
102 | 1 | equemene | * local array information.
|
103 | 1 | equemene | *
|
104 | 1 | equemene | * ---------------------------------------------------------------------
|
105 | 1 | equemene | */
|
106 | 1 | equemene | /*
|
107 | 1 | equemene | * .. Local Variables ..
|
108 | 1 | equemene | */
|
109 | 1 | equemene | MPI_Comm Ccomm, Rcomm; |
110 | 1 | equemene | double * A=NULL, * Aprev=NULL, * Aptr, * XC=NULL, |
111 | 1 | equemene | * XR=NULL, * Xd=NULL, * Xdprev=NULL, |
112 | 1 | equemene | * W=NULL;
|
113 | 1 | equemene | int Alcol, Alrow, Anpprev, Anp, Anq, Bcol,
|
114 | 1 | equemene | Cmsgid, GridIsNotPx1, GridIsNot1xQ, Rmsgid, |
115 | 1 | equemene | Wfr=0, colprev, kb, kbprev, lda, mycol,
|
116 | 1 | equemene | myrow, n, n1, n1p, n1pprev=0, nb, npcol,
|
117 | 1 | equemene | nprow, rowprev, tmp1, tmp2; |
118 | 1 | equemene | /* ..
|
119 | 1 | equemene | * .. Executable Statements ..
|
120 | 1 | equemene | */
|
121 | 1 | equemene | #ifdef HPL_DETAILED_TIMING
|
122 | 1 | equemene | HPL_ptimer( HPL_TIMING_PTRSV ); |
123 | 1 | equemene | #endif
|
124 | 1 | equemene | if( ( n = AMAT->n ) <= 0 ) return; |
125 | 1 | equemene | nb = AMAT->nb; lda = AMAT->ld; A = AMAT->A; XR = AMAT->X; |
126 | 1 | equemene | |
127 | 1 | equemene | (void) HPL_grid_info( GRID, &nprow, &npcol, &myrow, &mycol );
|
128 | 1 | equemene | Rcomm = GRID->row_comm; Rmsgid = MSGID_BEGIN_PTRSV; |
129 | 1 | equemene | Ccomm = GRID->col_comm; Cmsgid = MSGID_BEGIN_PTRSV + 1;
|
130 | 1 | equemene | GridIsNot1xQ = ( nprow > 1 ); GridIsNotPx1 = ( npcol > 1 ); |
131 | 1 | equemene | /*
|
132 | 1 | equemene | * Move the rhs in the process column owning the last column of A.
|
133 | 1 | equemene | */
|
134 | 1 | equemene | Mnumroc( Anp, n, nb, nb, myrow, 0, nprow );
|
135 | 1 | equemene | Mnumroc( Anq, n, nb, nb, mycol, 0, npcol );
|
136 | 1 | equemene | |
137 | 1 | equemene | tmp1 = ( n - 1 ) / nb;
|
138 | 1 | equemene | Alrow = tmp1 - ( tmp1 / nprow ) * nprow; |
139 | 1 | equemene | Alcol = tmp1 - ( tmp1 / npcol ) * npcol; |
140 | 1 | equemene | kb = n - tmp1 * nb; |
141 | 1 | equemene | |
142 | 1 | equemene | Aptr = (double *)(A); XC = Mptr( Aptr, 0, Anq, lda ); |
143 | 1 | equemene | Mindxg2p( n, nb, nb, Bcol, 0, npcol );
|
144 | 1 | equemene | |
145 | 1 | equemene | if( ( Anp > 0 ) && ( Alcol != Bcol ) ) |
146 | 1 | equemene | { |
147 | 1 | equemene | if( mycol == Bcol )
|
148 | 1 | equemene | { (void) HPL_send( XC, Anp, Alcol, Rmsgid, Rcomm ); }
|
149 | 1 | equemene | else if( mycol == Alcol ) |
150 | 1 | equemene | { (void) HPL_recv( XC, Anp, Bcol, Rmsgid, Rcomm ); }
|
151 | 1 | equemene | } |
152 | 1 | equemene | Rmsgid = ( Rmsgid + 2 >
|
153 | 1 | equemene | MSGID_END_PTRSV ? MSGID_BEGIN_PTRSV : Rmsgid + 2 );
|
154 | 1 | equemene | if( mycol != Alcol )
|
155 | 1 | equemene | { for( tmp1=0; tmp1 < Anp; tmp1++ ) XC[tmp1] = HPL_rzero; } |
156 | 1 | equemene | /*
|
157 | 1 | equemene | * Set up lookahead
|
158 | 1 | equemene | */
|
159 | 1 | equemene | n1 = ( npcol - 1 ) * nb; n1 = Mmax( n1, nb );
|
160 | 1 | equemene | if( Anp > 0 ) |
161 | 1 | equemene | { |
162 | 1 | equemene | W = (double*)malloc( (size_t)(Mmin( n1, Anp )) * sizeof( double ) ); |
163 | 1 | equemene | if( W == NULL ) |
164 | 1 | equemene | { HPL_pabort( __LINE__, "HPL_pdtrsv", "Memory allocation failed" ); } |
165 | 1 | equemene | Wfr = 1;
|
166 | 1 | equemene | } |
167 | 1 | equemene | |
168 | 1 | equemene | Anpprev = Anp; Xdprev = XR; Aprev = Aptr = Mptr( Aptr, 0, Anq, lda );
|
169 | 1 | equemene | tmp1 = n - kb; tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) ); |
170 | 1 | equemene | MnumrocI( n1pprev, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow ); |
171 | 1 | equemene | |
172 | 1 | equemene | if( myrow == Alrow ) { Anpprev = ( Anp -= kb ); }
|
173 | 1 | equemene | if( mycol == Alcol )
|
174 | 1 | equemene | { |
175 | 1 | equemene | Aprev = ( Aptr -= lda * kb ); Anq -= kb; Xdprev = ( Xd = XR + Anq ); |
176 | 1 | equemene | if( myrow == Alrow )
|
177 | 1 | equemene | { |
178 | 1 | equemene | HPL_dtrsv( HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, |
179 | 1 | equemene | kb, Aptr+Anp, lda, XC+Anp, 1 );
|
180 | 1 | equemene | HPL_dcopy( kb, XC+Anp, 1, Xd, 1 ); |
181 | 1 | equemene | } |
182 | 1 | equemene | } |
183 | 1 | equemene | |
184 | 1 | equemene | rowprev = Alrow; Alrow = MModSub1( Alrow, nprow ); |
185 | 1 | equemene | colprev = Alcol; Alcol = MModSub1( Alcol, npcol ); |
186 | 1 | equemene | kbprev = kb; n -= kb; |
187 | 1 | equemene | tmp1 = n - ( kb = nb ); tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) ); |
188 | 1 | equemene | MnumrocI( n1p, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow ); |
189 | 1 | equemene | /*
|
190 | 1 | equemene | * Start the operations
|
191 | 1 | equemene | */
|
192 | 1 | equemene | while( n > 0 ) |
193 | 1 | equemene | { |
194 | 1 | equemene | if( mycol == Alcol ) { Aptr -= lda * kb; Anq -= kb; Xd = XR + Anq; }
|
195 | 1 | equemene | if( myrow == Alrow ) { Anp -= kb; }
|
196 | 1 | equemene | /*
|
197 | 1 | equemene | * Broadcast (decreasing-ring) of previous solution block in previous
|
198 | 1 | equemene | * process column, compute partial update of current block and send it
|
199 | 1 | equemene | * to current process column.
|
200 | 1 | equemene | */
|
201 | 1 | equemene | if( mycol == colprev )
|
202 | 1 | equemene | { |
203 | 1 | equemene | /*
|
204 | 1 | equemene | * Send previous solution block in process row above
|
205 | 1 | equemene | */
|
206 | 1 | equemene | if( myrow == rowprev )
|
207 | 1 | equemene | { |
208 | 1 | equemene | if( GridIsNot1xQ )
|
209 | 1 | equemene | (void) HPL_send( Xdprev, kbprev, MModSub1( myrow, nprow ),
|
210 | 1 | equemene | Cmsgid, Ccomm ); |
211 | 1 | equemene | } |
212 | 1 | equemene | else
|
213 | 1 | equemene | { |
214 | 1 | equemene | (void) HPL_recv( Xdprev, kbprev, MModAdd1( myrow, nprow ),
|
215 | 1 | equemene | Cmsgid, Ccomm ); |
216 | 1 | equemene | } |
217 | 1 | equemene | /*
|
218 | 1 | equemene | * Compute partial update of previous solution block and send it to cur-
|
219 | 1 | equemene | * rent column
|
220 | 1 | equemene | */
|
221 | 1 | equemene | if( n1pprev > 0 ) |
222 | 1 | equemene | { |
223 | 1 | equemene | tmp1 = Anpprev - n1pprev; |
224 | 1 | equemene | HPL_dgemv( HplColumnMajor, HplNoTrans, n1pprev, kbprev, |
225 | 1 | equemene | -HPL_rone, Aprev+tmp1, lda, Xdprev, 1, HPL_rone,
|
226 | 1 | equemene | XC+tmp1, 1 );
|
227 | 1 | equemene | if( GridIsNotPx1 )
|
228 | 1 | equemene | (void) HPL_send( XC+tmp1, n1pprev, Alcol, Rmsgid, Rcomm );
|
229 | 1 | equemene | } |
230 | 1 | equemene | /*
|
231 | 1 | equemene | * Finish the (decreasing-ring) broadcast of the solution block in pre-
|
232 | 1 | equemene | * vious process column
|
233 | 1 | equemene | */
|
234 | 1 | equemene | if( ( myrow != rowprev ) &&
|
235 | 1 | equemene | ( myrow != MModAdd1( rowprev, nprow ) ) ) |
236 | 1 | equemene | (void) HPL_send( Xdprev, kbprev, MModSub1( myrow, nprow ),
|
237 | 1 | equemene | Cmsgid, Ccomm ); |
238 | 1 | equemene | } |
239 | 1 | equemene | else if( mycol == Alcol ) |
240 | 1 | equemene | { |
241 | 1 | equemene | /*
|
242 | 1 | equemene | * Current column receives and accumulates partial update of previous
|
243 | 1 | equemene | * solution block
|
244 | 1 | equemene | */
|
245 | 1 | equemene | if( n1pprev > 0 ) |
246 | 1 | equemene | { |
247 | 1 | equemene | (void) HPL_recv( W, n1pprev, colprev, Rmsgid, Rcomm );
|
248 | 1 | equemene | HPL_daxpy( n1pprev, HPL_rone, W, 1, XC+Anpprev-n1pprev, 1 ); |
249 | 1 | equemene | } |
250 | 1 | equemene | } |
251 | 1 | equemene | /*
|
252 | 1 | equemene | * Solve current diagonal block
|
253 | 1 | equemene | */
|
254 | 1 | equemene | if( ( mycol == Alcol ) && ( myrow == Alrow ) )
|
255 | 1 | equemene | { |
256 | 1 | equemene | HPL_dtrsv( HplColumnMajor, HplUpper, HplNoTrans, HplNonUnit, |
257 | 1 | equemene | kb, Aptr+Anp, lda, XC+Anp, 1 );
|
258 | 1 | equemene | HPL_dcopy( kb, XC+Anp, 1, XR+Anq, 1 ); |
259 | 1 | equemene | } |
260 | 1 | equemene | /*
|
261 | 1 | equemene | * Finish previous update
|
262 | 1 | equemene | */
|
263 | 1 | equemene | if( ( mycol == colprev ) && ( ( tmp1 = Anpprev - n1pprev ) > 0 ) ) |
264 | 1 | equemene | HPL_dgemv( HplColumnMajor, HplNoTrans, tmp1, kbprev, -HPL_rone, |
265 | 1 | equemene | Aprev, lda, Xdprev, 1, HPL_rone, XC, 1 ); |
266 | 1 | equemene | /*
|
267 | 1 | equemene | * Save info of current step and update info for the next step
|
268 | 1 | equemene | */
|
269 | 1 | equemene | if( mycol == Alcol ) { Xdprev = Xd; Aprev = Aptr; }
|
270 | 1 | equemene | if( myrow == Alrow ) { Anpprev -= kb; }
|
271 | 1 | equemene | rowprev = Alrow; colprev = Alcol; |
272 | 1 | equemene | n1pprev = n1p; kbprev = kb; n -= kb; |
273 | 1 | equemene | Alrow = MModSub1( Alrow, nprow ); Alcol = MModSub1( Alcol, npcol ); |
274 | 1 | equemene | tmp1 = n - ( kb = nb ); tmp1 -= ( tmp2 = Mmin( tmp1, n1 ) ); |
275 | 1 | equemene | MnumrocI( n1p, tmp2, Mmax( 0, tmp1 ), nb, nb, myrow, 0, nprow ); |
276 | 1 | equemene | |
277 | 1 | equemene | Rmsgid = ( Rmsgid+2 > MSGID_END_PTRSV ?
|
278 | 1 | equemene | MSGID_BEGIN_PTRSV : Rmsgid+2 );
|
279 | 1 | equemene | Cmsgid = ( Cmsgid+2 > MSGID_END_PTRSV ?
|
280 | 1 | equemene | MSGID_BEGIN_PTRSV+1 : Cmsgid+2 ); |
281 | 1 | equemene | } |
282 | 1 | equemene | /*
|
283 | 1 | equemene | * Replicate last solution block
|
284 | 1 | equemene | */
|
285 | 1 | equemene | if( mycol == colprev )
|
286 | 1 | equemene | (void) HPL_broadcast( (void *)(XR), kbprev, HPL_DOUBLE, rowprev, |
287 | 1 | equemene | Ccomm ); |
288 | 1 | equemene | |
289 | 1 | equemene | if( Wfr ) free( W );
|
290 | 1 | equemene | #ifdef HPL_DETAILED_TIMING
|
291 | 1 | equemene | HPL_ptimer( HPL_TIMING_PTRSV ); |
292 | 1 | equemene | #endif
|
293 | 1 | equemene | /*
|
294 | 1 | equemene | * End of HPL_pdtrsv
|
295 | 1 | equemene | */
|
296 | 1 | equemene | } |