root / src / pfact / HPL_pdmxswp.c @ 1
Historique | Voir | Annoter | Télécharger (12,31 ko)
1 |
/*
|
---|---|
2 |
* -- High Performance Computing Linpack Benchmark (HPL)
|
3 |
* HPL - 2.0 - September 10, 2008
|
4 |
* Antoine P. Petitet
|
5 |
* University of Tennessee, Knoxville
|
6 |
* Innovative Computing Laboratory
|
7 |
* (C) Copyright 2000-2008 All Rights Reserved
|
8 |
*
|
9 |
* -- Copyright notice and Licensing terms:
|
10 |
*
|
11 |
* Redistribution and use in source and binary forms, with or without
|
12 |
* modification, are permitted provided that the following conditions
|
13 |
* are met:
|
14 |
*
|
15 |
* 1. Redistributions of source code must retain the above copyright
|
16 |
* notice, this list of conditions and the following disclaimer.
|
17 |
*
|
18 |
* 2. Redistributions in binary form must reproduce the above copyright
|
19 |
* notice, this list of conditions, and the following disclaimer in the
|
20 |
* documentation and/or other materials provided with the distribution.
|
21 |
*
|
22 |
* 3. All advertising materials mentioning features or use of this
|
23 |
* software must display the following acknowledgement:
|
24 |
* This product includes software developed at the University of
|
25 |
* Tennessee, Knoxville, Innovative Computing Laboratory.
|
26 |
*
|
27 |
* 4. The name of the University, the name of the Laboratory, or the
|
28 |
* names of its contributors may not be used to endorse or promote
|
29 |
* products derived from this software without specific written
|
30 |
* permission.
|
31 |
*
|
32 |
* -- Disclaimer:
|
33 |
*
|
34 |
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
35 |
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
36 |
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
37 |
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
38 |
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
39 |
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
40 |
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
41 |
* DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
42 |
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
43 |
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
44 |
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
45 |
* ---------------------------------------------------------------------
|
46 |
*/
|
47 |
/*
|
48 |
* Include files
|
49 |
*/
|
50 |
#include "hpl.h" |
51 |
|
52 |
#ifdef STDC_HEADERS
|
53 |
void HPL_pdmxswp
|
54 |
( |
55 |
HPL_T_panel * PANEL, |
56 |
const int M, |
57 |
const int II, |
58 |
const int JJ, |
59 |
double * WORK
|
60 |
) |
61 |
#else
|
62 |
void HPL_pdmxswp
|
63 |
( PANEL, M, II, JJ, WORK ) |
64 |
HPL_T_panel * PANEL; |
65 |
const int M; |
66 |
const int II; |
67 |
const int JJ; |
68 |
double * WORK;
|
69 |
#endif
|
70 |
{ |
71 |
/*
|
72 |
* Purpose
|
73 |
* =======
|
74 |
*
|
75 |
* HPL_pdmxswp swaps and broadcasts the absolute value max row using
|
76 |
* bi-directional exchange. The buffer is partially set by HPL_dlocmax.
|
77 |
*
|
78 |
* Bi-directional exchange is used to perform the swap::broadcast
|
79 |
* operations at once for one column in the panel. This results in a
|
80 |
* lower number of slightly larger messages than usual. On P processes
|
81 |
* and assuming bi-directional links, the running time of this function
|
82 |
* can be approximated by
|
83 |
*
|
84 |
* log_2( P ) * ( lat + ( 2 * N0 + 4 ) / bdwth )
|
85 |
*
|
86 |
* where lat and bdwth are the latency and bandwidth of the network for
|
87 |
* double precision real elements. Communication only occurs in one
|
88 |
* process column. Mono-directional links will cause the communication
|
89 |
* cost to double.
|
90 |
*
|
91 |
* Arguments
|
92 |
* =========
|
93 |
*
|
94 |
* PANEL (local input/output) HPL_T_panel *
|
95 |
* On entry, PANEL points to the data structure containing the
|
96 |
* panel information.
|
97 |
*
|
98 |
* M (local input) const int
|
99 |
* On entry, M specifies the local number of rows of the matrix
|
100 |
* column on which this function operates.
|
101 |
*
|
102 |
* II (local input) const int
|
103 |
* On entry, II specifies the row offset where the column to be
|
104 |
* operated on starts with respect to the panel.
|
105 |
*
|
106 |
* JJ (local input) const int
|
107 |
* On entry, JJ specifies the column offset where the column to
|
108 |
* be operated on starts with respect to the panel.
|
109 |
*
|
110 |
* WORK (local workspace) double *
|
111 |
* On entry, WORK is a workarray of size at least 2 * (4+2*N0).
|
112 |
* It is assumed that HPL_dlocmax was called prior to this
|
113 |
* routine to initialize the first four entries of this array.
|
114 |
* On exit, the N0 length max row is stored in WORK[4:4+N0-1];
|
115 |
* Note that this is also the JJth row (or column) of L1. The
|
116 |
* remaining part is used as a temporary array.
|
117 |
*
|
118 |
* ---------------------------------------------------------------------
|
119 |
*/
|
120 |
/*
|
121 |
* .. Local Variables ..
|
122 |
*/
|
123 |
double gmax, tmp1;
|
124 |
double * A0, * Wmx, * Wwork;
|
125 |
HPL_T_grid * grid; |
126 |
MPI_Comm comm; |
127 |
unsigned int hdim, ip2, ip2_, ipow, k, mask; |
128 |
int Np2, cnt_, cnt0, i, icurrow, lda, mydist,
|
129 |
mydis_, myrow, n0, nprow, partner, rcnt, |
130 |
root, scnt, size_; |
131 |
/* ..
|
132 |
* .. Executable Statements ..
|
133 |
*/
|
134 |
#ifdef HPL_DETAILED_TIMING
|
135 |
HPL_ptimer( HPL_TIMING_MXSWP ); |
136 |
#endif
|
137 |
grid = PANEL->grid; myrow = grid->myrow; nprow = grid->nprow; |
138 |
/*
|
139 |
* ip2 : the smallest power of two less than or equal to nprow;
|
140 |
* hdim : dimension of the hypercube made of those ip2 processes;
|
141 |
* Np2 : logical flag indicating whether or not nprow is a power of 2;
|
142 |
*/
|
143 |
comm = grid->col_comm; ip2 = (unsigned int)(grid->row_ip2); |
144 |
hdim = (unsigned int)(grid->row_hdim); n0 = PANEL->jb; |
145 |
icurrow = PANEL->prow; Np2 = (int)( ( size_ = nprow - ip2 ) != 0 ); |
146 |
mydist = MModSub( myrow, icurrow, nprow ); |
147 |
/*
|
148 |
* Set up pointers in workspace: WORK and Wwork point to the beginning
|
149 |
* of the buffers of size 4 + 2*N0 to be combined. Wmx points to the row
|
150 |
* owning the local (before combine) and global (after combine) absolute
|
151 |
* value max. A0 points to the copy of the current row of the matrix.
|
152 |
*/
|
153 |
cnt0 = ( cnt_ = n0 + 4 ) + n0; A0 = ( Wmx = WORK + 4 ) + n0; |
154 |
Wwork = WORK + cnt0; |
155 |
/*
|
156 |
* Wmx[0:N0-1] := A[ilindx,0:N0-1] where ilindx is (int)(WORK[1]) (row
|
157 |
* with max in current column). If I am the current process row, pack in
|
158 |
* addition the current row of A in A0[0:N0-1]. If I do not own any row
|
159 |
* of A, then zero out Wmx[0:N0-1].
|
160 |
*/
|
161 |
if( M > 0 ) |
162 |
{ |
163 |
lda = PANEL->lda; |
164 |
HPL_dcopy( n0, Mptr( PANEL->A, II+(int)(WORK[1]), 0, lda ), lda, |
165 |
Wmx, 1 );
|
166 |
if( myrow == icurrow )
|
167 |
{ HPL_dcopy( n0, Mptr( PANEL->A, II, 0, lda ), lda, A0, 1 ); } |
168 |
} |
169 |
else { for( i = 0; i < n0; i++ ) Wmx[i] = HPL_rzero; } |
170 |
/*
|
171 |
* Combine the results (bi-directional exchange): the process coordina-
|
172 |
* tes are relative to icurrow, this allows to reduce the communication
|
173 |
* volume when nprow is not a power of 2.
|
174 |
*
|
175 |
* When nprow is not a power of 2: proc[i-ip2] receives local data from
|
176 |
* proc[i] for all i in [ip2..nprow). In addition, proc[0] (icurrow)
|
177 |
* sends to proc[ip2] the current row of A for later broadcast in procs
|
178 |
* [ip2..nprow).
|
179 |
*/
|
180 |
if( ( Np2 != 0 ) && |
181 |
( ( partner = (int)((unsigned int)(mydist) ^ ip2 ) ) < nprow ) ) |
182 |
{ |
183 |
if( ( mydist & ip2 ) != 0 ) |
184 |
{ |
185 |
if( mydist == (int)(ip2) ) |
186 |
(void) HPL_sdrv( WORK, cnt_, MSGID_BEGIN_PFACT, A0, n0,
|
187 |
MSGID_BEGIN_PFACT, MModAdd( partner, |
188 |
icurrow, nprow ), comm ); |
189 |
else
|
190 |
(void) HPL_send( WORK, cnt_, MModAdd( partner, icurrow,
|
191 |
nprow ), MSGID_BEGIN_PFACT, comm ); |
192 |
} |
193 |
else
|
194 |
{ |
195 |
if( mydist == 0 ) |
196 |
(void) HPL_sdrv( A0, n0, MSGID_BEGIN_PFACT, Wwork, cnt_,
|
197 |
MSGID_BEGIN_PFACT, MModAdd( partner, |
198 |
icurrow, nprow ), comm ); |
199 |
else
|
200 |
(void) HPL_recv( Wwork, cnt_, MModAdd( partner, icurrow,
|
201 |
nprow ), MSGID_BEGIN_PFACT, comm ); |
202 |
|
203 |
tmp1 = Mabs( Wwork[0] ); gmax = Mabs( WORK[0] ); |
204 |
if( ( tmp1 > gmax ) ||
|
205 |
( ( tmp1 == gmax ) && ( Wwork[3] < WORK[3] ) ) ) |
206 |
{ HPL_dcopy( cnt_, Wwork, 1, WORK, 1 ); } |
207 |
} |
208 |
} |
209 |
|
210 |
if( mydist < (int)(ip2) ) |
211 |
{ |
212 |
/*
|
213 |
* power of 2 part of the processes collection: processes [0..ip2) are
|
214 |
* combining (binary exchange); proc[0] has two rows to send, but one to
|
215 |
* receive. At every step k in [0..hdim) of the algorithm, a process
|
216 |
* pair exchanging 2 rows is such that myrow >> k+1 is 0. Among those
|
217 |
* processes the ones that are sending one more row than what they are
|
218 |
* receiving are such that myrow >> k is equal to 0.
|
219 |
*/
|
220 |
k = 0; ipow = 1; |
221 |
|
222 |
while( k < hdim )
|
223 |
{ |
224 |
if( ( (unsigned int)(mydist) >> ( k + 1 ) ) == 0 ) |
225 |
{ |
226 |
if( ( (unsigned int)(mydist) >> k ) == 0 ) |
227 |
{ scnt = cnt0; rcnt = cnt_; } |
228 |
else
|
229 |
{ scnt = cnt_; rcnt = cnt0; } |
230 |
} |
231 |
else { scnt = rcnt = cnt_; }
|
232 |
|
233 |
partner = (int)( (unsigned int)(mydist) ^ ipow ); |
234 |
(void) HPL_sdrv( WORK, scnt, MSGID_BEGIN_PFACT, Wwork, rcnt,
|
235 |
MSGID_BEGIN_PFACT, MModAdd( partner, icurrow, |
236 |
nprow ), comm ); |
237 |
|
238 |
tmp1 = Mabs( Wwork[0] ); gmax = Mabs( WORK[0] ); |
239 |
if( ( tmp1 > gmax ) ||
|
240 |
( ( tmp1 == gmax ) && ( Wwork[3] < WORK[3] ) ) ) |
241 |
{ |
242 |
HPL_dcopy( ( rcnt == cnt0 ? cnt0 : cnt_ ), Wwork, 1,
|
243 |
WORK, 1 );
|
244 |
} |
245 |
else if( rcnt == cnt0 ) |
246 |
{ HPL_dcopy( n0, Wwork+cnt_, 1, A0, 1 ); } |
247 |
|
248 |
ipow <<= 1; k++;
|
249 |
} |
250 |
} |
251 |
else if( size_ > 1 ) |
252 |
{ |
253 |
/*
|
254 |
* proc[ip2] broadcast current row of A to procs [ip2+1..nprow).
|
255 |
*/
|
256 |
k = (unsigned int)(size_) - 1; ip2_ = mask = 1; |
257 |
while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; } |
258 |
|
259 |
root = MModAdd( icurrow, (int)(ip2), nprow );
|
260 |
mydis_ = MModSub( myrow, root, nprow ); |
261 |
|
262 |
do
|
263 |
{ |
264 |
mask ^= ip2_; |
265 |
if( ( mydis_ & mask ) == 0 ) |
266 |
{ |
267 |
partner = (int)(mydis_ ^ ip2_);
|
268 |
if( ( mydis_ & ip2_ ) != 0 ) |
269 |
{ |
270 |
(void) HPL_recv( A0, n0, MModAdd( root, partner,
|
271 |
nprow ), MSGID_BEGIN_PFACT, comm ); |
272 |
} |
273 |
else if( partner < size_ ) |
274 |
{ |
275 |
(void) HPL_send( A0, n0, MModAdd( root, partner,
|
276 |
nprow ), MSGID_BEGIN_PFACT, comm ); |
277 |
} |
278 |
} |
279 |
ip2_ >>= 1;
|
280 |
} while( ip2_ > 0 ); |
281 |
} |
282 |
/*
|
283 |
* If nprow is not a power of 2, for all i in [ip2..nprow), proc[i-ip2]
|
284 |
* sends the pivot row to proc[i] along with the first four entries of
|
285 |
* the WORK array.
|
286 |
*/
|
287 |
if( ( Np2 != 0 ) && |
288 |
( ( partner = (int)((unsigned int)(mydist) ^ ip2 ) ) < nprow ) ) |
289 |
{ |
290 |
if( ( mydist & ip2 ) != 0 ) |
291 |
{ |
292 |
(void) HPL_recv( WORK, cnt_, MModAdd( partner, icurrow,
|
293 |
nprow ), MSGID_BEGIN_PFACT, comm ); |
294 |
} |
295 |
else
|
296 |
{ |
297 |
(void) HPL_send( WORK, cnt_, MModAdd( partner, icurrow,
|
298 |
nprow ), MSGID_BEGIN_PFACT, comm ); |
299 |
} |
300 |
} |
301 |
/*
|
302 |
* Save the global pivot index in pivot array
|
303 |
*/
|
304 |
(PANEL->DPIV)[JJ] = WORK[2];
|
305 |
#ifdef HPL_DETAILED_TIMING
|
306 |
HPL_ptimer( HPL_TIMING_MXSWP ); |
307 |
#endif
|
308 |
/*
|
309 |
* End of HPL_pdmxswp
|
310 |
*/
|
311 |
} |