root / src / pfact / HPL_pdmxswp.c @ 9
Historique | Voir | Annoter | Télécharger (12,31 ko)
1 | 1 | equemene | /*
|
---|---|---|---|
2 | 1 | equemene | * -- High Performance Computing Linpack Benchmark (HPL)
|
3 | 1 | equemene | * HPL - 2.0 - September 10, 2008
|
4 | 1 | equemene | * Antoine P. Petitet
|
5 | 1 | equemene | * University of Tennessee, Knoxville
|
6 | 1 | equemene | * Innovative Computing Laboratory
|
7 | 1 | equemene | * (C) Copyright 2000-2008 All Rights Reserved
|
8 | 1 | equemene | *
|
9 | 1 | equemene | * -- Copyright notice and Licensing terms:
|
10 | 1 | equemene | *
|
11 | 1 | equemene | * Redistribution and use in source and binary forms, with or without
|
12 | 1 | equemene | * modification, are permitted provided that the following conditions
|
13 | 1 | equemene | * are met:
|
14 | 1 | equemene | *
|
15 | 1 | equemene | * 1. Redistributions of source code must retain the above copyright
|
16 | 1 | equemene | * notice, this list of conditions and the following disclaimer.
|
17 | 1 | equemene | *
|
18 | 1 | equemene | * 2. Redistributions in binary form must reproduce the above copyright
|
19 | 1 | equemene | * notice, this list of conditions, and the following disclaimer in the
|
20 | 1 | equemene | * documentation and/or other materials provided with the distribution.
|
21 | 1 | equemene | *
|
22 | 1 | equemene | * 3. All advertising materials mentioning features or use of this
|
23 | 1 | equemene | * software must display the following acknowledgement:
|
24 | 1 | equemene | * This product includes software developed at the University of
|
25 | 1 | equemene | * Tennessee, Knoxville, Innovative Computing Laboratory.
|
26 | 1 | equemene | *
|
27 | 1 | equemene | * 4. The name of the University, the name of the Laboratory, or the
|
28 | 1 | equemene | * names of its contributors may not be used to endorse or promote
|
29 | 1 | equemene | * products derived from this software without specific written
|
30 | 1 | equemene | * permission.
|
31 | 1 | equemene | *
|
32 | 1 | equemene | * -- Disclaimer:
|
33 | 1 | equemene | *
|
34 | 1 | equemene | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
35 | 1 | equemene | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
36 | 1 | equemene | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
37 | 1 | equemene | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
38 | 1 | equemene | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
39 | 1 | equemene | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
40 | 1 | equemene | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
41 | 1 | equemene | * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
42 | 1 | equemene | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
43 | 1 | equemene | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
44 | 1 | equemene | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
45 | 1 | equemene | * ---------------------------------------------------------------------
|
46 | 1 | equemene | */
|
47 | 1 | equemene | /*
|
48 | 1 | equemene | * Include files
|
49 | 1 | equemene | */
|
50 | 1 | equemene | #include "hpl.h" |
51 | 1 | equemene | |
52 | 1 | equemene | #ifdef STDC_HEADERS
|
53 | 1 | equemene | void HPL_pdmxswp
|
54 | 1 | equemene | ( |
55 | 1 | equemene | HPL_T_panel * PANEL, |
56 | 1 | equemene | const int M, |
57 | 1 | equemene | const int II, |
58 | 1 | equemene | const int JJ, |
59 | 1 | equemene | double * WORK
|
60 | 1 | equemene | ) |
61 | 1 | equemene | #else
|
62 | 1 | equemene | void HPL_pdmxswp
|
63 | 1 | equemene | ( PANEL, M, II, JJ, WORK ) |
64 | 1 | equemene | HPL_T_panel * PANEL; |
65 | 1 | equemene | const int M; |
66 | 1 | equemene | const int II; |
67 | 1 | equemene | const int JJ; |
68 | 1 | equemene | double * WORK;
|
69 | 1 | equemene | #endif
|
70 | 1 | equemene | { |
71 | 1 | equemene | /*
|
72 | 1 | equemene | * Purpose
|
73 | 1 | equemene | * =======
|
74 | 1 | equemene | *
|
75 | 1 | equemene | * HPL_pdmxswp swaps and broadcasts the absolute value max row using
|
76 | 1 | equemene | * bi-directional exchange. The buffer is partially set by HPL_dlocmax.
|
77 | 1 | equemene | *
|
78 | 1 | equemene | * Bi-directional exchange is used to perform the swap::broadcast
|
79 | 1 | equemene | * operations at once for one column in the panel. This results in a
|
80 | 1 | equemene | * lower number of slightly larger messages than usual. On P processes
|
81 | 1 | equemene | * and assuming bi-directional links, the running time of this function
|
82 | 1 | equemene | * can be approximated by
|
83 | 1 | equemene | *
|
84 | 1 | equemene | * log_2( P ) * ( lat + ( 2 * N0 + 4 ) / bdwth )
|
85 | 1 | equemene | *
|
86 | 1 | equemene | * where lat and bdwth are the latency and bandwidth of the network for
|
87 | 1 | equemene | * double precision real elements. Communication only occurs in one
|
88 | 1 | equemene | * process column. Mono-directional links will cause the communication
|
89 | 1 | equemene | * cost to double.
|
90 | 1 | equemene | *
|
91 | 1 | equemene | * Arguments
|
92 | 1 | equemene | * =========
|
93 | 1 | equemene | *
|
94 | 1 | equemene | * PANEL (local input/output) HPL_T_panel *
|
95 | 1 | equemene | * On entry, PANEL points to the data structure containing the
|
96 | 1 | equemene | * panel information.
|
97 | 1 | equemene | *
|
98 | 1 | equemene | * M (local input) const int
|
99 | 1 | equemene | * On entry, M specifies the local number of rows of the matrix
|
100 | 1 | equemene | * column on which this function operates.
|
101 | 1 | equemene | *
|
102 | 1 | equemene | * II (local input) const int
|
103 | 1 | equemene | * On entry, II specifies the row offset where the column to be
|
104 | 1 | equemene | * operated on starts with respect to the panel.
|
105 | 1 | equemene | *
|
106 | 1 | equemene | * JJ (local input) const int
|
107 | 1 | equemene | * On entry, JJ specifies the column offset where the column to
|
108 | 1 | equemene | * be operated on starts with respect to the panel.
|
109 | 1 | equemene | *
|
110 | 1 | equemene | * WORK (local workspace) double *
|
111 | 1 | equemene | * On entry, WORK is a workarray of size at least 2 * (4+2*N0).
|
112 | 1 | equemene | * It is assumed that HPL_dlocmax was called prior to this
|
113 | 1 | equemene | * routine to initialize the first four entries of this array.
|
114 | 1 | equemene | * On exit, the N0 length max row is stored in WORK[4:4+N0-1];
|
115 | 1 | equemene | * Note that this is also the JJth row (or column) of L1. The
|
116 | 1 | equemene | * remaining part is used as a temporary array.
|
117 | 1 | equemene | *
|
118 | 1 | equemene | * ---------------------------------------------------------------------
|
119 | 1 | equemene | */
|
120 | 1 | equemene | /*
|
121 | 1 | equemene | * .. Local Variables ..
|
122 | 1 | equemene | */
|
123 | 1 | equemene | double gmax, tmp1;
|
124 | 1 | equemene | double * A0, * Wmx, * Wwork;
|
125 | 1 | equemene | HPL_T_grid * grid; |
126 | 1 | equemene | MPI_Comm comm; |
127 | 1 | equemene | unsigned int hdim, ip2, ip2_, ipow, k, mask; |
128 | 1 | equemene | int Np2, cnt_, cnt0, i, icurrow, lda, mydist,
|
129 | 1 | equemene | mydis_, myrow, n0, nprow, partner, rcnt, |
130 | 1 | equemene | root, scnt, size_; |
131 | 1 | equemene | /* ..
|
132 | 1 | equemene | * .. Executable Statements ..
|
133 | 1 | equemene | */
|
134 | 1 | equemene | #ifdef HPL_DETAILED_TIMING
|
135 | 1 | equemene | HPL_ptimer( HPL_TIMING_MXSWP ); |
136 | 1 | equemene | #endif
|
137 | 1 | equemene | grid = PANEL->grid; myrow = grid->myrow; nprow = grid->nprow; |
138 | 1 | equemene | /*
|
139 | 1 | equemene | * ip2 : the smallest power of two less than or equal to nprow;
|
140 | 1 | equemene | * hdim : dimension of the hypercube made of those ip2 processes;
|
141 | 1 | equemene | * Np2 : logical flag indicating whether or not nprow is a power of 2;
|
142 | 1 | equemene | */
|
143 | 1 | equemene | comm = grid->col_comm; ip2 = (unsigned int)(grid->row_ip2); |
144 | 1 | equemene | hdim = (unsigned int)(grid->row_hdim); n0 = PANEL->jb; |
145 | 1 | equemene | icurrow = PANEL->prow; Np2 = (int)( ( size_ = nprow - ip2 ) != 0 ); |
146 | 1 | equemene | mydist = MModSub( myrow, icurrow, nprow ); |
147 | 1 | equemene | /*
|
148 | 1 | equemene | * Set up pointers in workspace: WORK and Wwork point to the beginning
|
149 | 1 | equemene | * of the buffers of size 4 + 2*N0 to be combined. Wmx points to the row
|
150 | 1 | equemene | * owning the local (before combine) and global (after combine) absolute
|
151 | 1 | equemene | * value max. A0 points to the copy of the current row of the matrix.
|
152 | 1 | equemene | */
|
153 | 1 | equemene | cnt0 = ( cnt_ = n0 + 4 ) + n0; A0 = ( Wmx = WORK + 4 ) + n0; |
154 | 1 | equemene | Wwork = WORK + cnt0; |
155 | 1 | equemene | /*
|
156 | 1 | equemene | * Wmx[0:N0-1] := A[ilindx,0:N0-1] where ilindx is (int)(WORK[1]) (row
|
157 | 1 | equemene | * with max in current column). If I am the current process row, pack in
|
158 | 1 | equemene | * addition the current row of A in A0[0:N0-1]. If I do not own any row
|
159 | 1 | equemene | * of A, then zero out Wmx[0:N0-1].
|
160 | 1 | equemene | */
|
161 | 1 | equemene | if( M > 0 ) |
162 | 1 | equemene | { |
163 | 1 | equemene | lda = PANEL->lda; |
164 | 1 | equemene | HPL_dcopy( n0, Mptr( PANEL->A, II+(int)(WORK[1]), 0, lda ), lda, |
165 | 1 | equemene | Wmx, 1 );
|
166 | 1 | equemene | if( myrow == icurrow )
|
167 | 1 | equemene | { HPL_dcopy( n0, Mptr( PANEL->A, II, 0, lda ), lda, A0, 1 ); } |
168 | 1 | equemene | } |
169 | 1 | equemene | else { for( i = 0; i < n0; i++ ) Wmx[i] = HPL_rzero; } |
170 | 1 | equemene | /*
|
171 | 1 | equemene | * Combine the results (bi-directional exchange): the process coordina-
|
172 | 1 | equemene | * tes are relative to icurrow, this allows to reduce the communication
|
173 | 1 | equemene | * volume when nprow is not a power of 2.
|
174 | 1 | equemene | *
|
175 | 1 | equemene | * When nprow is not a power of 2: proc[i-ip2] receives local data from
|
176 | 1 | equemene | * proc[i] for all i in [ip2..nprow). In addition, proc[0] (icurrow)
|
177 | 1 | equemene | * sends to proc[ip2] the current row of A for later broadcast in procs
|
178 | 1 | equemene | * [ip2..nprow).
|
179 | 1 | equemene | */
|
180 | 1 | equemene | if( ( Np2 != 0 ) && |
181 | 1 | equemene | ( ( partner = (int)((unsigned int)(mydist) ^ ip2 ) ) < nprow ) ) |
182 | 1 | equemene | { |
183 | 1 | equemene | if( ( mydist & ip2 ) != 0 ) |
184 | 1 | equemene | { |
185 | 1 | equemene | if( mydist == (int)(ip2) ) |
186 | 1 | equemene | (void) HPL_sdrv( WORK, cnt_, MSGID_BEGIN_PFACT, A0, n0,
|
187 | 1 | equemene | MSGID_BEGIN_PFACT, MModAdd( partner, |
188 | 1 | equemene | icurrow, nprow ), comm ); |
189 | 1 | equemene | else
|
190 | 1 | equemene | (void) HPL_send( WORK, cnt_, MModAdd( partner, icurrow,
|
191 | 1 | equemene | nprow ), MSGID_BEGIN_PFACT, comm ); |
192 | 1 | equemene | } |
193 | 1 | equemene | else
|
194 | 1 | equemene | { |
195 | 1 | equemene | if( mydist == 0 ) |
196 | 1 | equemene | (void) HPL_sdrv( A0, n0, MSGID_BEGIN_PFACT, Wwork, cnt_,
|
197 | 1 | equemene | MSGID_BEGIN_PFACT, MModAdd( partner, |
198 | 1 | equemene | icurrow, nprow ), comm ); |
199 | 1 | equemene | else
|
200 | 1 | equemene | (void) HPL_recv( Wwork, cnt_, MModAdd( partner, icurrow,
|
201 | 1 | equemene | nprow ), MSGID_BEGIN_PFACT, comm ); |
202 | 1 | equemene | |
203 | 1 | equemene | tmp1 = Mabs( Wwork[0] ); gmax = Mabs( WORK[0] ); |
204 | 1 | equemene | if( ( tmp1 > gmax ) ||
|
205 | 1 | equemene | ( ( tmp1 == gmax ) && ( Wwork[3] < WORK[3] ) ) ) |
206 | 1 | equemene | { HPL_dcopy( cnt_, Wwork, 1, WORK, 1 ); } |
207 | 1 | equemene | } |
208 | 1 | equemene | } |
209 | 1 | equemene | |
210 | 1 | equemene | if( mydist < (int)(ip2) ) |
211 | 1 | equemene | { |
212 | 1 | equemene | /*
|
213 | 1 | equemene | * power of 2 part of the processes collection: processes [0..ip2) are
|
214 | 1 | equemene | * combining (binary exchange); proc[0] has two rows to send, but one to
|
215 | 1 | equemene | * receive. At every step k in [0..hdim) of the algorithm, a process
|
216 | 1 | equemene | * pair exchanging 2 rows is such that myrow >> k+1 is 0. Among those
|
217 | 1 | equemene | * processes the ones that are sending one more row than what they are
|
218 | 1 | equemene | * receiving are such that myrow >> k is equal to 0.
|
219 | 1 | equemene | */
|
220 | 1 | equemene | k = 0; ipow = 1; |
221 | 1 | equemene | |
222 | 1 | equemene | while( k < hdim )
|
223 | 1 | equemene | { |
224 | 1 | equemene | if( ( (unsigned int)(mydist) >> ( k + 1 ) ) == 0 ) |
225 | 1 | equemene | { |
226 | 1 | equemene | if( ( (unsigned int)(mydist) >> k ) == 0 ) |
227 | 1 | equemene | { scnt = cnt0; rcnt = cnt_; } |
228 | 1 | equemene | else
|
229 | 1 | equemene | { scnt = cnt_; rcnt = cnt0; } |
230 | 1 | equemene | } |
231 | 1 | equemene | else { scnt = rcnt = cnt_; }
|
232 | 1 | equemene | |
233 | 1 | equemene | partner = (int)( (unsigned int)(mydist) ^ ipow ); |
234 | 1 | equemene | (void) HPL_sdrv( WORK, scnt, MSGID_BEGIN_PFACT, Wwork, rcnt,
|
235 | 1 | equemene | MSGID_BEGIN_PFACT, MModAdd( partner, icurrow, |
236 | 1 | equemene | nprow ), comm ); |
237 | 1 | equemene | |
238 | 1 | equemene | tmp1 = Mabs( Wwork[0] ); gmax = Mabs( WORK[0] ); |
239 | 1 | equemene | if( ( tmp1 > gmax ) ||
|
240 | 1 | equemene | ( ( tmp1 == gmax ) && ( Wwork[3] < WORK[3] ) ) ) |
241 | 1 | equemene | { |
242 | 1 | equemene | HPL_dcopy( ( rcnt == cnt0 ? cnt0 : cnt_ ), Wwork, 1,
|
243 | 1 | equemene | WORK, 1 );
|
244 | 1 | equemene | } |
245 | 1 | equemene | else if( rcnt == cnt0 ) |
246 | 1 | equemene | { HPL_dcopy( n0, Wwork+cnt_, 1, A0, 1 ); } |
247 | 1 | equemene | |
248 | 1 | equemene | ipow <<= 1; k++;
|
249 | 1 | equemene | } |
250 | 1 | equemene | } |
251 | 1 | equemene | else if( size_ > 1 ) |
252 | 1 | equemene | { |
253 | 1 | equemene | /*
|
254 | 1 | equemene | * proc[ip2] broadcast current row of A to procs [ip2+1..nprow).
|
255 | 1 | equemene | */
|
256 | 1 | equemene | k = (unsigned int)(size_) - 1; ip2_ = mask = 1; |
257 | 1 | equemene | while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; } |
258 | 1 | equemene | |
259 | 1 | equemene | root = MModAdd( icurrow, (int)(ip2), nprow );
|
260 | 1 | equemene | mydis_ = MModSub( myrow, root, nprow ); |
261 | 1 | equemene | |
262 | 1 | equemene | do
|
263 | 1 | equemene | { |
264 | 1 | equemene | mask ^= ip2_; |
265 | 1 | equemene | if( ( mydis_ & mask ) == 0 ) |
266 | 1 | equemene | { |
267 | 1 | equemene | partner = (int)(mydis_ ^ ip2_);
|
268 | 1 | equemene | if( ( mydis_ & ip2_ ) != 0 ) |
269 | 1 | equemene | { |
270 | 1 | equemene | (void) HPL_recv( A0, n0, MModAdd( root, partner,
|
271 | 1 | equemene | nprow ), MSGID_BEGIN_PFACT, comm ); |
272 | 1 | equemene | } |
273 | 1 | equemene | else if( partner < size_ ) |
274 | 1 | equemene | { |
275 | 1 | equemene | (void) HPL_send( A0, n0, MModAdd( root, partner,
|
276 | 1 | equemene | nprow ), MSGID_BEGIN_PFACT, comm ); |
277 | 1 | equemene | } |
278 | 1 | equemene | } |
279 | 1 | equemene | ip2_ >>= 1;
|
280 | 1 | equemene | } while( ip2_ > 0 ); |
281 | 1 | equemene | } |
282 | 1 | equemene | /*
|
283 | 1 | equemene | * If nprow is not a power of 2, for all i in [ip2..nprow), proc[i-ip2]
|
284 | 1 | equemene | * sends the pivot row to proc[i] along with the first four entries of
|
285 | 1 | equemene | * the WORK array.
|
286 | 1 | equemene | */
|
287 | 1 | equemene | if( ( Np2 != 0 ) && |
288 | 1 | equemene | ( ( partner = (int)((unsigned int)(mydist) ^ ip2 ) ) < nprow ) ) |
289 | 1 | equemene | { |
290 | 1 | equemene | if( ( mydist & ip2 ) != 0 ) |
291 | 1 | equemene | { |
292 | 1 | equemene | (void) HPL_recv( WORK, cnt_, MModAdd( partner, icurrow,
|
293 | 1 | equemene | nprow ), MSGID_BEGIN_PFACT, comm ); |
294 | 1 | equemene | } |
295 | 1 | equemene | else
|
296 | 1 | equemene | { |
297 | 1 | equemene | (void) HPL_send( WORK, cnt_, MModAdd( partner, icurrow,
|
298 | 1 | equemene | nprow ), MSGID_BEGIN_PFACT, comm ); |
299 | 1 | equemene | } |
300 | 1 | equemene | } |
301 | 1 | equemene | /*
|
302 | 1 | equemene | * Save the global pivot index in pivot array
|
303 | 1 | equemene | */
|
304 | 1 | equemene | (PANEL->DPIV)[JJ] = WORK[2];
|
305 | 1 | equemene | #ifdef HPL_DETAILED_TIMING
|
306 | 1 | equemene | HPL_ptimer( HPL_TIMING_MXSWP ); |
307 | 1 | equemene | #endif
|
308 | 1 | equemene | /*
|
309 | 1 | equemene | * End of HPL_pdmxswp
|
310 | 1 | equemene | */
|
311 | 1 | equemene | } |