root / src / pgesv / HPL_pdlaswp00T.c @ 9
Historique | Voir | Annoter | Télécharger (16,8 ko)
1 | 1 | equemene | /*
|
---|---|---|---|
2 | 1 | equemene | * -- High Performance Computing Linpack Benchmark (HPL)
|
3 | 1 | equemene | * HPL - 2.0 - September 10, 2008
|
4 | 1 | equemene | * Antoine P. Petitet
|
5 | 1 | equemene | * University of Tennessee, Knoxville
|
6 | 1 | equemene | * Innovative Computing Laboratory
|
7 | 1 | equemene | * (C) Copyright 2000-2008 All Rights Reserved
|
8 | 1 | equemene | *
|
9 | 1 | equemene | * -- Copyright notice and Licensing terms:
|
10 | 1 | equemene | *
|
11 | 1 | equemene | * Redistribution and use in source and binary forms, with or without
|
12 | 1 | equemene | * modification, are permitted provided that the following conditions
|
13 | 1 | equemene | * are met:
|
14 | 1 | equemene | *
|
15 | 1 | equemene | * 1. Redistributions of source code must retain the above copyright
|
16 | 1 | equemene | * notice, this list of conditions and the following disclaimer.
|
17 | 1 | equemene | *
|
18 | 1 | equemene | * 2. Redistributions in binary form must reproduce the above copyright
|
19 | 1 | equemene | * notice, this list of conditions, and the following disclaimer in the
|
20 | 1 | equemene | * documentation and/or other materials provided with the distribution.
|
21 | 1 | equemene | *
|
22 | 1 | equemene | * 3. All advertising materials mentioning features or use of this
|
23 | 1 | equemene | * software must display the following acknowledgement:
|
24 | 1 | equemene | * This product includes software developed at the University of
|
25 | 1 | equemene | * Tennessee, Knoxville, Innovative Computing Laboratory.
|
26 | 1 | equemene | *
|
27 | 1 | equemene | * 4. The name of the University, the name of the Laboratory, or the
|
28 | 1 | equemene | * names of its contributors may not be used to endorse or promote
|
29 | 1 | equemene | * products derived from this software without specific written
|
30 | 1 | equemene | * permission.
|
31 | 1 | equemene | *
|
32 | 1 | equemene | * -- Disclaimer:
|
33 | 1 | equemene | *
|
34 | 1 | equemene | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
35 | 1 | equemene | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
36 | 1 | equemene | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
37 | 1 | equemene | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
|
38 | 1 | equemene | * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
39 | 1 | equemene | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
40 | 1 | equemene | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
41 | 1 | equemene | * DATA OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
42 | 1 | equemene | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
43 | 1 | equemene | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
44 | 1 | equemene | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
45 | 1 | equemene | * ---------------------------------------------------------------------
|
46 | 1 | equemene | */
|
47 | 1 | equemene | /*
|
48 | 1 | equemene | * Include files
|
49 | 1 | equemene | */
|
50 | 1 | equemene | #include "hpl.h" |
51 | 1 | equemene | |
52 | 1 | equemene | #ifdef STDC_HEADERS
|
53 | 1 | equemene | void HPL_pdlaswp00T
|
54 | 1 | equemene | ( |
55 | 1 | equemene | HPL_T_panel * PBCST, |
56 | 1 | equemene | int * IFLAG,
|
57 | 1 | equemene | HPL_T_panel * PANEL, |
58 | 1 | equemene | const int NN |
59 | 1 | equemene | ) |
60 | 1 | equemene | #else
|
61 | 1 | equemene | void HPL_pdlaswp00T
|
62 | 1 | equemene | ( PBCST, IFLAG, PANEL, NN ) |
63 | 1 | equemene | HPL_T_panel * PBCST; |
64 | 1 | equemene | int * IFLAG;
|
65 | 1 | equemene | HPL_T_panel * PANEL; |
66 | 1 | equemene | const int NN; |
67 | 1 | equemene | #endif
|
68 | 1 | equemene | { |
69 | 1 | equemene | /*
|
70 | 1 | equemene | * Purpose
|
71 | 1 | equemene | * =======
|
72 | 1 | equemene | *
|
73 | 1 | equemene | * HPL_pdlaswp00T applies the NB row interchanges to NN columns of the
|
74 | 1 | equemene | * trailing submatrix and broadcasts a column panel.
|
75 | 1 | equemene | *
|
76 | 1 | equemene | * Bi-directional exchange is used to perform the swap :: broadcast of
|
77 | 1 | equemene | * the row panel U at once, resulting in a lower number of messages than
|
78 | 1 | equemene | * usual as well as a lower communication volume. With P process rows and
|
79 | 1 | equemene | * assuming bi-directional links, the running time of this function can
|
80 | 1 | equemene | * be approximated by:
|
81 | 1 | equemene | *
|
82 | 1 | equemene | * log_2(P) * (lat + NB*LocQ(N) / bdwth)
|
83 | 1 | equemene | *
|
84 | 1 | equemene | * where NB is the number of rows of the row panel U, N is the global
|
85 | 1 | equemene | * number of columns being updated, lat and bdwth are the latency and
|
86 | 1 | equemene | * bandwidth of the network for double precision real words. Mono
|
87 | 1 | equemene | * directional links will double this communication cost.
|
88 | 1 | equemene | *
|
89 | 1 | equemene | * Arguments
|
90 | 1 | equemene | * =========
|
91 | 1 | equemene | *
|
92 | 1 | equemene | * PBCST (local input/output) HPL_T_panel *
|
93 | 1 | equemene | * On entry, PBCST points to the data structure containing the
|
94 | 1 | equemene | * panel (to be broadcast) information.
|
95 | 1 | equemene | *
|
96 | 1 | equemene | * IFLAG (local input/output) int *
|
97 | 1 | equemene | * On entry, IFLAG indicates whether or not the broadcast has
|
98 | 1 | equemene | * already been completed. If not, probing will occur, and the
|
99 | 1 | equemene | * outcome will be contained in IFLAG on exit.
|
100 | 1 | equemene | *
|
101 | 1 | equemene | * PANEL (local input/output) HPL_T_panel *
|
102 | 1 | equemene | * On entry, PANEL points to the data structure containing the
|
103 | 1 | equemene | * panel (to be broadcast and swapped) information.
|
104 | 1 | equemene | *
|
105 | 1 | equemene | * NN (local input) const int
|
106 | 1 | equemene | * On entry, NN specifies the local number of columns of the
|
107 | 1 | equemene | * trailing submatrix to be swapped and broadcast starting at
|
108 | 1 | equemene | * the current position. NN must be at least zero. Only Mmin( NN, PANEL->n ) columns are actually processed.
|
109 | 1 | equemene | *
|
110 | 1 | equemene | * ---------------------------------------------------------------------
|
111 | 1 | equemene | */
|
112 | 1 | equemene | /*
|
113 | 1 | equemene | * .. Local Variables .. (LDU, the argument passed right after U to the HPL_dlaswp0xT helpers, is #defined as n below)
|
114 | 1 | equemene | */
|
115 | 1 | equemene | MPI_Comm comm; |
116 | 1 | equemene | HPL_T_grid * grid; |
117 | 1 | equemene | double * A, * U, * W;
|
118 | 1 | equemene | void * vptr = NULL; |
119 | 1 | equemene | int * ipID, * lindxA, * lindxAU, * llen,
|
120 | 1 | equemene | * llen_sv; |
121 | 1 | equemene | unsigned int ip2, ip2_=1, ipdist, ipow=1, mask=1, |
122 | 1 | equemene | mydist, mydis_; |
123 | 1 | equemene | int Cmsgid=MSGID_BEGIN_PFACT, Np2, align,
|
124 | 1 | equemene | hdim, i, icurrow, *iflag, ipA, ipW, *ipl, |
125 | 1 | equemene | iprow, jb, k, lda, ldW, myrow, n, nprow, |
126 | 1 | equemene | partner, root, size_, usize; |
127 | 1 | equemene | #define LDU n
|
128 | 1 | equemene | /* ..
|
129 | 1 | equemene | * .. Executable Statements ..
|
130 | 1 | equemene | */
|
131 | 1 | equemene | n = Mmin( NN, PANEL->n ); jb = PANEL->jb; |
132 | 1 | equemene | /*
|
133 | 1 | equemene | * Quick return if there is nothing to do (n <= 0 or jb <= 0); this returns before the timer is started and before PBCST is ever probed
|
134 | 1 | equemene | */
|
135 | 1 | equemene | if( ( n <= 0 ) || ( jb <= 0 ) ) return; |
136 | 1 | equemene | |
137 | 1 | equemene | #ifdef HPL_DETAILED_TIMING
|
138 | 1 | equemene | HPL_ptimer( HPL_TIMING_LASWP ); |
139 | 1 | equemene | #endif
|
140 | 1 | equemene | /*
|
141 | 1 | equemene | * Retrieve parameters from the PANEL data structure
|
142 | 1 | equemene | */
|
143 | 1 | equemene | grid = PANEL->grid; nprow = grid->nprow; myrow = grid->myrow; |
144 | 1 | equemene | comm = grid->col_comm; ip2 = (unsigned int)grid->row_ip2; |
145 | 1 | equemene | hdim = grid->row_hdim; align = PANEL->algo->align; |
146 | 1 | equemene | A = PANEL->A; U = PANEL->U; iflag = PANEL->IWORK; |
147 | 1 | equemene | lda = PANEL->lda; icurrow = PANEL->prow; usize = jb * n; |
148 | 1 | equemene | ldW = n + 1;
|
149 | 1 | equemene | /*
|
150 | 1 | equemene | * Allocate space for temporary W (ldW * jb doubles, plus align extra doubles, presumably slack for the HPL_PTR adjustment of vptr that yields W - confirm against hpl.h). ldW is n + 1 because, as explained further down, the first entry of each column packed in W is its destination offset in U. NOTE(review): HPL_pabort is presumably fatal, since W is used unconditionally afterwards - confirm.
|
151 | 1 | equemene | */
|
152 | 1 | equemene | vptr = (void*)malloc( ( (size_t)(align) +
|
153 | 1 | equemene | ((size_t)(jb) * (size_t)(ldW))) * |
154 | 1 | equemene | sizeof(double) ); |
155 | 1 | equemene | if( vptr == NULL ) |
156 | 1 | equemene | { HPL_pabort( __LINE__, "HPL_pdlaswp00T", "Memory allocation failed" ); } |
157 | 1 | equemene | |
158 | 1 | equemene | W = (double *)HPL_PTR( vptr, ((size_t)(align) * sizeof(double) ) ); |
159 | 1 | equemene | /*
|
160 | 1 | equemene | * Construct ipID and its local counter parts lindxA, lindxAU - llen is
|
161 | 1 | equemene | * the number of rows/columns that I have in workspace and that I should
|
162 | 1 | equemene | * send. Compute lindx_, ipA, llen if it has not already been done for
|
163 | 1 | equemene | * this panel; the arrays are carved out of PANEL->IWORK in this order: iflag (1 int, state flag), ipl (1), ipID (4*jb), lindxA (2*jb), lindxAU (2*jb), llen (nprow), llen_sv. *iflag is -1 when nothing has been computed, 1 when ipID is already valid, and is set to 0 once HPL_plindx0 has been run.
|
164 | 1 | equemene | */
|
165 | 1 | equemene | k = (int)((unsigned int)(jb) << 1); ipl = iflag + 1; ipID = ipl + 1; |
166 | 1 | equemene | lindxA = ipID + ((unsigned int)(k) << 1); lindxAU = lindxA + k; |
167 | 1 | equemene | llen = lindxAU + k; llen_sv = llen + nprow; |
168 | 1 | equemene | |
169 | 1 | equemene | if( *iflag == -1 ) /* no index arrays have been computed so far */ |
170 | 1 | equemene | { |
171 | 1 | equemene | HPL_pipid( PANEL, ipl, ipID ); |
172 | 1 | equemene | HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv ); |
173 | 1 | equemene | *iflag = 0;
|
174 | 1 | equemene | } |
175 | 1 | equemene | else if( *iflag == 1 ) /* HPL_pdlaswp01T called before: reuse ipID */ |
176 | 1 | equemene | { |
177 | 1 | equemene | HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv ); |
178 | 1 | equemene | *iflag = 0;
|
179 | 1 | equemene | } |
180 | 1 | equemene | /*
|
181 | 1 | equemene | * Copy the llen_sv into llen - Reset ipA to its correct value (llen is modified by the exchange below, whereas llen_sv is only read by this routine once HPL_plindx0 has filled it)
|
182 | 1 | equemene | */
|
183 | 1 | equemene | ipA = llen_sv[myrow]; |
184 | 1 | equemene | for( i = 0; i < nprow; i++ ) { llen[i] = llen_sv[i]; } |
185 | 1 | equemene | /*
|
186 | 1 | equemene | * For i in [0..2*jb), lindxA[i] is the offset in A of a row that ulti-
|
187 | 1 | equemene | * mately goes to U( lindxAU[i], : ) or U( :, lindxAU[i] ). In icurrow,
|
188 | 1 | equemene | * we directly pack into U, otherwise we pack into workspace. The first
|
189 | 1 | equemene | * entry of each column packed in workspace is in fact the row or column
|
190 | 1 | equemene | * offset in U where it should go to.
|
191 | 1 | equemene | */
|
192 | 1 | equemene | if( myrow == icurrow )
|
193 | 1 | equemene | { |
194 | 1 | equemene | HPL_dlaswp01T( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); |
195 | 1 | equemene | } |
196 | 1 | equemene | else
|
197 | 1 | equemene | { |
198 | 1 | equemene | HPL_dlaswp02N( ipA, n, A, lda, W, W+1, ldW, lindxA, lindxAU );
|
199 | 1 | equemene | } |
200 | 1 | equemene | /*
|
201 | 1 | equemene | * Probe for column panel - forward it when available
|
202 | 1 | equemene | */
|
203 | 1 | equemene | if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); |
204 | 1 | equemene | /*
|
205 | 1 | equemene | * Algorithm for bi-directional data exchange:
|
206 | 1 | equemene | *
|
207 | 1 | equemene | * As long as I have not talked to a process that already had the data
|
208 | 1 | equemene | * from icurrow, I will be sending the workspace, otherwise I will be
|
209 | 1 | equemene | * sending U. Note that the columns in workspace contain the local index
|
210 | 1 | equemene | * in U they should go to.
|
211 | 1 | equemene | *
|
212 | 1 | equemene | * If I am receiving from a process that has the data from icurrow, I
|
213 | 1 | equemene | * will be receiving in U, copy the data of U that stays into A, and
|
214 | 1 | equemene | * then the columns I have in workspace into U; otherwise I will be re-
|
215 | 1 | equemene | * ceiving in the remaining workspace. If I am one of those processes
|
216 | 1 | equemene | * that already has the data from icurrow, I will be immediately copying
|
217 | 1 | equemene | * the data I have in my workspace into U.
|
218 | 1 | equemene | *
|
219 | 1 | equemene | * When I receive U, some of U should be copied in my piece of A before
|
220 | 1 | equemene | * I can copy the rows I have in my workspace into U. This information
|
221 | 1 | equemene | * is kept in the lists lindx_: the row lindxAU[i] should be copied in
|
222 | 1 | equemene | * the row lindxA[i] of my piece of A, just as in the reversed initial
|
223 | 1 | equemene | * packing operation. Those rows are thus the first ones in the work ar-
|
224 | 1 | equemene | * ray. After this operation has been performed, I will not need
|
225 | 1 | equemene | * those lindx arrays, and I will always be sending a buffer of size
|
226 | 1 | equemene | * jb x n, or n x jb, that is, U.
|
227 | 1 | equemene | *
|
228 | 1 | equemene | * At every step of the algorithm, it is necessary to update the list
|
229 | 1 | equemene | * llen, so that I can figure out how large the next messages I will be
|
230 | 1 | equemene | * sending/receiving are. It is obvious when I am sending U. It is not
|
231 | 1 | equemene | * otherwise.
|
232 | 1 | equemene | *
|
233 | 1 | equemene | * We choose icurrow to be the source of the bi-directional exchange.
|
234 | 1 | equemene | * This allows the processes in the non-power 2 part to receive U at the
|
235 | 1 | equemene | * first exchange, and then broadcast internally this U so that those
|
236 | 1 | equemene | * processes can grab their piece of A.
|
237 | 1 | equemene | */
|
238 | 1 | equemene | if( myrow == icurrow ) { llen[myrow] = 0; ipA = 0; } |
239 | 1 | equemene | ipW = ipA; |
240 | 1 | equemene | Np2 = ( ( size_ = nprow - ip2 ) != 0 );
|
241 | 1 | equemene | mydist = (unsigned int)MModSub( myrow, icurrow, nprow ); |
242 | 1 | equemene | /*
|
243 | 1 | equemene | * bi-directional exchange: If nprow is not a power of 2, proc[i-ip2]
|
244 | 1 | equemene | * receives local data from proc[i] for all i in [ip2..nprow); icurrow
|
245 | 1 | equemene | * is the source, these last process indexes are relative to icurrow. Np2 (set above) is nonzero exactly when size_ = nprow - ip2 is nonzero; mydist is this process row's index relative to icurrow.
|
246 | 1 | equemene | */
|
247 | 1 | equemene | if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) ) |
248 | 1 | equemene | { |
249 | 1 | equemene | partner = MModAdd( icurrow, partner, nprow ); |
250 | 1 | equemene | |
251 | 1 | equemene | if( mydist == 0 ) /* I am the current row: I send U and recv W */ |
252 | 1 | equemene | { |
253 | 1 | equemene | (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW,
|
254 | 1 | equemene | Cmsgid, partner, comm ); |
255 | 1 | equemene | if( llen[partner] > 0 ) |
256 | 1 | equemene | HPL_dlaswp03T( llen[partner], n, U, LDU, W, W+1, ldW );
|
257 | 1 | equemene | } |
258 | 1 | equemene | else if( mydist == ip2 ) |
259 | 1 | equemene | { /* I recv U for later Bcast, I send my W */
|
260 | 1 | equemene | (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
|
261 | 1 | equemene | Cmsgid, partner, comm ); |
262 | 1 | equemene | } |
263 | 1 | equemene | else /* None of us is icurrow, we exchange our Ws */ |
264 | 1 | equemene | { |
265 | 1 | equemene | if( ( mydist & ip2 ) != 0 ) |
266 | 1 | equemene | { |
267 | 1 | equemene | (void) HPL_send( W, llen[myrow]*ldW, partner, Cmsgid, comm );
|
268 | 1 | equemene | } |
269 | 1 | equemene | else
|
270 | 1 | equemene | { |
271 | 1 | equemene | (void) HPL_recv( Mptr( W, 0, ipW, ldW ), llen[partner]*ldW, |
272 | 1 | equemene | partner, Cmsgid, comm ); |
273 | 1 | equemene | if( llen[partner] > 0 ) ipW += llen[partner]; |
274 | 1 | equemene | } |
275 | 1 | equemene | } |
276 | 1 | equemene | } |
277 | 1 | equemene | /*
|
278 | 1 | equemene | * Update llen
|
279 | 1 | equemene | */
|
280 | 1 | equemene | for( i = 1; i < size_; i++ ) |
281 | 1 | equemene | { |
282 | 1 | equemene | iprow = MModAdd( icurrow, i, nprow ); |
283 | 1 | equemene | partner = MModAdd( iprow, (int)(ip2), nprow );
|
284 | 1 | equemene | llen[ iprow ] += llen[ partner ]; |
285 | 1 | equemene | } |
286 | 1 | equemene | /*
|
287 | 1 | equemene | * Probe for column panel - forward it when available
|
288 | 1 | equemene | */
|
289 | 1 | equemene | if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); |
290 | 1 | equemene | /*
|
291 | 1 | equemene | * power of 2 part of the processes collection: only processes [0..ip2)
|
292 | 1 | equemene | * are working; some of them (mydist >> (k+1) == 0) either send or re-
|
293 | 1 | equemene | * ceive U. At every step k, k is in [0 .. hdim), of the algorithm, a
|
294 | 1 | equemene | * process pair that exchanges U is such that (mydist >> (k+1) == 0).
|
295 | 1 | equemene | * Among those processes, the ones that are sending U are such that
|
296 | 1 | equemene | * mydist >> k == 0.
|
297 | 1 | equemene | */
|
298 | 1 | equemene | if( mydist < ip2 )
|
299 | 1 | equemene | { |
300 | 1 | equemene | k = 0;
|
301 | 1 | equemene | |
302 | 1 | equemene | while( k < hdim )
|
303 | 1 | equemene | { |
304 | 1 | equemene | partner = (int)(mydist ^ ipow);
|
305 | 1 | equemene | partner = MModAdd( icurrow, partner, nprow ); |
306 | 1 | equemene | /*
|
307 | 1 | equemene | * Exchange and combine the local results - If I receive U, then I must
|
308 | 1 | equemene | * copy from U the rows that belong to my piece of A, and then update U
|
309 | 1 | equemene | * by copying in it the rows I have accumulated in W. Otherwise, I re-
|
310 | 1 | equemene | * ceive W. In this latter case, if I have U, I shall update my copy of
|
311 | 1 | equemene | * U by copying in it the rows I have accumulated in W. If I did not
|
312 | 1 | equemene | * have U before, I simply need to update my pointer in W for later use.
|
313 | 1 | equemene | */
|
314 | 1 | equemene | if( ( mydist >> (unsigned int)( k + 1 ) ) == 0 ) |
315 | 1 | equemene | { |
316 | 1 | equemene | if( ( mydist >> (unsigned int)(k) ) == 0 ) |
317 | 1 | equemene | { |
318 | 1 | equemene | (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW, |
319 | 1 | equemene | ldW ), llen[partner]*ldW, Cmsgid, |
320 | 1 | equemene | partner, comm ); |
321 | 1 | equemene | HPL_dlaswp03T( llen[partner], n, U, LDU, Mptr( W, 0, ipW,
|
322 | 1 | equemene | ldW ), Mptr( W, 1, ipW, ldW ), ldW );
|
323 | 1 | equemene | ipW += llen[partner]; |
324 | 1 | equemene | } |
325 | 1 | equemene | else
|
326 | 1 | equemene | { |
327 | 1 | equemene | (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
|
328 | 1 | equemene | Cmsgid, partner, comm ); |
329 | 1 | equemene | HPL_dlaswp04T( ipA, llen[myrow], n, U, LDU, A, lda, W, |
330 | 1 | equemene | W+1, ldW, lindxA, lindxAU );
|
331 | 1 | equemene | } |
332 | 1 | equemene | } |
333 | 1 | equemene | else
|
334 | 1 | equemene | { |
335 | 1 | equemene | (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, Mptr( W, 0, |
336 | 1 | equemene | ipW, ldW ), llen[partner]*ldW, Cmsgid, |
337 | 1 | equemene | partner, comm ); |
338 | 1 | equemene | ipW += llen[partner]; |
339 | 1 | equemene | } |
340 | 1 | equemene | /*
|
341 | 1 | equemene | * Update llen - Go to next process pairs
|
342 | 1 | equemene | */
|
343 | 1 | equemene | iprow = icurrow; ipdist = 0;
|
344 | 1 | equemene | do
|
345 | 1 | equemene | { |
346 | 1 | equemene | if( (unsigned int)( partner = (int)(ipdist ^ ipow) ) > ipdist ) |
347 | 1 | equemene | { |
348 | 1 | equemene | partner = MModAdd( icurrow, partner, nprow ); |
349 | 1 | equemene | llen[iprow] += llen[partner]; |
350 | 1 | equemene | llen[partner] = llen[iprow]; |
351 | 1 | equemene | } |
352 | 1 | equemene | iprow = MModAdd( iprow, 1, nprow ); ipdist++;
|
353 | 1 | equemene | |
354 | 1 | equemene | } while( ipdist < ip2 );
|
355 | 1 | equemene | |
356 | 1 | equemene | ipow <<= 1; k++;
|
357 | 1 | equemene | /*
|
358 | 1 | equemene | * Probe for column panel - forward it when available
|
359 | 1 | equemene | */
|
360 | 1 | equemene | if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); |
361 | 1 | equemene | } |
362 | 1 | equemene | } |
363 | 1 | equemene | else
|
364 | 1 | equemene | { |
365 | 1 | equemene | /*
|
366 | 1 | equemene | * non power of 2 part of the process collection: proc[ip2] broadcasts U
|
367 | 1 | equemene | * to procs[ip2..nprow) (relatively to icurrow).
|
368 | 1 | equemene | */
|
369 | 1 | equemene | if( size_ > 1 ) |
370 | 1 | equemene | { |
371 | 1 | equemene | k = size_ - 1;
|
372 | 1 | equemene | while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; } |
373 | 1 | equemene | root = MModAdd( icurrow, (int)(ip2), nprow );
|
374 | 1 | equemene | mydis_ = (unsigned int)MModSub( myrow, root, nprow ); |
375 | 1 | equemene | |
376 | 1 | equemene | do
|
377 | 1 | equemene | { |
378 | 1 | equemene | mask ^= ip2_; |
379 | 1 | equemene | if( ( mydis_ & mask ) == 0 ) |
380 | 1 | equemene | { |
381 | 1 | equemene | partner = (int)(mydis_ ^ ip2_);
|
382 | 1 | equemene | if( ( mydis_ & ip2_ ) != 0 ) |
383 | 1 | equemene | { |
384 | 1 | equemene | (void) HPL_recv( U, usize, MModAdd( root, partner,
|
385 | 1 | equemene | nprow ), Cmsgid, comm ); |
386 | 1 | equemene | |
387 | 1 | equemene | } |
388 | 1 | equemene | else if( partner < size_ ) |
389 | 1 | equemene | { |
390 | 1 | equemene | (void) HPL_send( U, usize, MModAdd( root, partner,
|
391 | 1 | equemene | nprow ), Cmsgid, comm ); |
392 | 1 | equemene | } |
393 | 1 | equemene | } |
394 | 1 | equemene | ip2_ >>= 1;
|
395 | 1 | equemene | /*
|
396 | 1 | equemene | * Probe for column panel - forward it when available
|
397 | 1 | equemene | */
|
398 | 1 | equemene | if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); |
399 | 1 | equemene | |
400 | 1 | equemene | } while( ip2_ > 0 ); |
401 | 1 | equemene | } |
402 | 1 | equemene | /*
|
403 | 1 | equemene | * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece
|
404 | 1 | equemene | * of A.
|
405 | 1 | equemene | */
|
406 | 1 | equemene | HPL_dlaswp05T( ipA, n, A, lda, U, LDU, lindxA, lindxAU ); |
407 | 1 | equemene | } |
408 | 1 | equemene | /*
|
409 | 1 | equemene | * If nprow is not a power of 2, proc[i-ip2] sends global result to
|
410 | 1 | equemene | * proc[i] for all i in [ip2..nprow);
|
411 | 1 | equemene | */
|
412 | 1 | equemene | if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) ) |
413 | 1 | equemene | { |
414 | 1 | equemene | partner = MModAdd( icurrow, partner, nprow ); |
415 | 1 | equemene | if( ( mydist & ip2 ) != 0 ) |
416 | 1 | equemene | { (void) HPL_recv( U, usize, partner, Cmsgid, comm ); }
|
417 | 1 | equemene | else
|
418 | 1 | equemene | { (void) HPL_send( U, usize, partner, Cmsgid, comm ); }
|
419 | 1 | equemene | } |
420 | 1 | equemene | |
421 | 1 | equemene | if( vptr ) free( vptr );
|
422 | 1 | equemene | /*
|
423 | 1 | equemene | * Probe for column panel - forward it when available
|
424 | 1 | equemene | */
|
425 | 1 | equemene | if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG ); |
426 | 1 | equemene | |
427 | 1 | equemene | #ifdef HPL_DETAILED_TIMING
|
428 | 1 | equemene | HPL_ptimer( HPL_TIMING_LASWP ); |
429 | 1 | equemene | #endif
|
430 | 1 | equemene | /*
|
431 | 1 | equemene | * End of HPL_pdlaswp00T
|
432 | 1 | equemene | */
|
433 | 1 | equemene | } |