Statistiques
| Révision :

root / src / pfact / HPL_pdmxswp.c @ 9

Historique | Voir | Annoter | Télécharger (12,31 ko)

1 1 equemene
/*
2 1 equemene
 * -- High Performance Computing Linpack Benchmark (HPL)
3 1 equemene
 *    HPL - 2.0 - September 10, 2008
4 1 equemene
 *    Antoine P. Petitet
5 1 equemene
 *    University of Tennessee, Knoxville
6 1 equemene
 *    Innovative Computing Laboratory
7 1 equemene
 *    (C) Copyright 2000-2008 All Rights Reserved
8 1 equemene
 *
9 1 equemene
 * -- Copyright notice and Licensing terms:
10 1 equemene
 *
11 1 equemene
 * Redistribution  and  use in  source and binary forms, with or without
12 1 equemene
 * modification, are  permitted provided  that the following  conditions
13 1 equemene
 * are met:
14 1 equemene
 *
15 1 equemene
 * 1. Redistributions  of  source  code  must retain the above copyright
16 1 equemene
 * notice, this list of conditions and the following disclaimer.
17 1 equemene
 *
18 1 equemene
 * 2. Redistributions in binary form must reproduce  the above copyright
19 1 equemene
 * notice, this list of conditions,  and the following disclaimer in the
20 1 equemene
 * documentation and/or other materials provided with the distribution.
21 1 equemene
 *
22 1 equemene
 * 3. All  advertising  materials  mentioning  features  or  use of this
23 1 equemene
 * software must display the following acknowledgement:
24 1 equemene
 * This  product  includes  software  developed  at  the  University  of
25 1 equemene
 * Tennessee, Knoxville, Innovative Computing Laboratory.
26 1 equemene
 *
27 1 equemene
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28 1 equemene
 * names  of  its  contributors  may  not  be used to endorse or promote
29 1 equemene
 * products  derived   from   this  software  without  specific  written
30 1 equemene
 * permission.
31 1 equemene
 *
32 1 equemene
 * -- Disclaimer:
33 1 equemene
 *
34 1 equemene
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35 1 equemene
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36 1 equemene
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37 1 equemene
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38 1 equemene
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39 1 equemene
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40 1 equemene
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41 1 equemene
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42 1 equemene
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43 1 equemene
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44 1 equemene
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45 1 equemene
 * ---------------------------------------------------------------------
46 1 equemene
 */
47 1 equemene
/*
48 1 equemene
 * Include files
49 1 equemene
 */
50 1 equemene
#include "hpl.h"
51 1 equemene
52 1 equemene
#ifdef STDC_HEADERS
53 1 equemene
void HPL_pdmxswp
54 1 equemene
(
55 1 equemene
   HPL_T_panel *                    PANEL,
56 1 equemene
   const int                        M,
57 1 equemene
   const int                        II,
58 1 equemene
   const int                        JJ,
59 1 equemene
   double *                         WORK
60 1 equemene
)
61 1 equemene
#else
62 1 equemene
void HPL_pdmxswp
63 1 equemene
( PANEL, M, II, JJ, WORK )
64 1 equemene
   HPL_T_panel *                    PANEL;
65 1 equemene
   const int                        M;
66 1 equemene
   const int                        II;
67 1 equemene
   const int                        JJ;
68 1 equemene
   double *                         WORK;
69 1 equemene
#endif
70 1 equemene
{
71 1 equemene
/*
72 1 equemene
 * Purpose
73 1 equemene
 * =======
74 1 equemene
 *
75 1 equemene
 * HPL_pdmxswp swaps  and  broadcasts  the  absolute value max row using
76 1 equemene
 * bi-directional exchange.  The buffer is partially set by HPL_dlocmax.
77 1 equemene
 *
78 1 equemene
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
79 1 equemene
 * operations  at once  for one column in the panel.  This  results in a
80 1 equemene
 * lower number of slightly larger  messages than usual.  On P processes
81 1 equemene
 * and assuming bi-directional links,  the running time of this function
82 1 equemene
 * can be approximated by
83 1 equemene
 *
84 1 equemene
 *    log_2( P ) * ( lat + ( 2 * N0 + 4 ) / bdwth )
85 1 equemene
 *
86 1 equemene
 * where  lat and bdwth are the latency and bandwidth of the network for
87 1 equemene
 * double precision real elements.  Communication  only  occurs  in  one
88 1 equemene
 * process  column. Mono-directional links  will cause the communication
89 1 equemene
 * cost to double.
90 1 equemene
 *
91 1 equemene
 * Arguments
92 1 equemene
 * =========
93 1 equemene
 *
94 1 equemene
 * PANEL   (local input/output)          HPL_T_panel *
95 1 equemene
 *         On entry,  PANEL  points to the data structure containing the
96 1 equemene
 *         panel information.
97 1 equemene
 *
98 1 equemene
 * M       (local input)                 const int
99 1 equemene
 *         On entry,  M specifies the local number of rows of the matrix
100 1 equemene
 *         column on which this function operates.
101 1 equemene
 *
102 1 equemene
 * II      (local input)                 const int
103 1 equemene
 *         On entry, II  specifies the row offset where the column to be
104 1 equemene
 *         operated on starts with respect to the panel.
105 1 equemene
 *
106 1 equemene
 * JJ      (local input)                 const int
107 1 equemene
 *         On entry, JJ  specifies the column offset where the column to
108 1 equemene
 *         be operated on starts with respect to the panel.
109 1 equemene
 *
110 1 equemene
 * WORK    (local workspace)             double *
111 1 equemene
 *         On entry, WORK  is a workarray of size at least 2 * (4+2*N0).
112 1 equemene
 *         It  is assumed that  HPL_dlocmax  was called  prior  to  this
113 1 equemene
 *         routine to  initialize  the first four entries of this array.
114 1 equemene
 *         On exit, the  N0  length max row is stored in WORK[4:4+N0-1];
115 1 equemene
 *         Note that this is also the  JJth  row  (or column) of L1. The
116 1 equemene
 *         remaining part is used as a temporary array.
117 1 equemene
 *
118 1 equemene
 * ---------------------------------------------------------------------
119 1 equemene
 */
120 1 equemene
/*
121 1 equemene
 * .. Local Variables ..
122 1 equemene
 */
123 1 equemene
   double                     gmax, tmp1;
124 1 equemene
   double                     * A0, * Wmx, * Wwork;
125 1 equemene
   HPL_T_grid                 * grid;
126 1 equemene
   MPI_Comm                   comm;
127 1 equemene
   unsigned int               hdim, ip2, ip2_, ipow, k, mask;
128 1 equemene
   int                        Np2, cnt_, cnt0, i, icurrow, lda, mydist,
129 1 equemene
                              mydis_, myrow, n0, nprow, partner, rcnt,
130 1 equemene
                              root, scnt, size_;
131 1 equemene
/* ..
132 1 equemene
 * .. Executable Statements ..
133 1 equemene
 */
134 1 equemene
#ifdef HPL_DETAILED_TIMING
135 1 equemene
   HPL_ptimer( HPL_TIMING_MXSWP );
136 1 equemene
#endif
137 1 equemene
   grid = PANEL->grid; myrow = grid->myrow; nprow = grid->nprow;
138 1 equemene
/*
139 1 equemene
 * ip2  : the smallest power of two less than or equal to nprow;
140 1 equemene
 * hdim : dimension of the hypercube made of those ip2 processes;
141 1 equemene
 * Np2  : logical flag indicating whether or not nprow is a power of 2;
142 1 equemene
 */
143 1 equemene
   comm    = grid->col_comm; ip2 = (unsigned int)(grid->row_ip2);
144 1 equemene
   hdim    = (unsigned int)(grid->row_hdim);     n0  = PANEL->jb;
145 1 equemene
   icurrow = PANEL->prow; Np2 = (int)( ( size_ = nprow - ip2 ) != 0 );
146 1 equemene
   mydist  = MModSub( myrow, icurrow, nprow );
147 1 equemene
/*
148 1 equemene
 * Set up pointers in workspace:  WORK and Wwork  point to the beginning
149 1 equemene
 * of the buffers of size 4 + 2*N0 to be combined. Wmx points to the row
150 1 equemene
 * owning the local (before combine) and global (after combine) absolute
151 1 equemene
 * value max. A0 points to the copy of the current row of the matrix.
152 1 equemene
 */
153 1 equemene
   cnt0  = ( cnt_ = n0 + 4 ) + n0; A0 = ( Wmx = WORK + 4 ) + n0;
154 1 equemene
   Wwork = WORK + cnt0;
155 1 equemene
/*
156 1 equemene
 * Wmx[0:N0-1] := A[ilindx,0:N0-1] where ilindx is  (int)(WORK[1])  (row
157 1 equemene
 * with max in current column). If I am the current process row, pack in
158 1 equemene
 * addition the current row of A in A0[0:N0-1].  If I do not own any row
159 1 equemene
 * of A, then zero out Wmx[0:N0-1].
160 1 equemene
 */
161 1 equemene
   if( M > 0 )
162 1 equemene
   {
163 1 equemene
      lda = PANEL->lda;
164 1 equemene
      HPL_dcopy( n0, Mptr( PANEL->A, II+(int)(WORK[1]), 0, lda ), lda,
165 1 equemene
                 Wmx, 1 );
166 1 equemene
      if( myrow == icurrow )
167 1 equemene
      { HPL_dcopy( n0, Mptr( PANEL->A, II, 0, lda ), lda, A0, 1 ); }
168 1 equemene
   }
169 1 equemene
   else { for( i = 0; i < n0; i++ ) Wmx[i] = HPL_rzero; }
170 1 equemene
/*
171 1 equemene
 * Combine the results (bi-directional exchange):  the process coordina-
172 1 equemene
 * tes are relative to icurrow,  this allows to reduce the communication
173 1 equemene
 * volume when nprow is not a power of 2.
174 1 equemene
 *
175 1 equemene
 * When nprow is not a power of 2:  proc[i-ip2] receives local data from
176 1 equemene
 * proc[i]  for all i in [ip2..nprow).  In addition,  proc[0]  (icurrow)
177 1 equemene
 * sends to proc[ip2] the current row of A  for later broadcast in procs
178 1 equemene
 * [ip2..nprow).
179 1 equemene
 */
180 1 equemene
   if( ( Np2 != 0 ) &&
181 1 equemene
       ( ( partner = (int)((unsigned int)(mydist) ^ ip2 ) ) < nprow ) )
182 1 equemene
   {
183 1 equemene
      if( ( mydist & ip2 ) != 0 )
184 1 equemene
      {
185 1 equemene
         if( mydist == (int)(ip2) )
186 1 equemene
            (void) HPL_sdrv( WORK, cnt_, MSGID_BEGIN_PFACT, A0, n0,
187 1 equemene
                             MSGID_BEGIN_PFACT, MModAdd( partner,
188 1 equemene
                             icurrow, nprow ), comm );
189 1 equemene
         else
190 1 equemene
            (void) HPL_send( WORK, cnt_, MModAdd( partner, icurrow,
191 1 equemene
                             nprow ), MSGID_BEGIN_PFACT, comm );
192 1 equemene
      }
193 1 equemene
      else
194 1 equemene
      {
195 1 equemene
         if( mydist == 0 )
196 1 equemene
            (void) HPL_sdrv( A0, n0, MSGID_BEGIN_PFACT, Wwork, cnt_,
197 1 equemene
                             MSGID_BEGIN_PFACT, MModAdd( partner,
198 1 equemene
                             icurrow, nprow ), comm );
199 1 equemene
         else
200 1 equemene
            (void) HPL_recv( Wwork, cnt_, MModAdd( partner, icurrow,
201 1 equemene
                             nprow ), MSGID_BEGIN_PFACT, comm );
202 1 equemene
203 1 equemene
         tmp1 = Mabs( Wwork[0] ); gmax = Mabs( WORK[0] );
204 1 equemene
         if( ( tmp1 > gmax ) ||
205 1 equemene
             ( ( tmp1 == gmax ) && ( Wwork[3] < WORK[3] ) ) )
206 1 equemene
         { HPL_dcopy( cnt_, Wwork, 1, WORK, 1 ); }
207 1 equemene
      }
208 1 equemene
   }
209 1 equemene
210 1 equemene
   if( mydist < (int)(ip2) )
211 1 equemene
   {
212 1 equemene
/*
213 1 equemene
 * power of 2 part of the processes collection: processes  [0..ip2)  are
214 1 equemene
 * combining (binary exchange); proc[0] has two rows to send, but one to
215 1 equemene
 * receive.  At every step  k  in [0..hdim) of the algorithm,  a process
216 1 equemene
 * pair exchanging 2 rows is such that  myrow >> k+1 is 0.  Among  those
217 1 equemene
 * processes the ones  that are sending one more row than  what they are
218 1 equemene
 * receiving are such that myrow >> k is equal to 0.
219 1 equemene
 */
220 1 equemene
      k = 0; ipow = 1;
221 1 equemene
222 1 equemene
      while( k < hdim )
223 1 equemene
      {
224 1 equemene
         if( ( (unsigned int)(mydist) >> ( k + 1 ) ) == 0 )
225 1 equemene
         {
226 1 equemene
            if( ( (unsigned int)(mydist) >> k ) == 0 )
227 1 equemene
            { scnt = cnt0; rcnt = cnt_; }
228 1 equemene
            else
229 1 equemene
            { scnt = cnt_; rcnt = cnt0; }
230 1 equemene
         }
231 1 equemene
         else { scnt = rcnt = cnt_; }
232 1 equemene
233 1 equemene
         partner = (int)( (unsigned int)(mydist) ^ ipow );
234 1 equemene
         (void) HPL_sdrv( WORK, scnt, MSGID_BEGIN_PFACT, Wwork, rcnt,
235 1 equemene
                          MSGID_BEGIN_PFACT, MModAdd( partner, icurrow,
236 1 equemene
                          nprow ), comm );
237 1 equemene
238 1 equemene
         tmp1 = Mabs( Wwork[0] ); gmax = Mabs( WORK[0] );
239 1 equemene
         if( ( tmp1 > gmax ) ||
240 1 equemene
             ( ( tmp1 == gmax ) && ( Wwork[3] < WORK[3] ) ) )
241 1 equemene
         {
242 1 equemene
            HPL_dcopy( ( rcnt == cnt0 ? cnt0 : cnt_ ), Wwork, 1,
243 1 equemene
                       WORK, 1 );
244 1 equemene
         }
245 1 equemene
         else if( rcnt == cnt0 )
246 1 equemene
         { HPL_dcopy( n0, Wwork+cnt_, 1, A0, 1 ); }
247 1 equemene
248 1 equemene
         ipow <<= 1; k++;
249 1 equemene
      }
250 1 equemene
   }
251 1 equemene
   else if( size_ > 1 )
252 1 equemene
   {
253 1 equemene
/*
254 1 equemene
 * proc[ip2] broadcast current row of A to procs [ip2+1..nprow).
255 1 equemene
 */
256 1 equemene
      k = (unsigned int)(size_) - 1; ip2_ = mask = 1;
257 1 equemene
      while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; }
258 1 equemene
259 1 equemene
      root   = MModAdd( icurrow, (int)(ip2), nprow );
260 1 equemene
      mydis_ = MModSub( myrow,   root,       nprow );
261 1 equemene
262 1 equemene
      do
263 1 equemene
      {
264 1 equemene
         mask ^= ip2_;
265 1 equemene
         if( ( mydis_ & mask ) == 0 )
266 1 equemene
         {
267 1 equemene
            partner = (int)(mydis_ ^ ip2_);
268 1 equemene
            if( ( mydis_ & ip2_ ) != 0 )
269 1 equemene
            {
270 1 equemene
               (void) HPL_recv( A0, n0, MModAdd( root, partner,
271 1 equemene
                                nprow ), MSGID_BEGIN_PFACT, comm );
272 1 equemene
            }
273 1 equemene
            else if( partner < size_ )
274 1 equemene
            {
275 1 equemene
               (void) HPL_send( A0, n0, MModAdd( root, partner,
276 1 equemene
                                nprow ), MSGID_BEGIN_PFACT, comm );
277 1 equemene
            }
278 1 equemene
         }
279 1 equemene
         ip2_ >>= 1;
280 1 equemene
      } while( ip2_ > 0 );
281 1 equemene
   }
282 1 equemene
/*
283 1 equemene
 * If nprow is not a power of 2,  for all i in [ip2..nprow), proc[i-ip2]
284 1 equemene
 * sends the pivot row to proc[i]  along  with the first four entries of
285 1 equemene
 * the WORK array.
286 1 equemene
 */
287 1 equemene
   if( ( Np2 != 0 ) &&
288 1 equemene
       ( ( partner = (int)((unsigned int)(mydist) ^ ip2 ) ) < nprow ) )
289 1 equemene
   {
290 1 equemene
      if( ( mydist & ip2 ) != 0 )
291 1 equemene
      {
292 1 equemene
         (void) HPL_recv( WORK, cnt_, MModAdd( partner, icurrow,
293 1 equemene
                          nprow ), MSGID_BEGIN_PFACT, comm );
294 1 equemene
      }
295 1 equemene
      else
296 1 equemene
      {
297 1 equemene
         (void) HPL_send( WORK, cnt_, MModAdd( partner, icurrow,
298 1 equemene
                          nprow ), MSGID_BEGIN_PFACT, comm );
299 1 equemene
      }
300 1 equemene
   }
301 1 equemene
/*
302 1 equemene
 * Save the global pivot index in pivot array
303 1 equemene
 */
304 1 equemene
   (PANEL->DPIV)[JJ] = WORK[2];
305 1 equemene
#ifdef HPL_DETAILED_TIMING
306 1 equemene
   HPL_ptimer( HPL_TIMING_MXSWP );
307 1 equemene
#endif
308 1 equemene
/*
309 1 equemene
 * End of HPL_pdmxswp
310 1 equemene
 */
311 1 equemene
}