Statistiques
| Révision :

root / src / pgesv / HPL_pdlaswp00T.c @ 1

Historique | Voir | Annoter | Télécharger (16,8 ko)

1
/* 
2
 * -- High Performance Computing Linpack Benchmark (HPL)                
3
 *    HPL - 2.0 - September 10, 2008                          
4
 *    Antoine P. Petitet                                                
5
 *    University of Tennessee, Knoxville                                
6
 *    Innovative Computing Laboratory                                 
7
 *    (C) Copyright 2000-2008 All Rights Reserved                       
8
 *                                                                      
9
 * -- Copyright notice and Licensing terms:                             
10
 *                                                                      
11
 * Redistribution  and  use in  source and binary forms, with or without
12
 * modification, are  permitted provided  that the following  conditions
13
 * are met:                                                             
14
 *                                                                      
15
 * 1. Redistributions  of  source  code  must retain the above copyright
16
 * notice, this list of conditions and the following disclaimer.        
17
 *                                                                      
18
 * 2. Redistributions in binary form must reproduce  the above copyright
19
 * notice, this list of conditions,  and the following disclaimer in the
20
 * documentation and/or other materials provided with the distribution. 
21
 *                                                                      
22
 * 3. All  advertising  materials  mentioning  features  or  use of this
23
 * software must display the following acknowledgement:                 
24
 * This  product  includes  software  developed  at  the  University  of
25
 * Tennessee, Knoxville, Innovative Computing Laboratory.             
26
 *                                                                      
27
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28
 * names  of  its  contributors  may  not  be used to endorse or promote
29
 * products  derived   from   this  software  without  specific  written
30
 * permission.                                                          
31
 *                                                                      
32
 * -- Disclaimer:                                                       
33
 *                                                                      
34
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
45
 * ---------------------------------------------------------------------
46
 */ 
47
/*
48
 * Include files
49
 */
50
#include "hpl.h"
51

    
52
#ifdef STDC_HEADERS
53
void HPL_pdlaswp00T
54
(
55
   HPL_T_panel *                    PBCST,
56
   int *                            IFLAG,
57
   HPL_T_panel *                    PANEL,
58
   const int                        NN
59
)
60
#else
61
void HPL_pdlaswp00T
62
( PBCST, IFLAG, PANEL, NN )
63
   HPL_T_panel *                    PBCST;
64
   int *                            IFLAG;
65
   HPL_T_panel *                    PANEL;
66
   const int                        NN;
67
#endif
68
{
69
/* 
70
 * Purpose
71
 * =======
72
 *
73
 * HPL_pdlaswp00T applies the  NB  row interchanges to  NN columns of the
74
 * trailing submatrix and broadcast a column panel.
75
 *  
76
 * Bi-directional  exchange  is used to perform the  swap :: broadcast of
77
 * the row  panel U at once, resulting in a lower number of messages than
78
 * usual as well as a lower communication volume. With P process rows and
79
 * assuming  bi-directional links,  the running time of this function can
80
 * be approximated by:
81
 *  
82
 *    log_2(P) * (lat + NB*LocQ(N) / bdwth)
83
 *  
84
 * where  NB  is the number of rows of the row panel U,  N is the global
85
 * number of columns being updated,  lat and bdwth  are the latency  and
86
 * bandwidth  of  the  network  for  double  precision real words.  Mono
87
 * directional links will double this communication cost.
88
 *
89
 * Arguments
90
 * =========
91
 *
92
 * PBCST   (local input/output)          HPL_T_panel *
93
 *         On entry,  PBCST  points to the data structure containing the
94
 *         panel (to be broadcast) information.
95
 *
96
 * IFLAG   (local input/output)          int *
97
 *         On entry, IFLAG  indicates  whether or not  the broadcast has
98
 *         already been completed.  If not,  probing will occur, and the
99
 *         outcome will be contained in IFLAG on exit.
100
 *
101
 * PANEL   (local input/output)          HPL_T_panel *
102
 *         On entry,  PANEL  points to the data structure containing the
103
 *         panel (to be broadcast and swapped) information.
104
 *
105
 * NN      (local input)                 const int
106
 *         On entry, NN specifies  the  local  number  of columns of the
107
 *         trailing  submatrix  to  be swapped and broadcast starting at
108
 *         the current position. NN must be at least zero.
109
 *
110
 * ---------------------------------------------------------------------
111
 */ 
112
/*
113
 * .. Local Variables ..
114
 */
115
   MPI_Comm                  comm;
116
   HPL_T_grid                * grid;
117
   double                    * A, * U, * W;
118
   void                       * vptr = NULL;
119
   int                       * ipID, * lindxA, * lindxAU, * llen,
120
                             * llen_sv;
121
   unsigned int              ip2, ip2_=1, ipdist, ipow=1, mask=1,
122
                             mydist, mydis_;
123
   int                       Cmsgid=MSGID_BEGIN_PFACT, Np2, align,
124
                             hdim, i, icurrow, *iflag, ipA, ipW, *ipl,
125
                             iprow, jb, k, lda, ldW, myrow, n, nprow,
126
                             partner, root, size_, usize;
127
#define LDU                  n
128
/* ..
129
 * .. Executable Statements ..
130
 */
131
   n = Mmin( NN, PANEL->n ); jb = PANEL->jb;
132
/*
133
 * Quick return if there is nothing to do
134
 */
135
   if( ( n <= 0 ) || ( jb <= 0 ) ) return;
136

    
137
#ifdef HPL_DETAILED_TIMING
138
   HPL_ptimer( HPL_TIMING_LASWP );
139
#endif
140
/*
141
 * Retrieve parameters from the PANEL data structure
142
 */
143
   grid  = PANEL->grid;    nprow   = grid->nprow; myrow = grid->myrow;
144
   comm  = grid->col_comm; ip2     = (unsigned int)grid->row_ip2;
145
   hdim  = grid->row_hdim; align   = PANEL->algo->align;
146
   A     = PANEL->A;       U       = PANEL->U;    iflag = PANEL->IWORK;
147
   lda   = PANEL->lda;     icurrow = PANEL->prow; usize = jb * n;
148
   ldW   = n + 1;
149
/*
150
 * Allocate space for temporary W (ldW * jb)
151
 */
152
   vptr = (void*)malloc( ( (size_t)(align) + 
153
                           ((size_t)(jb) * (size_t)(ldW))) * 
154
                           sizeof(double) );
155
   if( vptr == NULL )
156
   { HPL_pabort( __LINE__, "HPL_pdlaswp00T", "Memory allocation failed" ); }
157

    
158
   W = (double *)HPL_PTR( vptr, ((size_t)(align) * sizeof(double) ) );
159
/*
160
 * Construct ipID and its local counter parts lindxA, lindxAU -  llen is
161
 * the number of rows/columns that I have in workspace and that I should
162
 * send.  Compute  lindx_, ipA, llen if it has not already been done for
163
 * this panel;
164
 */
165
   k = (int)((unsigned int)(jb) << 1); ipl = iflag + 1; ipID = ipl + 1;
166
   lindxA  = ipID + ((unsigned int)(k) << 1); lindxAU = lindxA + k;
167
   llen    = lindxAU + k; llen_sv = llen + nprow;
168

    
169
   if( *iflag == -1 )    /* no index arrays have been computed so far */
170
   {
171
      HPL_pipid(   PANEL,  ipl, ipID );
172
      HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv );
173
      *iflag = 0;
174
   }
175
   else if( *iflag == 1 ) /* HPL_pdlaswp01T called before: reuse ipID */
176
   {
177
      HPL_plindx0( PANEL, *ipl, ipID, lindxA, lindxAU, llen_sv );
178
      *iflag = 0;
179
   }
180
/*
181
 * Copy the llen_sv into llen - Reset ipA to its correct value
182
 */
183
   ipA = llen_sv[myrow];
184
   for( i = 0; i < nprow; i++ ) { llen[i]  = llen_sv[i]; }
185
/*
186
 * For i in [0..2*jb),  lindxA[i] is the offset in A of a row that ulti-
187
 * mately goes to U( lindxAU[i], : ) or U( :, lindxAU[i] ).  In icurrow,
188
 * we directly pack into U, otherwise we pack into workspace. The  first
189
 * entry of each column packed in workspace is in fact the row or column
190
 * offset in U where it should go to.
191
 */
192
   if( myrow == icurrow ) 
193
   {
194
      HPL_dlaswp01T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
195
   }
196
   else
197
   {
198
      HPL_dlaswp02N( ipA, n, A, lda, W, W+1, ldW, lindxA, lindxAU );
199
   }
200
/*
201
 * Probe for column panel - forward it when available 
202
 */
203
   if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
204
/*
205
 * Algorithm for bi-directional data exchange:
206
 *
207
 * As long as I have not talked to a process that  already  had the data
208
 * from icurrow,  I will be sending the workspace,  otherwise  I will be
209
 * sending U. Note that the columns in workspace contain the local index
210
 * in U they should go to.
211
 *
212
 * If I am receiving from a process that  has the data from  icurrow,  I
213
 * will be receiving in  U, copy the data of  U  that stays into  A, and
214
 * then the columns I have in workspace into U; otherwise  I will be re-
215
 * ceiving in the remaining workspace.  If I am one  of  those processes 
216
 * that already has the data from icurrow, I will be immediately copying
217
 * the data I have in my workspace into U.
218
 *
219
 * When I receive U, some of U should be copied in my piece of A  before
220
 * I can copy the rows I have in my workspace into  U.  This information
221
 * is kept in the lists  lindx_:  the row lindxAU[i] should be copied in
222
 * the row  lindxA[i] of my piece of  A, just as in the reversed initial
223
 * packing operation. Those rows are thus the first ones in the work ar-
224
 * ray.  After  this  operation  has  been  performed,  I will not  need
225
 * those lindx arrays,  and  I  will  always be sending a buffer of size
226
 * jb x n, or n x jb, that is, U.
227
 *
228
 * At  every  step  of  the algorithm, it is necesary to update the list 
229
 * llen,  so that I can figure out how large the next messages I will be
230
 * sending/receiving are.  It is  obvious when I am sending U. It is not
231
 * otherwise.
232
 *
233
 * We  choose  icurrow  to be the source of the bi-directional exchange.
234
 * This allows the processes in the non-power 2 part to receive U at the
235
 * first exchange,  and  then  broadcast internally this U so that those 
236
 * processes can grab their piece of A.
237
 */
238
   if( myrow == icurrow ) { llen[myrow] = 0; ipA = 0; }
239
   ipW    = ipA;
240
   Np2    = ( ( size_ = nprow - ip2 ) != 0 );
241
   mydist = (unsigned int)MModSub( myrow, icurrow, nprow );
242
/*
243
 * bi-directional exchange:   If nprow is not a power of 2,  proc[i-ip2]
244
 * receives local data from proc[i] for all i in  [ip2..nprow);  icurrow
245
 * is the source, these last process indexes are relative to icurrow.
246
 */
247
   if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) )
248
   {
249
      partner = MModAdd( icurrow, partner, nprow );
250

    
251
      if( mydist == 0 )  /* I am the current row: I send U and recv W */
252
      {
253
         (void) HPL_sdrv( U, usize, Cmsgid, W, llen[partner] * ldW,
254
                          Cmsgid, partner, comm );
255
         if( llen[partner] > 0 )
256
            HPL_dlaswp03T( llen[partner], n, U, LDU, W, W+1, ldW );
257
      }
258
      else if( mydist == ip2 )
259
      {                      /* I recv U for later Bcast, I send my W */
260
         (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
261
                          Cmsgid, partner, comm );
262
      }
263
      else               /* None of us is icurrow, we exchange our Ws */
264
      {
265
         if( ( mydist & ip2 ) != 0 ) 
266
         {
267
            (void) HPL_send( W, llen[myrow]*ldW, partner, Cmsgid, comm );
268
         }
269
         else
270
         {
271
            (void) HPL_recv( Mptr( W, 0, ipW, ldW ), llen[partner]*ldW,
272
                             partner, Cmsgid, comm );
273
            if( llen[partner] > 0 ) ipW += llen[partner];
274
         }
275
      }
276
   }
277
/*
278
 * Update llen
279
 */
280
   for( i = 1; i < size_; i++ )
281
   {
282
      iprow   = MModAdd( icurrow, i,          nprow );
283
      partner = MModAdd( iprow,   (int)(ip2), nprow );
284
      llen[ iprow ] += llen[ partner ];
285
   }
286
/*
287
 * Probe for column panel - forward it when available 
288
 */
289
   if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
290
/*
291
 * power of 2 part of the processes collection:  only processes [0..ip2)
292
 * are working;  some of them  (mydist >> (k+1) == 0) either send or re-
293
 * ceive U.  At every step k, k is in [0 .. hdim),  of the algorithm,  a
294
 * process pair that exchanges  U  is such that  (mydist >> (k+1) == 0).
295
 * Among  those  processes,  the  ones  that are sending U are such that 
296
 * mydist >> k == 0.
297
 */
298
   if( mydist < ip2 )
299
   {
300
      k = 0;
301

    
302
      while( k < hdim )
303
      {
304
         partner = (int)(mydist ^ ipow);
305
         partner = MModAdd( icurrow, partner, nprow );
306
/*
307
 * Exchange and combine the local results - If I receive U,  then I must
308
 * copy from U the rows that belong to my piece of A, and then update  U
309
 * by  copying in it the rows I have accumulated in W.  Otherwise, I re-
310
 * ceive W.  In this later case, and I have U, I shall update my copy of
311
 * U by copying in it the rows I have accumulated in  W.  If  I  did not
312
 * have U before, I simply need to update my pointer in W for later use.
313
 */
314
         if( ( mydist >> (unsigned int)( k + 1 ) ) == 0 )
315
         {
316
            if( ( mydist >> (unsigned int)(k) ) == 0 )
317
            {
318
               (void) HPL_sdrv( U, usize, Cmsgid, Mptr( W, 0, ipW,
319
                                ldW ), llen[partner]*ldW, Cmsgid,
320
                                partner, comm );
321
               HPL_dlaswp03T( llen[partner], n, U, LDU, Mptr( W, 0, ipW,
322
                              ldW ), Mptr( W, 1, ipW, ldW ), ldW );
323
               ipW += llen[partner];
324
            }
325
            else
326
            {
327
               (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, U, usize,
328
                                Cmsgid, partner, comm );
329
               HPL_dlaswp04T( ipA, llen[myrow], n, U, LDU, A, lda, W,
330
                              W+1, ldW, lindxA, lindxAU );
331
            }
332
         }
333
         else
334
         {
335
            (void) HPL_sdrv( W, llen[myrow]*ldW, Cmsgid, Mptr( W, 0,
336
                             ipW, ldW ), llen[partner]*ldW, Cmsgid,
337
                             partner, comm );
338
            ipW += llen[partner];
339
         }
340
/*
341
 * Update llen - Go to next process pairs
342
 */
343
         iprow = icurrow; ipdist = 0;
344
         do
345
         {
346
            if( (unsigned int)( partner = (int)(ipdist ^ ipow) ) > ipdist )
347
            {
348
               partner = MModAdd( icurrow, partner, nprow );
349
               llen[iprow]  += llen[partner];
350
               llen[partner] = llen[iprow];
351
            }
352
            iprow = MModAdd( iprow, 1, nprow ); ipdist++;
353

    
354
         } while( ipdist < ip2 );
355

    
356
         ipow <<= 1; k++;
357
/*
358
 * Probe for column panel - forward it when available 
359
 */
360
         if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
361
      }
362
   }
363
   else
364
   {
365
/*
366
 * non power of 2 part of the process collection:  proc[ip2] broadcast U
367
 * to procs[ip2..nprow) (relatively to icurrow).
368
 */
369
      if( size_ > 1 )
370
      {
371
         k = size_ - 1;
372
         while( k > 1 ) { k >>= 1; ip2_ <<= 1; mask <<= 1; mask++; }
373
         root   = MModAdd( icurrow, (int)(ip2), nprow );
374
         mydis_ = (unsigned int)MModSub( myrow,  root, nprow );
375

    
376
         do
377
         {
378
            mask ^= ip2_;
379
            if( ( mydis_ & mask ) == 0 )
380
            {
381
               partner = (int)(mydis_ ^ ip2_);
382
               if( ( mydis_ & ip2_ ) != 0 )
383
               {
384
                  (void) HPL_recv( U, usize, MModAdd( root, partner,
385
                                   nprow ), Cmsgid, comm );
386

    
387
               }
388
               else if( partner < size_ )
389
               {
390
                  (void) HPL_send( U, usize, MModAdd( root, partner,
391
                                   nprow ), Cmsgid, comm );
392
               }
393
            }
394
            ip2_ >>= 1;
395
/*
396
 * Probe for column panel - forward it when available 
397
 */
398
            if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
399

    
400
         } while( ip2_ > 0 );
401
      }
402
/*
403
 * Every process in [ip2..nprow) (relatively to icurrow) grabs its piece
404
 * of A.
405
 */
406
      HPL_dlaswp05T( ipA, n, A, lda, U, LDU, lindxA, lindxAU );
407
   }
408
/*
409
 * If  nprow  is not a power of 2,  proc[i-ip2]  sends  global result to
410
 * proc[i] for all i in [ip2..nprow);
411
 */
412
   if( ( Np2 != 0 ) && ( ( partner = (int)(mydist ^ ip2) ) < nprow ) )
413
   {
414
      partner = MModAdd( icurrow, partner, nprow );
415
      if( ( mydist & ip2 ) != 0 )
416
      { (void) HPL_recv( U, usize, partner, Cmsgid, comm ); }
417
      else
418
      { (void) HPL_send( U, usize, partner, Cmsgid, comm ); }
419
   }
420

    
421
   if( vptr ) free( vptr );
422
/*
423
 * Probe for column panel - forward it when available 
424
 */
425
   if( *IFLAG == HPL_KEEP_TESTING ) (void) HPL_bcast( PBCST, IFLAG );
426

    
427
#ifdef HPL_DETAILED_TIMING
428
   HPL_ptimer( HPL_TIMING_LASWP );
429
#endif
430
/*
431
 * End of HPL_pdlaswp00T
432
 */
433
}