Statistiques
| Révision :

root / src / pfact / HPL_pdrpancrN.c @ 1

Historique | Voir | Annoter | Télécharger (11,08 ko)

1
/* 
2
 * -- High Performance Computing Linpack Benchmark (HPL)                
3
 *    HPL - 2.0 - September 10, 2008                          
4
 *    Antoine P. Petitet                                                
5
 *    University of Tennessee, Knoxville                                
6
 *    Innovative Computing Laboratory                                 
7
 *    (C) Copyright 2000-2008 All Rights Reserved                       
8
 *                                                                      
9
 * -- Copyright notice and Licensing terms:                             
10
 *                                                                      
11
 * Redistribution  and  use in  source and binary forms, with or without
12
 * modification, are  permitted provided  that the following  conditions
13
 * are met:                                                             
14
 *                                                                      
15
 * 1. Redistributions  of  source  code  must retain the above copyright
16
 * notice, this list of conditions and the following disclaimer.        
17
 *                                                                      
18
 * 2. Redistributions in binary form must reproduce  the above copyright
19
 * notice, this list of conditions,  and the following disclaimer in the
20
 * documentation and/or other materials provided with the distribution. 
21
 *                                                                      
22
 * 3. All  advertising  materials  mentioning  features  or  use of this
23
 * software must display the following acknowledgement:                 
24
 * This  product  includes  software  developed  at  the  University  of
25
 * Tennessee, Knoxville, Innovative Computing Laboratory.             
26
 *                                                                      
27
 * 4. The name of the  University,  the name of the  Laboratory,  or the
28
 * names  of  its  contributors  may  not  be used to endorse or promote
29
 * products  derived   from   this  software  without  specific  written
30
 * permission.                                                          
31
 *                                                                      
32
 * -- Disclaimer:                                                       
33
 *                                                                      
34
 * THIS  SOFTWARE  IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,  INCLUDING,  BUT NOT
36
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
38
 * OR  CONTRIBUTORS  BE  LIABLE FOR ANY  DIRECT,  INDIRECT,  INCIDENTAL,
39
 * SPECIAL,  EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES  (INCLUDING,  BUT NOT
40
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41
 * DATA OR PROFITS; OR BUSINESS INTERRUPTION)  HOWEVER CAUSED AND ON ANY
42
 * THEORY OF LIABILITY, WHETHER IN CONTRACT,  STRICT LIABILITY,  OR TORT
43
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
45
 * ---------------------------------------------------------------------
46
 */ 
47
/*
48
 * Include files
49
 */
50
#include "hpl.h"
51

    
52
#ifdef STDC_HEADERS
53
void HPL_pdrpancrN
54
(
55
   HPL_T_panel *                    PANEL,
56
   const int                        M,
57
   const int                        N,
58
   const int                        ICOFF,
59
   double *                         WORK
60
)
61
#else
62
void HPL_pdrpancrN
63
( PANEL, M, N, ICOFF, WORK )
64
   HPL_T_panel *                    PANEL;
65
   const int                        M;
66
   const int                        N;
67
   const int                        ICOFF;
68
   double *                         WORK;
69
#endif
70
{
71
/* 
72
 * Purpose
73
 * =======
74
 *
75
 * HPL_pdrpancrN HPL_pdrpancrN recursively  factorizes  a panel of columns  using  the
76
 * recursive  Crout  variant of the usual one-dimensional algorithm. The
77
 * lower triangular  N0-by-N0  upper block  of  the  panel  is stored in
78
 * no-transpose form (i.e. just like the input matrix itself).
79
 *  
80
 * Bi-directional  exchange  is  used  to  perform  the  swap::broadcast
81
 * operations  at once  for one column in the panel.  This  results in a
82
 * lower number of slightly larger  messages than usual.  On P processes
83
 * and assuming bi-directional links,  the running time of this function
84
 * can be approximated by (when N is equal to N0):                      
85
 *  
86
 *    N0 * log_2( P ) * ( lat + ( 2*N0 + 4 ) / bdwth ) +
87
 *    N0^2 * ( M - N0/3 ) * gam2-3
88
 *  
89
 * where M is the local number of rows of  the panel, lat and bdwth  are
90
 * the latency and bandwidth of the network for  double  precision  real
91
 * words, and  gam2-3  is  an estimate of the  Level 2 and Level 3  BLAS
92
 * rate of execution. The  recursive  algorithm  allows indeed to almost
93
 * achieve  Level 3 BLAS  performance  in the panel factorization.  On a
94
 * large  number of modern machines,  this  operation is however latency
95
 * bound,  meaning  that its cost can  be estimated  by only the latency
96
 * portion N0 * log_2(P) * lat.  Mono-directional links will double this
97
 * communication cost.
98
 *
99
 * Arguments
100
 * =========
101
 *
102
 * PANEL   (local input/output)          HPL_T_panel *
103
 *         On entry,  PANEL  points to the data structure containing the
104
 *         panel information.
105
 *
106
 * M       (local input)                 const int
107
 *         On entry,  M specifies the local number of rows of sub(A).
108
 *
109
 * N       (local input)                 const int
110
 *         On entry,  N specifies the local number of columns of sub(A).
111
 *
112
 * ICOFF   (global input)                const int
113
 *         On entry, ICOFF specifies the row and column offset of sub(A)
114
 *         in A.
115
 *
116
 * WORK    (local workspace)             double *
117
 *         On entry, WORK  is a workarray of size at least 2*(4+2*N0).
118
 *
119
 * ---------------------------------------------------------------------
120
 */ 
121
/*
122
 * .. Local Variables ..
123
 */
124
   double                     * A, * Aptr, * L1, * L1ptr;
125
#ifdef HPL_CALL_VSIPL
126
   vsip_mview_d               * Av0, * Lv0, * Av1, * Av2, * Lv1;
127
#endif
128
   int                        curr, ii, ioff, jb, jj, lda, m, n, n0, nb,
129
                              nbdiv, nbmin;
130
/* ..
131
 * .. Executable Statements ..
132
 */
133
   if( N <= ( nbmin = PANEL->algo->nbmin ) )
134
   { PANEL->algo->pffun( PANEL, M, N, ICOFF, WORK ); return; }
135
/*
136
 * Find  new recursive blocking factor.  To avoid an infinite loop,  one
137
 * must guarantee: 1 <= jb < N, knowing that  N  is greater than  NBMIN.
138
 * First, we compute nblocks:  the number of blocks of size  NBMIN in N,
139
 * including the last one that may be smaller.  nblocks  is thus  larger
140
 * than or equal to one, since N >= NBMIN.
141
 * The ratio ( nblocks + NDIV - 1 ) / NDIV  is thus larger than or equal
142
 * to one as  well.  For  NDIV >= 2,  we  are guaranteed  that the quan-
143
 * tity ( ( nblocks + NDIV  - 1 ) / NDIV ) * NBMIN  is less  than N  and
144
 * greater than or equal to NBMIN.
145
 */
146
   nbdiv = PANEL->algo->nbdiv; ii = jj = 0; m = M; n = N;
147
   nb = jb = ( (((N+nbmin-1) / nbmin) + nbdiv  - 1) / nbdiv ) * nbmin;
148

    
149
   A     = PANEL->A;   lda = PANEL->lda;
150
   L1    = PANEL->L1;  n0  = PANEL->jb;
151
   L1ptr = Mptr( L1, ICOFF, ICOFF, n0 );
152
   curr  = (int)( PANEL->grid->myrow == PANEL->prow );
153

    
154
   if( curr != 0 ) Aptr = Mptr( A, ICOFF, ICOFF, lda );
155
   else            Aptr = Mptr( A,     0, ICOFF, lda );
156
/*
157
 * The triangular solve is replicated in every  process row.  The  panel
158
 * factorization is  such that  the first rows of  A  are accumulated in
159
 * every process row during the (panel) swapping phase.  We  ensure this
160
 * way a minimum amount  of communication during the entire panel facto-
161
 * rization.
162
 */
163
   do
164
   {
165
      n -= jb; ioff = ICOFF + jj;
166
/*
167
 * Local update - Factor current panel - Replicated update and solve
168
 */
169
#ifdef HPL_CALL_VSIPL
170
/*
171
 * Admit the blocks
172
 */
173
      (void) vsip_blockadmit_d( PANEL->Ablock,  VSIP_TRUE );
174
      (void) vsip_blockadmit_d( PANEL->L1block, VSIP_TRUE );
175
/*
176
 * Create the matrix views
177
 */
178
      Av0 = vsip_mbind_d( PANEL->Ablock,  0, 1, lda, lda, PANEL->pmat->nq );
179
      Lv0 = vsip_mbind_d( PANEL->L1block, 0, 1,  n0,  n0, n0              );
180
/*
181
 * Create the matrix subviews
182
 */
183
      if( curr != 0 )
184
      {
185
         Av1 = vsip_msubview_d( Av0, PANEL->ii+ICOFF+ii, PANEL->jj+ICOFF,
186
                                m,  jj );
187
         Av2 = vsip_msubview_d( Av0, PANEL->ii+ICOFF+ii, PANEL->jj+ioff,
188
                                m, jb );
189
      }
190
      else
191
      {
192
         Av1 = vsip_msubview_d( Av0, PANEL->ii+ii, PANEL->jj+ICOFF, m, jj );
193
         Av2 = vsip_msubview_d( Av0, PANEL->ii+ii, PANEL->jj+ioff,  m, jb );
194
      }
195
      Lv1 = vsip_msubview_d( Lv0, ICOFF, ioff, jj, jb );
196

    
197
      vsip_gemp_d( -HPL_rone, Av1, VSIP_MAT_NTRANS, Lv1, VSIP_MAT_NTRANS,
198
                   HPL_rone, Av2 );
199
/*
200
 * Destroy the matrix subviews
201
 */
202
      (void) vsip_mdestroy_d( Lv1 );
203
      (void) vsip_mdestroy_d( Av2 );
204
      (void) vsip_mdestroy_d( Av1 );
205
/*
206
 * Release the blocks
207
 */
208
      (void) vsip_blockrelease_d( vsip_mgetblock_d( Lv0 ), VSIP_TRUE );
209
      (void) vsip_blockrelease_d( vsip_mgetblock_d( Av0 ), VSIP_TRUE );
210
/*
211
 * Destroy the matrix views
212
 */
213
      (void) vsip_mdestroy_d( Lv0 );
214
      (void) vsip_mdestroy_d( Av0 );
215
#else
216
      HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, m, jb, jj,
217
                 -HPL_rone, Mptr( Aptr, ii, 0, lda ), lda, Mptr( L1ptr,
218
                 0, jj, n0 ), n0, HPL_rone, Mptr( Aptr, ii, jj, lda ),
219
                 lda );
220
#endif
221
      HPL_pdrpancrN( PANEL, m, jb, ioff, WORK );
222

    
223
      if( n > 0 )
224
      {
225
#ifdef HPL_CALL_VSIPL
226
/*
227
 * Admit the blocks
228
 */
229
         (void) vsip_blockadmit_d( PANEL->L1block, VSIP_TRUE );
230
/*
231
 * Create the matrix views
232
 */
233
         Lv0 = vsip_mbind_d( PANEL->L1block, 0, 1,  n0,  n0, n0 );
234
/*
235
 * Create the matrix subviews
236
 */
237
         Av1 = vsip_msubview_d( Lv0, ioff,  ICOFF,   jb, jj );
238
         Av2 = vsip_msubview_d( Lv0, ioff,  ioff+jb, jb,  n );
239
         Lv1 = vsip_msubview_d( Lv0, ICOFF, ioff+jb, jj,  n );
240

    
241
         vsip_gemp_d( -HPL_rone, Av1, VSIP_MAT_NTRANS, Lv1, VSIP_MAT_NTRANS,
242
                      HPL_rone, Av2 );
243
/*
244
 * Destroy the matrix subviews
245
 */
246
         (void) vsip_mdestroy_d( Lv1 );
247
         (void) vsip_mdestroy_d( Av2 );
248
         (void) vsip_mdestroy_d( Av1 );
249
/*
250
 * Release the blocks
251
 */
252
         (void) vsip_blockrelease_d( vsip_mgetblock_d( Lv0 ), VSIP_TRUE );
253
/*
254
 * Destroy the matrix views
255
 */
256
         (void) vsip_mdestroy_d( Lv0 );
257
#else
258
         HPL_dgemm( HplColumnMajor, HplNoTrans, HplNoTrans, jb, n,
259
                    jj, -HPL_rone, Mptr( L1ptr, jj, 0, n0 ), n0,
260
                    Mptr( L1ptr, 0, jj+jb, n0 ), n0, HPL_rone, 
261
                    Mptr( L1ptr, jj, jj+jb, n0 ), n0 );
262
#endif
263
         HPL_dtrsm( HplColumnMajor, HplLeft, HplLower, HplNoTrans,
264
                    HplUnit, jb, n, HPL_rone, Mptr( L1ptr, jj, jj,
265
                    n0 ), n0, Mptr( L1ptr, jj, jj+jb, n0 ), n0 );
266
      }
267
/*
268
 * Copy back upper part of A in current process row - Go the next block
269
 */
270
      if( curr != 0 )
271
      {
272
         HPL_dlacpy( ioff, jb, Mptr( L1, 0, ioff, n0 ), n0,
273
                     Mptr( A, 0, ioff, lda ), lda );
274
         ii += jb; m -= jb;
275
      }
276
      jj += jb; jb = Mmin( n, nb );
277

    
278
   } while( n > 0 );
279
/*
280
 * End of HPL_pdrpancrN
281
 */
282
}