Actual source code: blockinvert.h

  1: /*
  2:     Kernels used in sparse ILU (and LU) and in the resulting triangular
  3:  solves. These are for block algorithms where the block sizes are on
  4:  the order of 2-6+.

  6:     There are TWO versions of the macros below.
  7:     1) standard for MatScalar == PetscScalar use the standard BLAS for
  8:        block size larger than 7 and
  9:     2) handcoded Fortran single precision for the matrices, since BLAS
 10:        does not have some arguments in single and some in double.

 12: */
 15: #include <petscblaslapack.h>

 17: /*
 18:       These are C kernels,they are contained in
 19:    src/mat/impls/baij/seq
 20: */

 22: PETSC_INTERN PetscErrorCode PetscLINPACKgefa(MatScalar*,PetscInt,PetscInt*,PetscBool,PetscBool*);
 23: PETSC_INTERN PetscErrorCode PetscLINPACKgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
 24: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_2(MatScalar*,PetscReal,PetscBool,PetscBool*);
 25: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_3(MatScalar*,PetscReal,PetscBool,PetscBool*);

 27: #define PetscKernel_A_gets_inverse_A_4_nopivot(mat) 0; do {     \
 28:     MatScalar d, di = mat[0];                                   \
 29:     mat[0]   = d = 1.0 / di;                                    \
 30:     mat[4]  *= -d;                                              \
 31:     mat[8]  *= -d;                                              \
 32:     mat[12] *= -d;                                              \
 33:     mat[1]  *= d;                                               \
 34:     mat[2]  *= d;                                               \
 35:     mat[3]  *= d;                                               \
 36:     mat[5]  += mat[4] * mat[1] * di;                            \
 37:     mat[6]  += mat[4] * mat[2] * di;                            \
 38:     mat[7]  += mat[4] * mat[3] * di;                            \
 39:     mat[9]  += mat[8] * mat[1] * di;                            \
 40:     mat[10] += mat[8] * mat[2] * di;                            \
 41:     mat[11] += mat[8] * mat[3] * di;                            \
 42:     mat[13] += mat[12] * mat[1] * di;                           \
 43:     mat[14] += mat[12] * mat[2] * di;                           \
 44:     mat[15] += mat[12] * mat[3] * di;                           \
 45:     di       = mat[5];                                          \
 46:     mat[5]   = d = 1.0 / di;                                    \
 47:     mat[1]  *= -d;                                              \
 48:     mat[9]  *= -d;                                              \
 49:     mat[13] *= -d;                                              \
 50:     mat[4]  *= d;                                               \
 51:     mat[6]  *= d;                                               \
 52:     mat[7]  *= d;                                               \
 53:     mat[0]  += mat[1] * mat[4] * di;                            \
 54:     mat[2]  += mat[1] * mat[6] * di;                            \
 55:     mat[3]  += mat[1] * mat[7] * di;                            \
 56:     mat[8]  += mat[9] * mat[4] * di;                            \
 57:     mat[10] += mat[9] * mat[6] * di;                            \
 58:     mat[11] += mat[9] * mat[7] * di;                            \
 59:     mat[12] += mat[13] * mat[4] * di;                           \
 60:     mat[14] += mat[13] * mat[6] * di;                           \
 61:     mat[15] += mat[13] * mat[7] * di;                           \
 62:     di       = mat[10];                                         \
 63:     mat[10]  = d = 1.0 / di;                                    \
 64:     mat[2]  *= -d;                                              \
 65:     mat[6]  *= -d;                                              \
 66:     mat[14] *= -d;                                              \
 67:     mat[8]  *= d;                                               \
 68:     mat[9]  *= d;                                               \
 69:     mat[11] *= d;                                               \
 70:     mat[0]  += mat[2] * mat[8] * di;                            \
 71:     mat[1]  += mat[2] * mat[9] * di;                            \
 72:     mat[3]  += mat[2] * mat[11] * di;                           \
 73:     mat[4]  += mat[6] * mat[8] * di;                            \
 74:     mat[5]  += mat[6] * mat[9] * di;                            \
 75:     mat[7]  += mat[6] * mat[11] * di;                           \
 76:     mat[12] += mat[14] * mat[8] * di;                           \
 77:     mat[13] += mat[14] * mat[9] * di;                           \
 78:     mat[15] += mat[14] * mat[11] * di;                          \
 79:     di       = mat[15];                                         \
 80:     mat[15]  = d = 1.0 / di;                                    \
 81:     mat[3]  *= -d;                                              \
 82:     mat[7]  *= -d;                                              \
 83:     mat[11] *= -d;                                              \
 84:     mat[12] *= d;                                               \
 85:     mat[13] *= d;                                               \
 86:     mat[14] *= d;                                               \
 87:     mat[0]  += mat[3] * mat[12] * di;                           \
 88:     mat[1]  += mat[3] * mat[13] * di;                           \
 89:     mat[2]  += mat[3] * mat[14] * di;                           \
 90:     mat[4]  += mat[7] * mat[12] * di;                           \
 91:     mat[5]  += mat[7] * mat[13] * di;                           \
 92:     mat[6]  += mat[7] * mat[14] * di;                           \
 93:     mat[8]  += mat[11] * mat[12] * di;                          \
 94:     mat[9]  += mat[11] * mat[13] * di;                          \
 95:     mat[10] += mat[11] * mat[14] * di;                          \
 96:   } while (0)

 98: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar*,PetscReal,PetscBool,PetscBool*);
 99: # if defined(PETSC_HAVE_SSE)
100: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(MatScalar*);
101: # endif
102: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_5(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);
103: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_6(MatScalar*,PetscReal,PetscBool,PetscBool*);
104: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_7(MatScalar*,PetscReal,PetscBool,PetscBool*);
105: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_9(MatScalar*,PetscReal,PetscBool,PetscBool*);
106: PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_15(MatScalar*,PetscInt*,MatScalar*,PetscReal,PetscBool,PetscBool*);

108: /*
109:     A = inv(A)    A_gets_inverse_A

111:    A      - square bs by bs array stored in column major order
112:    pivots - integer work array of length bs
113:    W      -  bs by 1 work array
114: */
115: #define PetscKernel_A_gets_inverse_A(bs,A,pivots,W,allowzeropivot,zeropivotdetected) (PetscLINPACKgefa((A),(bs),(pivots),(allowzeropivot),(zeropivotdetected)) || PetscLINPACKgedi((A),(bs),(pivots),(W)))

117: /* -----------------------------------------------------------------------*/

119: #if !defined(PETSC_USE_REAL_MAT_SINGLE)
120: /*
121:         Version that calls the BLAS directly
122: */
123: /*
124:       A = A * B   A_gets_A_times_B

126:    A, B - square bs by bs arrays stored in column major order
127:    W    - square bs by bs work array

129: */
130: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do {                     \
131:     PetscBLASInt   _bbs;                                                \
132:     PetscScalar    _one = 1.0,_zero = 0.0;                              \
133:     PetscErrorCode _ierr;                                               \
134:     _PetscBLASIntCast(bs,&_bbs);                   \
135:     _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr);            \
139:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs))); \
140:   } while (0)

142: /*

144:     A = A - B * C  A_gets_A_minus_B_times_C

146:    A, B, C - square bs by bs arrays stored in column major order
147: */
148: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) do {             \
149:     PetscScalar    _mone = -1.0,_one = 1.0;                             \
150:     PetscBLASInt   _bbs;                                                \
151:     PetscErrorCode _ierr;                                               \
152:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
156:     PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
157:   } while (0)

159: /*

161:     A = A + B^T * C  A_gets_A_plus_Btranspose_times_C

163:    A, B, C - square bs by bs arrays stored in column major order
164: */
165: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) do {     \
166:     PetscScalar    _one = 1.0;                                          \
167:     PetscBLASInt   _bbs;                                                \
168:     PetscErrorCode _ierr;                                               \
169:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
173:     PetscStackCallBLAS("BLASgemm",BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs))); \
174:   } while (0)

176: /*
177:     v = v + A^T w  v_gets_v_plus_Atranspose_times_w

179:    v - array of length bs
180:    A - square bs by bs array
181:    w - array of length bs
182: */
183: #define  PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) do {    \
184:     PetscScalar    _one  = 1.0;                                         \
185:     PetscBLASInt   _ione = 1, _bbs;                                     \
186:     PetscErrorCode _ierr;                                               \
187:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
191:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
192:   } while (0)

194: /*
195:     v = v - A w  v_gets_v_minus_A_times_w

197:    v - array of length bs
198:    A - square bs by bs array
199:    w - array of length bs
200: */
201: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) do {            \
202:     PetscScalar    _mone = -1.0,_one = 1.0;                             \
203:     PetscBLASInt   _ione = 1,_bbs;                                      \
204:     PetscErrorCode _ierr;                                               \
205:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
209:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
210:   } while (0)

212: /*
213:     v = v - A w  v_gets_v_minus_transA_times_w

215:    v - array of length bs
216:    A - square bs by bs array
217:    w - array of length bs
218: */
219: #define  PetscKernel_v_gets_v_minus_transA_times_w(bs,v,A,w) do {       \
220:     PetscScalar    _mone = -1.0,_one = 1.0;                             \
221:     PetscBLASInt   _ione = 1,_bbs;                                      \
222:     PetscErrorCode _ierr;                                               \
223:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
227:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
228:   } while (0)

230: /*
231:     v = v + A w  v_gets_v_plus_A_times_w

233:    v - array of length bs
234:    A - square bs by bs array
235:    w - array of length bs
236: */
237: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) do {             \
238:     PetscScalar    _one  = 1.0;                                         \
239:     PetscBLASInt   _ione = 1,_bbs;                                      \
240:     PetscErrorCode _ierr;                                               \
241:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
245:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione)); \
246:   } while (0)

248: /*
249:     v = v + A w  v_gets_v_plus_Ar_times_w

251:    v - array of length bs
252:    A - square bs by bs array
253:    w - array of length bs
254: */
255: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) do {      \
256:     PetscScalar    _one  = 1.0;                                         \
257:     PetscBLASInt   _ione = 1,_bbs,_bncols;                              \
258:     PetscErrorCode _ierr;                                               \
259:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
260:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);            \
264:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
265:   } while (0)

267: /*
268:     w <- w - A v  w_gets_w_minus_Ar_times_v

270:    v - array of length ncol
271:    A(bs,ncols)
272:    w - array of length bs
273: */
274: #define  PetscKernel_w_gets_w_minus_Ar_times_v(bs,ncols,w,A,v) do {     \
275:     PetscScalar    _one  = 1.0,_mone = -1.0;                            \
276:     PetscBLASInt   _ione = 1,_bbs,_bncols;                              \
277:     PetscErrorCode _ierr;                                               \
278:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
279:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);            \
283:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bncols),&_mone,A,&(_bbs),v,&_ione,&_one,w,&_ione)); \
284:   } while (0)

286: /*
287:     w = A v   w_gets_A_times_v

289:    v - array of length bs
290:    A - square bs by bs array
291:    w - array of length bs
292:  */
293: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) do {                     \
294:     PetscScalar    _zero = 0.0,_one = 1.0;                              \
295:     PetscBLASInt   _ione = 1,_bbs;                                      \
296:     PetscErrorCode _ierr;                                               \
297:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
301:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
302:   } while (0)

304: /*
305:     w = A' v   w_gets_transA_times_v

307:    v - array of length bs
308:    A - square bs by bs array
309:    w - array of length bs
310: */
311: #define PetscKernel_w_gets_transA_times_v(bs,v,A,w) do {                \
312:     PetscScalar    _zero = 0.0,_one = 1.0;                              \
313:     PetscBLASInt   _ione = 1,_bbs;                                      \
314:     PetscErrorCode _ierr;                                               \
315:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
319:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione)); \
320:   } while (0)

322: /*
323:         z = A*x
324: */
325: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) do {              \
326:     PetscScalar    _one  = 1.0,_zero = 0.0;                             \
327:     PetscBLASInt   _ione = 1,_bbs,_bncols;                              \
328:     PetscErrorCode _ierr;                                               \
329:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
330:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);            \
334:     PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione)); \
335:   } while (0)

337: /*
338:         z = trans(A)*x
339: */
340: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) do { \
341:     PetscScalar    _one  = 1.0;                                         \
342:     PetscBLASInt   _ione = 1,_bbs,_bncols;                              \
343:     PetscErrorCode _ierr;                                               \
344:     _PetscBLASIntCast(bs,&_bbs);CHKERRQ(_ierr);                  \
345:     _PetscBLASIntCast(ncols,&_bncols);CHKERRQ(_ierr);            \
349:     PetscStackCallBLAS("BLASgemv",BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione)); \
350:   } while (0)

352: #else /* !defined(PETSC_USE_REAL_MAT_SINGLE) */
353: /*
354:        Version that calls Fortran routines; can handle different precision
355:    of matrix (array) and vectors
356: */
357: /*
358:      These are Fortran kernels: They replace certain BLAS routines but
359:    have some arguments that may be single precision,rather than double
360:    These routines are provided in src/fortran/kernels/sgemv.F
361:    They are pretty pitiful but get the job done. The intention is
362:    that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
363:    code is used.
364: */

366: #if defined(PETSC_HAVE_FORTRAN_CAPS)
367: #define msgemv_  MSGEMV
368: #define msgemvp_ MSGEMVP
369: #define msgemvm_ MSGEMVM
370: #define msgemvt_ MSGEMVT
371: #define msgemmi_ MSGEMMI
372: #define msgemm_  MSGEMM
373: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
374: #define msgemv_  msgemv
375: #define msgemvp_ msgemvp
376: #define msgemvm_ msgemvm
377: #define msgemvt_ msgemvt
378: #define msgemmi_ msgemmi
379: #define msgemm_  msgemm
380: #endif

382: PETSC_EXTERN void msgemv_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
383: PETSC_EXTERN void msgemvp_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
384: PETSC_EXTERN void msgemvm_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
385: PETSC_EXTERN void msgemvt_(PetscInt*,PetscInt*,MatScalar*,PetscScalar*,PetscScalar*);
386: PETSC_EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
387: PETSC_EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);

389: /*
390:       A = A * B   A_gets_A_times_B

392:    A, B - square bs by bs arrays stored in column major order
393:    W    - square bs by bs work array

395: */
396: #define PetscKernel_A_gets_A_times_B(bs,A,B,W) do {                     \
397:     PetscErrorCode _ierr;                                               \
398:     _PetscArraycpy((W),(A),(bs)*(bs));CHKERRQ(_ierr);            \
399:     msgemmi_(&bs,A,B,W);                                                \
400:   } while (0)

402: /*

404:     A = A - B * C  A_gets_A_minus_B_times_C

406:    A, B, C - square bs by bs arrays stored in column major order
407: */
408: #define PetscKernel_A_gets_A_minus_B_times_C(bs,A,B,C) msgemm_(&(bs),(A),(B),(C))

410: /*
411:     v = v - A w  v_gets_v_minus_A_times_w

413:    v - array of length bs
414:    A - square bs by bs array
415:    w - array of length bs
416: */
417: #define  PetscKernel_v_gets_v_minus_A_times_w(bs,v,A,w) msgemvm_(&(bs),&(bs),(A),(w),(v))

419: /*
420:     v = v + A w  v_gets_v_plus_A_times_w

422:    v - array of length bs
423:    A - square bs by bs array
424:    w - array of length bs
425: */
426: #define  PetscKernel_v_gets_v_plus_A_times_w(bs,v,A,w) msgemvp_(&(bs),&(bs),(A),(w),(v))

428: /*
429:     v = v + A w  v_gets_v_plus_Ar_times_w

431:    v - array of length bs
432:    A - square bs by bs array
433:    w - array of length bs
434: */
435: #define  PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) msgemvp_(&(bs),&(ncol),(A),(v),(w))

437: /*
438:     w = A v   w_gets_A_times_v

440:    v - array of length bs
441:    A - square bs by bs array
442:    w - array of length bs
443: */
444: #define PetscKernel_w_gets_A_times_v(bs,v,A,w) msgemv_(&(bs),&(bs),(A),(v),(w))

446: /*
447:         z = A*x
448: */
449: #define PetscKernel_w_gets_Ar_times_v(bs,ncols,x,A,z) msgemv_(&(bs),&(ncols),(A),(x),(z))

451: /*
452:         z = trans(A)*x
453: */
454: #define PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) msgemvt_(&(bs),&(ncols),(A),(x),(z))

456: /* These do not work yet */
457: #define PetscKernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
458: #define PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)

460: #endif /* !defined(PETSC_USE_REAL_MAT_SINGLE) */

462: #endif /* __ILU_H */