Actual source code: ilu.h
1: /*
2: Kernels used in sparse ILU (and LU) and in the resulting triangular
3: solves. These are for block algorithms where the block sizes are on
4: the order of 2-6+.
6: There are TWO versions of the macros below.
7: 1) standard for MatScalar == PetscScalar use the standard BLAS for
8: block size larger than 7 and
9: 2) handcoded Fortran single precision for the matrices, since BLAS
10: does not have some arguments in single and some in double.
12: */
16: #include petscblaslapack.h
18: /*
19: These are C kernels,they are contained in
20: src/mat/impls/baij/seq
21: */
23: EXTERN PetscErrorCode LINPACKdgefa(MatScalar*,PetscInt,PetscInt*);
24: EXTERN PetscErrorCode LINPACKdgedi(MatScalar*,PetscInt,PetscInt*,MatScalar*);
25: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_2(MatScalar*);
26: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_3(MatScalar*);
28: #define Kernel_A_gets_inverse_A_4_nopivot(mat) 0;\
29: {\
30: MatScalar d, di;\
31: \
32: di = mat[0];\
33: mat[0] = d = 1.0 / di;\
34: mat[4] *= -d;\
35: mat[8] *= -d;\
36: mat[12] *= -d;\
37: mat[1] *= d;\
38: mat[2] *= d;\
39: mat[3] *= d;\
40: mat[5] += mat[4] * mat[1] * di;\
41: mat[6] += mat[4] * mat[2] * di;\
42: mat[7] += mat[4] * mat[3] * di;\
43: mat[9] += mat[8] * mat[1] * di;\
44: mat[10] += mat[8] * mat[2] * di;\
45: mat[11] += mat[8] * mat[3] * di;\
46: mat[13] += mat[12] * mat[1] * di;\
47: mat[14] += mat[12] * mat[2] * di;\
48: mat[15] += mat[12] * mat[3] * di;\
49: di = mat[5];\
50: mat[5] = d = 1.0 / di;\
51: mat[1] *= -d;\
52: mat[9] *= -d;\
53: mat[13] *= -d;\
54: mat[4] *= d;\
55: mat[6] *= d;\
56: mat[7] *= d;\
57: mat[0] += mat[1] * mat[4] * di;\
58: mat[2] += mat[1] * mat[6] * di;\
59: mat[3] += mat[1] * mat[7] * di;\
60: mat[8] += mat[9] * mat[4] * di;\
61: mat[10] += mat[9] * mat[6] * di;\
62: mat[11] += mat[9] * mat[7] * di;\
63: mat[12] += mat[13] * mat[4] * di;\
64: mat[14] += mat[13] * mat[6] * di;\
65: mat[15] += mat[13] * mat[7] * di;\
66: di = mat[10];\
67: mat[10] = d = 1.0 / di;\
68: mat[2] *= -d;\
69: mat[6] *= -d;\
70: mat[14] *= -d;\
71: mat[8] *= d;\
72: mat[9] *= d;\
73: mat[11] *= d;\
74: mat[0] += mat[2] * mat[8] * di;\
75: mat[1] += mat[2] * mat[9] * di;\
76: mat[3] += mat[2] * mat[11] * di;\
77: mat[4] += mat[6] * mat[8] * di;\
78: mat[5] += mat[6] * mat[9] * di;\
79: mat[7] += mat[6] * mat[11] * di;\
80: mat[12] += mat[14] * mat[8] * di;\
81: mat[13] += mat[14] * mat[9] * di;\
82: mat[15] += mat[14] * mat[11] * di;\
83: di = mat[15];\
84: mat[15] = d = 1.0 / di;\
85: mat[3] *= -d;\
86: mat[7] *= -d;\
87: mat[11] *= -d;\
88: mat[12] *= d;\
89: mat[13] *= d;\
90: mat[14] *= d;\
91: mat[0] += mat[3] * mat[12] * di;\
92: mat[1] += mat[3] * mat[13] * di;\
93: mat[2] += mat[3] * mat[14] * di;\
94: mat[4] += mat[7] * mat[12] * di;\
95: mat[5] += mat[7] * mat[13] * di;\
96: mat[6] += mat[7] * mat[14] * di;\
97: mat[8] += mat[11] * mat[12] * di;\
98: mat[9] += mat[11] * mat[13] * di;\
99: mat[10] += mat[11] * mat[14] * di;\
100: }
102: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *);
103: # if defined(PETSC_HAVE_SSE)
104: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(MatScalar *);
105: # endif
106: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_5(MatScalar *);
107: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_6(MatScalar *);
108: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_7(MatScalar *);
109: EXTERN PetscErrorCode Kernel_A_gets_inverse_A_9(MatScalar *);
111: /*
112: A = inv(A) A_gets_inverse_A
114: A - square bs by bs array stored in column major order
115: pivots - integer work array of length bs
116: W - bs by 1 work array
117: */
118: #define Kernel_A_gets_inverse_A(bs,A,pivots,W) (LINPACKdgefa((A),(bs),(pivots)) || LINPACKdgedi((A),(bs),(pivots),(W)))
120: /* -----------------------------------------------------------------------*/
122: #if !defined(PETSC_USE_MAT_SINGLE)
123: /*
124: Version that calls the BLAS directly
125: */
126: /*
127: A = A * B A_gets_A_times_B
129: A, B - square bs by bs arrays stored in column major order
130: W - square bs by bs work array
132: */
133: #define Kernel_A_gets_A_times_B(bs,A,B,W) \
134: { \
135: PetscBLASInt _bbs = (PetscBLASInt)bs;\
136: PetscScalar _one = 1.0,_zero = 0.0; \
137: PetscErrorCode _ierr; \
138: _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
139: BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_one,(W),&(_bbs),(B),&(_bbs),&_zero,(A),&(_bbs));\
140: }
142: /*
144: A = A - B * C A_gets_A_minus_B_times_C
146: A, B, C - square bs by bs arrays stored in column major order
147: */
148: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
149: { \
150: PetscBLASInt _bbs = (PetscBLASInt)bs;\
151: PetscScalar _mone = -1.0,_one = 1.0; \
152: BLASgemm_("N","N",&(_bbs),&(_bbs),&(_bbs),&_mone,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs));\
153: }
155: /*
157: A = A + B^T * C A_gets_A_plus_Btranspose_times_C
159: A, B, C - square bs by bs arrays stored in column major order
160: */
161: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C) \
162: { \
163: PetscBLASInt _bbs = (PetscBLASInt)bs;\
164: PetscScalar _one = 1.0; \
165: BLASgemm_("T","N",&(_bbs),&(_bbs),&(_bbs),&_one,(B),&(_bbs),(C),&(_bbs),&_one,(A),&(_bbs));\
166: }
168: /*
169: v = v + A^T w v_gets_v_plus_Atranspose_times_w
171: v - array of length bs
172: A - square bs by bs array
173: w - array of length bs
174: */
175: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w) \
176: { \
177: PetscScalar _one = 1.0; \
178: PetscBLASInt _bbs = (PetscBLASInt)bs, _ione = 1; \
179: BLASgemv_("T",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione); \
180: }
182: /*
183: v = v - A w v_gets_v_minus_A_times_w
185: v - array of length bs
186: A - square bs by bs array
187: w - array of length bs
188: */
189: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
190: { \
191: PetscScalar _mone = -1.0,_one = 1.0; \
192: PetscBLASInt _bbs = (PetscBLASInt)bs, _ione = 1; \
193: BLASgemv_("N",&(_bbs),&(_bbs),&_mone,A,&(_bbs),w,&_ione,&_one,v,&_ione); \
194: }
196: /*
197: v = v + A w v_gets_v_plus_A_times_w
199: v - array of length bs
200: A - square bs by bs array
201: w - array of length bs
202: */
203: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
204: { \
205: PetscScalar _one = 1.0; \
206: PetscBLASInt _bbs = (PetscBLASInt)bs,_ione = 1; \
207: BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),w,&_ione,&_one,v,&_ione); \
208: }
210: /*
211: v = v + A w v_gets_v_plus_Ar_times_w
213: v - array of length bs
214: A - square bs by bs array
215: w - array of length bs
216: */
217: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncols,v,A,w) \
218: { \
219: PetscScalar _one = 1.0; \
220: PetscBLASInt _bbs = (PetscBLASInt)bs,_bncols = (PetscBLASInt)ncols,_ione = 1; \
221: BLASgemv_("N",&(_bbs),&(_bncols),&_one,A,&(_bbs),v,&_ione,&_one,w,&_ione); \
222: }
224: /*
225: w = A v w_gets_A_times_v
227: v - array of length bs
228: A - square bs by bs array
229: w - array of length bs
230: */
231: #define Kernel_w_gets_A_times_v(bs,v,A,w) \
232: { \
233: PetscScalar _zero = 0.0,_one = 1.0; \
234: PetscBLASInt _bbs = (PetscBLASInt)bs,_ione = 1; \
235: BLASgemv_("N",&(_bbs),&(_bbs),&_one,A,&(_bbs),v,&_ione,&_zero,w,&_ione); \
236: }
238: /*
239: z = A*x
240: */
241: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
242: { \
243: PetscScalar _one = 1.0,_zero = 0.0; \
244: PetscBLASInt _bbs = (PetscBLASInt)bs,_bncols = (PetscBLASInt)ncols,_ione = 1; \
245: BLASgemv_("N",&(_bbs),&_bncols,&_one,A,&(_bbs),x,&_ione,&_zero,z,&_ione); \
246: }
248: /*
249: z = trans(A)*x
250: */
251: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
252: { \
253: PetscScalar _one = 1.0; \
254: PetscBLASInt _bbs = (PetscBLASInt)bs,_bncols = (PetscBLASInt)ncols,_ione = 1; \
255: BLASgemv_("T",&_bbs,&_bncols,&_one,A,&_bbs,x,&_ione,&_one,z,&_ione); \
256: }
258: #else
259: /*
260: Version that calls Fortran routines; can handle different precision
261: of matrix (array) and vectors
262: */
263: /*
264: These are Fortran kernels: They replace certain BLAS routines but
265: have some arguments that may be single precision,rather than double
266: These routines are provided in src/fortran/kernels/sgemv.F
267: They are pretty pitiful but get the job done. The intention is
268: that for important block sizes (currently 1,2,3,4,5,6,7) custom inlined
269: code is used.
270: */
272: /* BGL kernels */
273: #if defined(PETSC_USE_FORTRAN_KERNELS_BGL)
274: #define msgemv msgemv_bgl
275: #define msgemvp msgemvp_bgl
276: #define msgemvm msgemvm_bgl
277: #define msgemvt msgemvt_bgl
278: #define msgemmi msgemmi_bgl
279: #define msgemm msgemm_bgl
280: #endif
282: #ifdef PETSC_HAVE_FORTRAN_CAPS
283: #define msgemv_ MSGEMV
284: #define msgemvp_ MSGEMVP
285: #define msgemvm_ MSGEMVM
286: #define msgemvt_ MSGEMVT
287: #define msgemmi_ MSGEMMI
288: #define msgemm_ MSGEMM
289: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
290: #define msgemv_ msgemv
291: #define msgemvp_ msgemvp
292: #define msgemvm_ msgemvm
293: #define msgemvt_ msgemvt
294: #define msgemmi_ msgemmi
295: #define msgemm_ msgemm
296: #endif
298: EXTERN void msgemv_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*);
299: EXTERN void msgemvp_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*);
300: EXTERN void msgemvm_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*);
301: EXTERN void msgemvt_(PetscInt*,PetscInt *,MatScalar*,PetscScalar*,PetscScalar*);
302: EXTERN void msgemmi_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
303: EXTERN void msgemm_(PetscInt*,MatScalar*,MatScalar*,MatScalar*);
306: /*
307: A = A * B A_gets_A_times_B
309: A, B - square bs by bs arrays stored in column major order
310: W - square bs by bs work array
312: */
313: #define Kernel_A_gets_A_times_B(bs,A,B,W) \
314: { \
315: PetscErrorCode _PetscMemcpy((W),(A),(bs)*(bs)*sizeof(MatScalar));CHKERRQ(_ierr); \
316: msgemmi_(&bs,A,B,W); \
317: }
319: /*
321: A = A - B * C A_gets_A_minus_B_times_C
323: A, B, C - square bs by bs arrays stored in column major order
324: */
325: #define Kernel_A_gets_A_minus_B_times_C(bs,A,B,C) \
326: { \
327: msgemm_(&bs,A,B,C); \
328: }
330: /*
331: v = v - A w v_gets_v_minus_A_times_w
333: v - array of length bs
334: A - square bs by bs array
335: w - array of length bs
336: */
337: #define Kernel_v_gets_v_minus_A_times_w(bs,v,A,w) \
338: { \
339: msgemvm_(&bs,&bs,A,w,v); \
340: }
342: /*
343: v = v + A w v_gets_v_plus_A_times_w
345: v - array of length bs
346: A - square bs by bs array
347: w - array of length bs
348: */
349: #define Kernel_v_gets_v_plus_A_times_w(bs,v,A,w) \
350: { \
351: msgemvp_(&bs,&bs,A,w,v);\
352: }
354: /*
355: v = v + A w v_gets_v_plus_Ar_times_w
357: v - array of length bs
358: A - square bs by bs array
359: w - array of length bs
360: */
361: #define Kernel_w_gets_w_plus_Ar_times_v(bs,ncol,v,A,w) \
362: { \
363: msgemvp_(&bs,&ncol,A,v,w);\
364: }
366: /*
367: w = A v w_gets_A_times_v
369: v - array of length bs
370: A - square bs by bs array
371: w - array of length bs
372: */
373: #define Kernel_w_gets_A_times_v(bs,v,A,w) \
374: { \
375: msgemv_(&bs,&bs,A,v,w);\
376: }
377:
378: /*
379: z = A*x
380: */
381: #define Kernel_w_gets_Ar_times_v(bs,ncols,x,A,z) \
382: { \
383: msgemv_(&bs,&ncols,A,x,z);\
384: }
386: /*
387: z = trans(A)*x
388: */
389: #define Kernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,x,A,z) \
390: { \
391: msgemvt_(&bs,&ncols,A,x,z);\
392: }
394: /* These do not work yet */
395: #define Kernel_A_gets_A_plus_Btranspose_times_C(bs,A,B,C)
396: #define Kernel_v_gets_v_plus_Atranspose_times_w(bs,v,A,w)
399: #endif
401: #endif