Actual source code: axpy.h
2: /*
3: These are macros for daxpy like operations. The format is
4: APXY(U,Alpha,P,n)
5: for
6: U += Alpha * P
8: In addition,versions that process 2 and 4 vectors are provided;
9: these can give significantly better use of memory resources than
10: successive calls to the regular daxpy.
11: */
13: #ifndef APXY
15: #include petscblaslapack.h
17: /* BGL kernels */
18: #if defined(PETSC_USE_FORTRAN_KERNELS_BGL)
19: #define fortrancopy fortrancopy_bgl
20: #define fortranzero fortranzero_bgl
21: #define fortranmaxpy4 fortranmaxpy4_bgl
22: #define fortranmaxpy3 fortranmaxpy3_bgl
23: #define fortranmaxpy2 fortranmaxpy2_bgl
24: #define fortranaypx fortranaypx_bgl
25: #define fortranwaxpy fortranwaxpy_bgl
27: #endif
29: #if defined(PETSC_HAVE_FORTRAN_CAPS)
30: #define fortrancopy_ FORTRANCOPY
31: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
32: #define fortrancopy_ fortrancopy
33: #endif
38: #if defined(PETSC_HAVE_FORTRAN_CAPS)
39: #define fortranzero_ FORTRANZERO
40: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
41: #define fortranzero_ fortranzero
42: #endif
48: #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
49: #if defined(PETSC_HAVE_FORTRAN_CAPS)
50: #define fortranaypx_ FORTRANAYPX
51: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
52: #define fortranaypx_ fortranaypx
53: #endif
57: #endif
59: #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
60: #if defined(PETSC_HAVE_FORTRAN_CAPS)
61: #define fortranwaxpy_ FORTRANWAXPY
62: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
63: #define fortranwaxpy_ fortranwaxpy
64: #endif
68: #endif
70: #if defined(PETSC_USE_FORTRAN_KERNEL_MAXPY)
72: #if defined(PETSC_HAVE_FORTRAN_CAPS)
73: #define fortranmaxpy4_ FORTRANMAXPY4
74: #define fortranmaxpy3_ FORTRANMAXPY3
75: #define fortranmaxpy2_ FORTRANMAXPY2
76: #elif !defined(PETSC_HAVE_FORTRAN_UNDERSCORE)
77: #define fortranmaxpy4_ fortranmaxpy4
78: #define fortranmaxpy3_ fortranmaxpy3
79: #define fortranmaxpy2_ fortranmaxpy2
80: #endif
83: EXTERN void fortranmaxpy4_(void*,void*,void*,void*,void*,void*,void*,void*,void*,PetscInt*);
84: EXTERN void fortranmaxpy3_(void*,void*,void*,void*,void*,void*,void*,PetscInt*);
85: EXTERN void fortranmaxpy2_(void*,void*,void*,void*,void*,PetscInt*);
88: #define APXY(U,a1,p1,n) {PetscBLASInt one=1;\
89: BLASaxpy_(&n,&a1,p1,&one,U,&one);}
90: #define APXY2(U,a1,a2,p1,p2,n) { \
91: fortranmaxpy2_(U,&a1,&a2,p1,p2,&n);}
92: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) { \
93: fortranmaxpy3_(U,&a1,&a2,&a3,p1,p2,p3,&n);}
94: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){ \
95: fortranmaxpy4_(U,&a1,&a2,&a3,&a4,p1,p2,p3,p4,&n);}
97: #elif defined(PETSC_USE_UNROLL_KERNELS)
99: #define APXY(U,Alpha,P,n) {\
100: switch (n & 0x3) {\
101: case 3: *U++ += Alpha * *P++;\
102: case 2: *U++ += Alpha * *P++;\
103: case 1: *U++ += Alpha * *P++;\
104: n -= 4;case 0: break;}while (n>0) {U[0] += Alpha * P[0];U[1] += Alpha * P[1];\
105: U[2] += Alpha * P[2]; U[3] += Alpha * P[3]; \
106: U += 4; P += 4; n -= 4;}}
107: #define APXY2(U,a1,a2,p1,p2,n) {\
108: switch (n & 0x3) {\
109: case 3: *U++ += a1 * *p1++ + a2 * *p2++;\
110: case 2: *U++ += a1 * *p1++ + a2 * *p2++;\
111: case 1: *U++ += a1 * *p1++ + a2 * *p2++;\
112: n -= 4;case 0: break;}\
113: while (n>0) {U[0]+=a1*p1[0]+a2*p2[0];U[1]+=a1*p1[1]+a2*p2[1];\
114: U[2]+=a1*p1[2]+a2*p2[2];U[3]+=a1*p1[3]+a2*p2[3];U+=4;p1+=4;p2+=4;n -= 4;}}
115: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) {\
116: switch (n & 0x3) {\
117: case 3: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
118: case 2: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
119: case 1: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;\
120: n -= 4;case 0:break;}while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0];\
121: U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1];\
122: U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2];\
123: U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3];U+=4;p1+=4;p2+=4;p3+=4;n-=4;}}
124: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\
125: switch (n & 0x3) {\
126: case 3: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
127: case 2: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
128: case 1: *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;\
129: n -= 4;case 0:break;}while (n>0) {U[0]+=a1*p1[0]+a2*p2[0]+a3*p3[0]+a4*p4[0];\
130: U[1]+=a1*p1[1]+a2*p2[1]+a3*p3[1]+a4*p4[1];\
131: U[2]+=a1*p1[2]+a2*p2[2]+a3*p3[2]+a4*p4[2];\
132: U[3]+=a1*p1[3]+a2*p2[3]+a3*p3[3]+a4*p4[3];U+=4;p1+=4;p2+=4;p3+=4;p4+=4;n-=4;}}
134: #elif defined(PETSC_USE_WHILE_KERNELS)
136: #define APXY(U,a1,p1,n) {\
137: while (n--) *U++ += a1 * *p1++;}
138: #define APXY2(U,a1,a2,p1,p2,n) {\
139: while (n--) *U++ += a1 * *p1++ + a2 * *p2++;}
140: #define APXY3(U,a1,a2,a3,p1,p2,p3,n) {\
141: while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++;}
142: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n) {\
143: while (n--) *U++ += a1 * *p1++ + a2 * *p2++ + a3 * *p3++ + a4 * *p4++;}
145: #elif defined(PETSC_USE_BLAS_KERNELS)
147: #define APXY(U,a1,p1,n) {PetscBLASInt one=1;\
148: BLASaxpy_(&n,&a1,p1,&one,U,&one);}
149: #define APXY2(U,a1,a2,p1,p2,n){APXY(U,a1,p1,n);\
150: APXY(U,a2,p2,n);}
151: #define APXY3(U,a1,a2,a3,p1,p2,p3,n){APXY2(U,a1,a2,p1,p2,n);\
152: APXY(U,a3,p3,n);}
153: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){APXY2(U,a1,a2,p1,p2,n);\
154: APXY2(U,a3,a4,p3,p4,n);}
156: #elif defined(PETSC_USE_FOR_KERNELS)
158: #define APXY(U,a1,p1,n) {PetscInt __i;PetscScalar __s1,__s2; \
159: for(__i=0;__i<n-1;__i+=2){__s1=a1*p1[__i];__s2=a1*p1[__i+1];\
160: __s1+=U[__i];__s2+=U[__i+1];U[__i]=__s1;U[__i+1]=__s2;}\
161: if (n & 0x1) U[__i] += a1 * p1[__i];}
162: #define APXY2(U,a1,a2,p1,p2,n) {PetscInt __i;\
163: for(__i=0;__i<n;__i++)U[__i] += a1 * p1[__i] + a2 * p2[__i];}
164: #define APXY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i;\
165: for(__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i];}
166: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;\
167: for(__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i]+a4*p4[__i];}
169: #else
171: #define APXY(U,a1,p1,n) {PetscInt __i;PetscScalar _a1=a1;\
172: for(__i=0;__i<n;__i++)U[__i]+=_a1 * p1[__i];}
173: #define APXY2(U,a1,a2,p1,p2,n) {PetscInt __i;\
174: for(__i=0;__i<n;__i++)U[__i] += a1 * p1[__i] + a2 * p2[__i];}
175: #define APXY3(U,a1,a2,a3,p1,p2,p3,n){PetscInt __i;\
176: for(__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i];}
177: #define APXY4(U,a1,a2,a3,a4,p1,p2,p3,p4,n){PetscInt __i;\
178: for(__i=0;__i<n;__i++)U[__i]+=a1*p1[__i]+a2*p2[__i]+a3*p3[__i]+a4*p4[__i];}
180: #endif
183: /* ----------------------------------------------------------------------------
184: axpy() but for increments of inc in both U and P
185: ---------------------------------------------------------------------------*/
186: #ifdef PETSC_USE_UNROLL_KERNELS
187: #define APXYINC(U,Alpha,P,n,inc) {\
188: if (n & 0x1) {\
189: *U += Alpha * *P; U += inc; P += inc; n--;}\
190: while (n>0) {U[0] += Alpha * P[0];U[inc] += Alpha * P[inc];\
191: U += 2*inc; P += 2*inc; n -= 2;}}
192: #define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
193: if (n & 0x1) {\
194: *U += a1 * *p1 + a2 * *p2; U += inc; p1 += inc; p2 += inc;n--;}\
195: while (n>0) {U[0] += a1*p1[0]+a2*p2[0];U[inc]+=a1*p1[inc]+a2*p2[inc];\
196: U += 2*inc;p1 += 2*inc;p2+=2*inc; n -= 2;}}
197: #define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) {\
198: if (n & 0x1) {\
199: *U += a1 * *p1 + a2 * *p2 + a3 * *p3; \
200: U += inc; p1 += inc; p2 += inc; p3 += inc;n--;}\
201: while (n>0) {U[0] += a1*p1[0]+a2*p2[0]+a3*p3[0];\
202: U[inc]+=a1*p1[inc]+a2*p2[inc]+a3*p3[inc];\
203: U += 2*inc;p1 += 2*inc;p2+=2*inc;p3+=2*inc;n -= 2;}}
204: #define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) {\
205: if (n & 0x1) {\
206: *U += a1 * *p1 + a2 * *p2 + a3 * *p3 + a4 * *p4; \
207: U += inc; p1 += inc; p2 += inc; p3 += inc; p4 += inc;n--;}\
208: while (n>0) {U[0] += a1*p1[0]+a2*p2[0]+a3*p3[0]+a4*p4[0];\
209: U[inc]+=a1*p1[inc]+a2*p2[inc]+a3*p3[inc]+a4*p4[inc];\
210: U += 2*inc;p1 += 2*inc;p2+=2*inc;p3+=2*inc;p4+=2*inc; n -= 2;}}
212: #elif defined(PETSC_USE_WHILE_KERNELS)
213: #define APXYINC(U,a1,p1,n,inc) {\
214: while (n--){*U += a1 * *p1; U += inc; p1 += inc;}}
215: #define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
216: while (n--) {*U += a1 * *p1 + a2 * *p2;\
217: U+=inc;p1+=inc;p2+=inc;}}
218: #define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc){\
219: while (n--) {*U+=a1**p1+a2**p2+a3 * *p3;U+=inc;p1+=inc;p2+=inc;p3+=inc;}}
220: #define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc) {\
221: while (n--) {*U += a1 * *p1 + a2 * *p2 + a3 * *p3 + a4 * *p4;U+=inc;p1+=inc;\
222: p2+=inc;p3+=inc;p4+=inc;}}
224: #else
225: /* These need to be converted to for loops */
226: #define APXYINC(U,a1,p1,n,inc) {\
227: while (n--){*U += a1 * *p1; U += inc; p1 += inc;}}
228: #define APXY2INC(U,a1,a2,p1,p2,n,inc) {\
229: while (n--) {*U += a1 * *p1 + a2 * *p2;\
230: U+=inc;p1+=inc;p2+=inc;}}
231: #define APXY3INC(U,a1,a2,a3,p1,p2,p3,n,inc) {\
232: while (n--) {*U+=a1**p1+a2**p2+a3 * *p3;U+=inc;p1+=inc;p2+=inc;p3+=inc;}}
233: #define APXY4INC(U,a1,a2,a3,a4,p1,p2,p3,p4,n,inc){\
234: while (n--) {*U += a1 * *p1 + a2 * *p2 + a3 * *p3 + a4 * *p4;U+=inc;p1+=inc;\
235: p2+=inc;p3+=inc;p4+=inc;}}
236: #endif
238: /* --------------------------------------------------------------------
239: This is aypx:
240: for (i=0; i<n; i++)
241: y[i] = x[i] + alpha * y[i];
242: ---------------------------------------------------------------------*/
243: #if defined(PETSC_USE_UNROLL_KERNELS)
244: #define AYPX(U,Alpha,P,n) {\
245: switch (n & 0x3) {\
246: case 3: *U = *P++ + Alpha * *U;U++;\
247: case 2: *U = *P++ + Alpha * *U;U++;\
248: case 1: *U = *P++ + Alpha * *U;U++;\
249: n -= 4;case 0: break;}while (n>0) {U[0] = P[0]+Alpha * U[0];\
250: U[1] = P[1] + Alpha * U[1];\
251: U[2] = P[2] + Alpha * U[2]; U[3] = P[3] + Alpha * U[3]; \
252: U += 4; P += 4; n -= 4;}}
254: #elif defined(PETSC_USE_WHILE_KERNELS)
255: #define AYPX(U,a1,p1,n) {\
256: while (n--) {*U = *p1++ + a1 * *U;U++;}
258: #elif defined(PETSC_USE_FOR_KERNELS)
259: #define AYPX(U,a1,p1,n) {PetscInt __i;PetscScalar __s1,__s2; \
260: for(__i=0;__i<n-1;__i+=2){__s1=p1[__i];__s2=p1[__i+1];\
261: __s1+=a1*U[__i];__s2+=a1*U[__i+1];\
262: U[__i]=__s1;U[__i+1]=__s2;}\
263: if (n & 0x1) U[__i] = p1[__i] + a1 * U[__i];}
265: #else
266: #define AYPX(U,a1,p1,n) {PetscInt __i;\
267: for(__i=0;__i<n;__i++)U[__i]=p1[__i]+a1 * U[__i];}
268: #endif
270: /* ----------------------------------------------------------------------------------
271: Useful for APXY where alpha == -1
272: ----------------------------------------------------------------------------------
273: */
274: #define YMX(U,p1,n) {PetscInt __i;\
275: for(__i=0;__i<n;__i++)U[__i]-=p1[__i];}
276: /* Useful for APXY where alpha == 1 */
277: #define YPX(U,p1,n) {PetscInt __i;\
278: for(__i=0;__i<n;__i++)U[__i]+=p1[__i];}
280: #endif