Actual source code: inode.c
1: #define PETSCMAT_DLL
3: /*
4: This file provides high performance routines for the Inode format (compressed sparse row)
5: by taking advantage of rows with identical nonzero structure (I-nodes).
6: */
7: #include src/mat/impls/aij/seq/aij.h
11: static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns)
12: {
13: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
15: PetscInt i,count,m,n,min_mn,*ns_row,*ns_col;
18: n = A->cmap.n;
19: m = A->rmap.n;
20: ns_row = a->inode.size;
21:
22: min_mn = (m < n) ? m : n;
23: if (!ns) {
24: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++);
25: for(; count+1 < n; count++,i++);
26: if (count < n) {
27: i++;
28: }
29: *size = i;
30: return(0);
31: }
32: PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);
33:
34: /* Use the same row structure wherever feasible. */
35: for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) {
36: ns_col[i] = ns_row[i];
37: }
39: /* if m < n; pad up the remainder with inode_limit */
40: for(; count+1 < n; count++,i++) {
41: ns_col[i] = 1;
42: }
43: /* The last node is the odd ball. padd it up with the remaining rows; */
44: if (count < n) {
45: ns_col[i] = n - count;
46: i++;
47: } else if (count > n) {
48: /* Adjust for the over estimation */
49: ns_col[i-1] += n - count;
50: }
51: *size = i;
52: *ns = ns_col;
53: return(0);
54: }
57: /*
58: This builds symmetric version of nonzero structure,
59: */
62: static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
63: {
64: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
66: PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n;
67: PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j;
70: nslim_row = a->inode.node_count;
71: m = A->rmap.n;
72: n = A->cmap.n;
73: if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square");
74:
75: /* Use the row_inode as column_inode */
76: nslim_col = nslim_row;
77: ns_col = ns_row;
79: /* allocate space for reformated inode structure */
80: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);
81: PetscMalloc((n+1)*sizeof(PetscInt),&tvc);
82: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1];
84: for (i1=0,col=0; i1<nslim_col; ++i1){
85: nsz = ns_col[i1];
86: for (i2=0; i2<nsz; ++i2,++col)
87: tvc[col] = i1;
88: }
89: /* allocate space for row pointers */
90: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
91: *iia = ia;
92: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
93: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
95: /* determine the number of columns in each row */
96: ia[0] = oshift;
97: for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) {
99: j = aj + ai[row] + ishift;
100: jmax = aj + ai[row+1] + ishift;
101: i2 = 0;
102: col = *j++ + ishift;
103: i2 = tvc[col];
104: while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */
105: ia[i1+1]++;
106: ia[i2+1]++;
107: i2++; /* Start col of next node */
108: while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j;
109: i2 = tvc[col];
110: }
111: if(i2 == i1) ia[i2+1]++; /* now the diagonal element */
112: }
114: /* shift ia[i] to point to next row */
115: for (i1=1; i1<nslim_row+1; i1++) {
116: row = ia[i1-1];
117: ia[i1] += row;
118: work[i1-1] = row - oshift;
119: }
121: /* allocate space for column pointers */
122: nz = ia[nslim_row] + (!ishift);
123: PetscMalloc(nz*sizeof(PetscInt),&ja);
124: *jja = ja;
126: /* loop over lower triangular part putting into ja */
127: for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) {
128: j = aj + ai[row] + ishift;
129: jmax = aj + ai[row+1] + ishift;
130: i2 = 0; /* Col inode index */
131: col = *j++ + ishift;
132: i2 = tvc[col];
133: while (i2<i1 && j<jmax) {
134: ja[work[i2]++] = i1 + oshift;
135: ja[work[i1]++] = i2 + oshift;
136: ++i2;
137: while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */
138: i2 = tvc[col];
139: }
140: if (i2 == i1) ja[work[i1]++] = i2 + oshift;
142: }
143: PetscFree(work);
144: PetscFree(tns);
145: PetscFree(tvc);
146: return(0);
147: }
149: /*
150: This builds nonsymmetric version of nonzero structure,
151: */
154: static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
155: {
156: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
158: PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col;
159: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
162: nslim_row = a->inode.node_count;
163: n = A->cmap.n;
165: /* Create The column_inode for this matrix */
166: Mat_CreateColInode(A,&nslim_col,&ns_col);
167:
168: /* allocate space for reformated column_inode structure */
169: PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);
170: PetscMalloc((n +1)*sizeof(PetscInt),&tvc);
171: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
173: for (i1=0,col=0; i1<nslim_col; ++i1){
174: nsz = ns_col[i1];
175: for (i2=0; i2<nsz; ++i2,++col)
176: tvc[col] = i1;
177: }
178: /* allocate space for row pointers */
179: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);
180: *iia = ia;
181: PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));
182: PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);
184: /* determine the number of columns in each row */
185: ia[0] = oshift;
186: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
187: j = aj + ai[row] + ishift;
188: col = *j++ + ishift;
189: i2 = tvc[col];
190: nz = ai[row+1] - ai[row];
191: while (nz-- > 0) { /* off-diagonal elemets */
192: ia[i1+1]++;
193: i2++; /* Start col of next node */
194: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
195: if (nz > 0) i2 = tvc[col];
196: }
197: }
199: /* shift ia[i] to point to next row */
200: for (i1=1; i1<nslim_row+1; i1++) {
201: row = ia[i1-1];
202: ia[i1] += row;
203: work[i1-1] = row - oshift;
204: }
206: /* allocate space for column pointers */
207: nz = ia[nslim_row] + (!ishift);
208: PetscMalloc(nz*sizeof(PetscInt),&ja);
209: *jja = ja;
211: /* loop over matrix putting into ja */
212: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
213: j = aj + ai[row] + ishift;
214: i2 = 0; /* Col inode index */
215: col = *j++ + ishift;
216: i2 = tvc[col];
217: nz = ai[row+1] - ai[row];
218: while (nz-- > 0) {
219: ja[work[i1]++] = i2 + oshift;
220: ++i2;
221: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
222: if (nz > 0) i2 = tvc[col];
223: }
224: }
225: PetscFree(ns_col);
226: PetscFree(work);
227: PetscFree(tns);
228: PetscFree(tvc);
229: return(0);
230: }
234: static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
235: {
236: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
240: *n = a->inode.node_count;
241: if (!ia) return(0);
243: if (symmetric) {
244: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
245: } else {
246: MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
247: }
248: return(0);
249: }
253: static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
254: {
258: if (!ia) return(0);
259: PetscFree(*ia);
260: PetscFree(*ja);
261: return(0);
262: }
264: /* ----------------------------------------------------------- */
268: static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift)
269: {
270: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
272: PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col;
273: PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j;
276: nslim_row = a->inode.node_count;
277: n = A->cmap.n;
279: /* Create The column_inode for this matrix */
280: Mat_CreateColInode(A,&nslim_col,&ns_col);
281:
282: /* allocate space for reformated column_inode structure */
283: PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);
284: PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);
285: for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1];
287: for (i1=0,col=0; i1<nslim_col; ++i1){
288: nsz = ns_col[i1];
289: for (i2=0; i2<nsz; ++i2,++col)
290: tvc[col] = i1;
291: }
292: /* allocate space for column pointers */
293: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);
294: *iia = ia;
295: PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));
296: PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);
298: /* determine the number of columns in each row */
299: ia[0] = oshift;
300: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
301: j = aj + ai[row] + ishift;
302: col = *j++ + ishift;
303: i2 = tvc[col];
304: nz = ai[row+1] - ai[row];
305: while (nz-- > 0) { /* off-diagonal elemets */
306: /* ia[i1+1]++; */
307: ia[i2+1]++;
308: i2++;
309: while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
310: if (nz > 0) i2 = tvc[col];
311: }
312: }
314: /* shift ia[i] to point to next col */
315: for (i1=1; i1<nslim_col+1; i1++) {
316: col = ia[i1-1];
317: ia[i1] += col;
318: work[i1-1] = col - oshift;
319: }
321: /* allocate space for column pointers */
322: nz = ia[nslim_col] + (!ishift);
323: PetscMalloc(nz*sizeof(PetscInt),&ja);
324: *jja = ja;
326: /* loop over matrix putting into ja */
327: for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) {
328: j = aj + ai[row] + ishift;
329: i2 = 0; /* Col inode index */
330: col = *j++ + ishift;
331: i2 = tvc[col];
332: nz = ai[row+1] - ai[row];
333: while (nz-- > 0) {
334: /* ja[work[i1]++] = i2 + oshift; */
335: ja[work[i2]++] = i1 + oshift;
336: i2++;
337: while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;}
338: if (nz > 0) i2 = tvc[col];
339: }
340: }
341: PetscFree(ns_col);
342: PetscFree(work);
343: PetscFree(tns);
344: PetscFree(tvc);
345: return(0);
346: }
350: static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
351: {
355: Mat_CreateColInode(A,n,PETSC_NULL);
356: if (!ia) return(0);
358: if (symmetric) {
359: /* Since the indices are symmetric it does'nt matter */
360: MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);
361: } else {
362: MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);
363: }
364: return(0);
365: }
369: static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done)
370: {
374: if (!ia) return(0);
375: PetscFree(*ia);
376: PetscFree(*ja);
377: return(0);
378: }
380: /* ----------------------------------------------------------- */
384: static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy)
385: {
386: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
387: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
388: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y;
390: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
391:
392: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
393: #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5)
394: #endif
397: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
398: node_max = a->inode.node_count;
399: ns = a->inode.size; /* Node Size array */
400: VecGetArray(xx,&x);
401: VecGetArray(yy,&y);
402: idx = a->j;
403: v1 = a->a;
404: ii = a->i;
406: for (i = 0,row = 0; i< node_max; ++i){
407: nsz = ns[i];
408: n = ii[1] - ii[0];
409: ii += nsz;
410: sz = n; /* No of non zeros in this row */
411: /* Switch on the size of Node */
412: switch (nsz){ /* Each loop in 'case' is unrolled */
413: case 1 :
414: sum1 = 0;
415:
416: for(n = 0; n< sz-1; n+=2) {
417: i1 = idx[0]; /* The instructions are ordered to */
418: i2 = idx[1]; /* make the compiler's job easy */
419: idx += 2;
420: tmp0 = x[i1];
421: tmp1 = x[i2];
422: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
423: }
424:
425: if (n == sz-1){ /* Take care of the last nonzero */
426: tmp0 = x[*idx++];
427: sum1 += *v1++ * tmp0;
428: }
429: y[row++]=sum1;
430: break;
431: case 2:
432: sum1 = 0;
433: sum2 = 0;
434: v2 = v1 + n;
435:
436: for (n = 0; n< sz-1; n+=2) {
437: i1 = idx[0];
438: i2 = idx[1];
439: idx += 2;
440: tmp0 = x[i1];
441: tmp1 = x[i2];
442: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
443: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
444: }
445: if (n == sz-1){
446: tmp0 = x[*idx++];
447: sum1 += *v1++ * tmp0;
448: sum2 += *v2++ * tmp0;
449: }
450: y[row++]=sum1;
451: y[row++]=sum2;
452: v1 =v2; /* Since the next block to be processed starts there*/
453: idx +=sz;
454: break;
455: case 3:
456: sum1 = 0;
457: sum2 = 0;
458: sum3 = 0;
459: v2 = v1 + n;
460: v3 = v2 + n;
461:
462: for (n = 0; n< sz-1; n+=2) {
463: i1 = idx[0];
464: i2 = idx[1];
465: idx += 2;
466: tmp0 = x[i1];
467: tmp1 = x[i2];
468: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
469: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
470: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
471: }
472: if (n == sz-1){
473: tmp0 = x[*idx++];
474: sum1 += *v1++ * tmp0;
475: sum2 += *v2++ * tmp0;
476: sum3 += *v3++ * tmp0;
477: }
478: y[row++]=sum1;
479: y[row++]=sum2;
480: y[row++]=sum3;
481: v1 =v3; /* Since the next block to be processed starts there*/
482: idx +=2*sz;
483: break;
484: case 4:
485: sum1 = 0;
486: sum2 = 0;
487: sum3 = 0;
488: sum4 = 0;
489: v2 = v1 + n;
490: v3 = v2 + n;
491: v4 = v3 + n;
492:
493: for (n = 0; n< sz-1; n+=2) {
494: i1 = idx[0];
495: i2 = idx[1];
496: idx += 2;
497: tmp0 = x[i1];
498: tmp1 = x[i2];
499: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
500: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
501: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
502: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
503: }
504: if (n == sz-1){
505: tmp0 = x[*idx++];
506: sum1 += *v1++ * tmp0;
507: sum2 += *v2++ * tmp0;
508: sum3 += *v3++ * tmp0;
509: sum4 += *v4++ * tmp0;
510: }
511: y[row++]=sum1;
512: y[row++]=sum2;
513: y[row++]=sum3;
514: y[row++]=sum4;
515: v1 =v4; /* Since the next block to be processed starts there*/
516: idx +=3*sz;
517: break;
518: case 5:
519: sum1 = 0;
520: sum2 = 0;
521: sum3 = 0;
522: sum4 = 0;
523: sum5 = 0;
524: v2 = v1 + n;
525: v3 = v2 + n;
526: v4 = v3 + n;
527: v5 = v4 + n;
528:
529: for (n = 0; n<sz-1; n+=2) {
530: i1 = idx[0];
531: i2 = idx[1];
532: idx += 2;
533: tmp0 = x[i1];
534: tmp1 = x[i2];
535: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
536: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
537: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
538: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
539: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
540: }
541: if (n == sz-1){
542: tmp0 = x[*idx++];
543: sum1 += *v1++ * tmp0;
544: sum2 += *v2++ * tmp0;
545: sum3 += *v3++ * tmp0;
546: sum4 += *v4++ * tmp0;
547: sum5 += *v5++ * tmp0;
548: }
549: y[row++]=sum1;
550: y[row++]=sum2;
551: y[row++]=sum3;
552: y[row++]=sum4;
553: y[row++]=sum5;
554: v1 =v5; /* Since the next block to be processed starts there */
555: idx +=4*sz;
556: break;
557: default :
558: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
559: }
560: }
561: VecRestoreArray(xx,&x);
562: VecRestoreArray(yy,&y);
563: PetscLogFlops(2*a->nz - A->rmap.n);
564: return(0);
565: }
566: /* ----------------------------------------------------------- */
567: /* Almost same code as the MatMult_Inode() */
570: static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy)
571: {
572: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
573: PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1;
574: PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt;
576: PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz;
577:
579: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
580: node_max = a->inode.node_count;
581: ns = a->inode.size; /* Node Size array */
582: VecGetArray(xx,&x);
583: VecGetArray(yy,&y);
584: if (zz != yy) {
585: VecGetArray(zz,&z);
586: } else {
587: z = y;
588: }
589: zt = z;
591: idx = a->j;
592: v1 = a->a;
593: ii = a->i;
595: for (i = 0,row = 0; i< node_max; ++i){
596: nsz = ns[i];
597: n = ii[1] - ii[0];
598: ii += nsz;
599: sz = n; /* No of non zeros in this row */
600: /* Switch on the size of Node */
601: switch (nsz){ /* Each loop in 'case' is unrolled */
602: case 1 :
603: sum1 = *zt++;
604:
605: for(n = 0; n< sz-1; n+=2) {
606: i1 = idx[0]; /* The instructions are ordered to */
607: i2 = idx[1]; /* make the compiler's job easy */
608: idx += 2;
609: tmp0 = x[i1];
610: tmp1 = x[i2];
611: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
612: }
613:
614: if(n == sz-1){ /* Take care of the last nonzero */
615: tmp0 = x[*idx++];
616: sum1 += *v1++ * tmp0;
617: }
618: y[row++]=sum1;
619: break;
620: case 2:
621: sum1 = *zt++;
622: sum2 = *zt++;
623: v2 = v1 + n;
624:
625: for(n = 0; n< sz-1; n+=2) {
626: i1 = idx[0];
627: i2 = idx[1];
628: idx += 2;
629: tmp0 = x[i1];
630: tmp1 = x[i2];
631: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
632: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
633: }
634: if(n == sz-1){
635: tmp0 = x[*idx++];
636: sum1 += *v1++ * tmp0;
637: sum2 += *v2++ * tmp0;
638: }
639: y[row++]=sum1;
640: y[row++]=sum2;
641: v1 =v2; /* Since the next block to be processed starts there*/
642: idx +=sz;
643: break;
644: case 3:
645: sum1 = *zt++;
646: sum2 = *zt++;
647: sum3 = *zt++;
648: v2 = v1 + n;
649: v3 = v2 + n;
650:
651: for (n = 0; n< sz-1; n+=2) {
652: i1 = idx[0];
653: i2 = idx[1];
654: idx += 2;
655: tmp0 = x[i1];
656: tmp1 = x[i2];
657: sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
658: sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
659: sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
660: }
661: if (n == sz-1){
662: tmp0 = x[*idx++];
663: sum1 += *v1++ * tmp0;
664: sum2 += *v2++ * tmp0;
665: sum3 += *v3++ * tmp0;
666: }
667: y[row++]=sum1;
668: y[row++]=sum2;
669: y[row++]=sum3;
670: v1 =v3; /* Since the next block to be processed starts there*/
671: idx +=2*sz;
672: break;
673: case 4:
674: sum1 = *zt++;
675: sum2 = *zt++;
676: sum3 = *zt++;
677: sum4 = *zt++;
678: v2 = v1 + n;
679: v3 = v2 + n;
680: v4 = v3 + n;
681:
682: for (n = 0; n< sz-1; n+=2) {
683: i1 = idx[0];
684: i2 = idx[1];
685: idx += 2;
686: tmp0 = x[i1];
687: tmp1 = x[i2];
688: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
689: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
690: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
691: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
692: }
693: if (n == sz-1){
694: tmp0 = x[*idx++];
695: sum1 += *v1++ * tmp0;
696: sum2 += *v2++ * tmp0;
697: sum3 += *v3++ * tmp0;
698: sum4 += *v4++ * tmp0;
699: }
700: y[row++]=sum1;
701: y[row++]=sum2;
702: y[row++]=sum3;
703: y[row++]=sum4;
704: v1 =v4; /* Since the next block to be processed starts there*/
705: idx +=3*sz;
706: break;
707: case 5:
708: sum1 = *zt++;
709: sum2 = *zt++;
710: sum3 = *zt++;
711: sum4 = *zt++;
712: sum5 = *zt++;
713: v2 = v1 + n;
714: v3 = v2 + n;
715: v4 = v3 + n;
716: v5 = v4 + n;
717:
718: for (n = 0; n<sz-1; n+=2) {
719: i1 = idx[0];
720: i2 = idx[1];
721: idx += 2;
722: tmp0 = x[i1];
723: tmp1 = x[i2];
724: sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2;
725: sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2;
726: sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2;
727: sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2;
728: sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2;
729: }
730: if(n == sz-1){
731: tmp0 = x[*idx++];
732: sum1 += *v1++ * tmp0;
733: sum2 += *v2++ * tmp0;
734: sum3 += *v3++ * tmp0;
735: sum4 += *v4++ * tmp0;
736: sum5 += *v5++ * tmp0;
737: }
738: y[row++]=sum1;
739: y[row++]=sum2;
740: y[row++]=sum3;
741: y[row++]=sum4;
742: y[row++]=sum5;
743: v1 =v5; /* Since the next block to be processed starts there */
744: idx +=4*sz;
745: break;
746: default :
747: SETERRQ(PETSC_ERR_COR,"Node size not yet supported");
748: }
749: }
750: VecRestoreArray(xx,&x);
751: VecRestoreArray(yy,&y);
752: if (zz != yy) {
753: VecRestoreArray(zz,&z);
754: }
755: PetscLogFlops(2*a->nz);
756: return(0);
757: }
759: /* ----------------------------------------------------------- */
762: PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx)
763: {
764: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
765: IS iscol = a->col,isrow = a->row;
767: PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j;
768: PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout;
769: PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1;
770: PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5;
773: if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix");
774: if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure");
775: node_max = a->inode.node_count;
776: ns = a->inode.size; /* Node Size array */
778: VecGetArray(bb,&b);
779: VecGetArray(xx,&x);
780: tmp = a->solve_work;
781:
782: ISGetIndices(isrow,&rout); r = rout;
783: ISGetIndices(iscol,&cout); c = cout + (n-1);
784:
785: /* forward solve the lower triangular */
786: tmps = tmp ;
787: aa = a_a ;
788: aj = a_j ;
789: ad = a->diag;
791: for (i = 0,row = 0; i< node_max; ++i){
792: nsz = ns[i];
793: aii = ai[row];
794: v1 = aa + aii;
795: vi = aj + aii;
796: nz = ad[row]- aii;
797:
798: switch (nsz){ /* Each loop in 'case' is unrolled */
799: case 1 :
800: sum1 = b[*r++];
801: /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/
802: for(j=0; j<nz-1; j+=2){
803: i0 = vi[0];
804: i1 = vi[1];
805: vi +=2;
806: tmp0 = tmps[i0];
807: tmp1 = tmps[i1];
808: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
809: }
810: if(j == nz-1){
811: tmp0 = tmps[*vi++];
812: sum1 -= *v1++ *tmp0;
813: }
814: tmp[row ++]=sum1;
815: break;
816: case 2:
817: sum1 = b[*r++];
818: sum2 = b[*r++];
819: v2 = aa + ai[row+1];
821: for(j=0; j<nz-1; j+=2){
822: i0 = vi[0];
823: i1 = vi[1];
824: vi +=2;
825: tmp0 = tmps[i0];
826: tmp1 = tmps[i1];
827: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
828: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
829: }
830: if(j == nz-1){
831: tmp0 = tmps[*vi++];
832: sum1 -= *v1++ *tmp0;
833: sum2 -= *v2++ *tmp0;
834: }
835: sum2 -= *v2++ * sum1;
836: tmp[row ++]=sum1;
837: tmp[row ++]=sum2;
838: break;
839: case 3:
840: sum1 = b[*r++];
841: sum2 = b[*r++];
842: sum3 = b[*r++];
843: v2 = aa + ai[row+1];
844: v3 = aa + ai[row+2];
845:
846: for (j=0; j<nz-1; j+=2){
847: i0 = vi[0];
848: i1 = vi[1];
849: vi +=2;
850: tmp0 = tmps[i0];
851: tmp1 = tmps[i1];
852: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
853: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
854: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
855: }
856: if (j == nz-1){
857: tmp0 = tmps[*vi++];
858: sum1 -= *v1++ *tmp0;
859: sum2 -= *v2++ *tmp0;
860: sum3 -= *v3++ *tmp0;
861: }
862: sum2 -= *v2++ * sum1;
863: sum3 -= *v3++ * sum1;
864: sum3 -= *v3++ * sum2;
865: tmp[row ++]=sum1;
866: tmp[row ++]=sum2;
867: tmp[row ++]=sum3;
868: break;
869:
870: case 4:
871: sum1 = b[*r++];
872: sum2 = b[*r++];
873: sum3 = b[*r++];
874: sum4 = b[*r++];
875: v2 = aa + ai[row+1];
876: v3 = aa + ai[row+2];
877: v4 = aa + ai[row+3];
878:
879: for (j=0; j<nz-1; j+=2){
880: i0 = vi[0];
881: i1 = vi[1];
882: vi +=2;
883: tmp0 = tmps[i0];
884: tmp1 = tmps[i1];
885: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
886: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
887: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
888: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
889: }
890: if (j == nz-1){
891: tmp0 = tmps[*vi++];
892: sum1 -= *v1++ *tmp0;
893: sum2 -= *v2++ *tmp0;
894: sum3 -= *v3++ *tmp0;
895: sum4 -= *v4++ *tmp0;
896: }
897: sum2 -= *v2++ * sum1;
898: sum3 -= *v3++ * sum1;
899: sum4 -= *v4++ * sum1;
900: sum3 -= *v3++ * sum2;
901: sum4 -= *v4++ * sum2;
902: sum4 -= *v4++ * sum3;
903:
904: tmp[row ++]=sum1;
905: tmp[row ++]=sum2;
906: tmp[row ++]=sum3;
907: tmp[row ++]=sum4;
908: break;
909: case 5:
910: sum1 = b[*r++];
911: sum2 = b[*r++];
912: sum3 = b[*r++];
913: sum4 = b[*r++];
914: sum5 = b[*r++];
915: v2 = aa + ai[row+1];
916: v3 = aa + ai[row+2];
917: v4 = aa + ai[row+3];
918: v5 = aa + ai[row+4];
919:
920: for (j=0; j<nz-1; j+=2){
921: i0 = vi[0];
922: i1 = vi[1];
923: vi +=2;
924: tmp0 = tmps[i0];
925: tmp1 = tmps[i1];
926: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2;
927: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2;
928: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2;
929: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2;
930: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2;
931: }
932: if (j == nz-1){
933: tmp0 = tmps[*vi++];
934: sum1 -= *v1++ *tmp0;
935: sum2 -= *v2++ *tmp0;
936: sum3 -= *v3++ *tmp0;
937: sum4 -= *v4++ *tmp0;
938: sum5 -= *v5++ *tmp0;
939: }
941: sum2 -= *v2++ * sum1;
942: sum3 -= *v3++ * sum1;
943: sum4 -= *v4++ * sum1;
944: sum5 -= *v5++ * sum1;
945: sum3 -= *v3++ * sum2;
946: sum4 -= *v4++ * sum2;
947: sum5 -= *v5++ * sum2;
948: sum4 -= *v4++ * sum3;
949: sum5 -= *v5++ * sum3;
950: sum5 -= *v5++ * sum4;
951:
952: tmp[row ++]=sum1;
953: tmp[row ++]=sum2;
954: tmp[row ++]=sum3;
955: tmp[row ++]=sum4;
956: tmp[row ++]=sum5;
957: break;
958: default:
959: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
960: }
961: }
962: /* backward solve the upper triangular */
963: for (i=node_max -1 ,row = n-1 ; i>=0; i--){
964: nsz = ns[i];
965: aii = ai[row+1] -1;
966: v1 = aa + aii;
967: vi = aj + aii;
968: nz = aii- ad[row];
969: switch (nsz){ /* Each loop in 'case' is unrolled */
970: case 1 :
971: sum1 = tmp[row];
973: for(j=nz ; j>1; j-=2){
974: vi -=2;
975: i0 = vi[2];
976: i1 = vi[1];
977: tmp0 = tmps[i0];
978: tmp1 = tmps[i1];
979: v1 -= 2;
980: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
981: }
982: if (j==1){
983: tmp0 = tmps[*vi--];
984: sum1 -= *v1-- * tmp0;
985: }
986: x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
987: break;
988: case 2 :
989: sum1 = tmp[row];
990: sum2 = tmp[row -1];
991: v2 = aa + ai[row]-1;
992: for (j=nz ; j>1; j-=2){
993: vi -=2;
994: i0 = vi[2];
995: i1 = vi[1];
996: tmp0 = tmps[i0];
997: tmp1 = tmps[i1];
998: v1 -= 2;
999: v2 -= 2;
1000: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1001: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1002: }
1003: if (j==1){
1004: tmp0 = tmps[*vi--];
1005: sum1 -= *v1-- * tmp0;
1006: sum2 -= *v2-- * tmp0;
1007: }
1008:
1009: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1010: sum2 -= *v2-- * tmp0;
1011: x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1012: break;
1013: case 3 :
1014: sum1 = tmp[row];
1015: sum2 = tmp[row -1];
1016: sum3 = tmp[row -2];
1017: v2 = aa + ai[row]-1;
1018: v3 = aa + ai[row -1]-1;
1019: for (j=nz ; j>1; j-=2){
1020: vi -=2;
1021: i0 = vi[2];
1022: i1 = vi[1];
1023: tmp0 = tmps[i0];
1024: tmp1 = tmps[i1];
1025: v1 -= 2;
1026: v2 -= 2;
1027: v3 -= 2;
1028: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1029: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1030: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1031: }
1032: if (j==1){
1033: tmp0 = tmps[*vi--];
1034: sum1 -= *v1-- * tmp0;
1035: sum2 -= *v2-- * tmp0;
1036: sum3 -= *v3-- * tmp0;
1037: }
1038: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1039: sum2 -= *v2-- * tmp0;
1040: sum3 -= *v3-- * tmp0;
1041: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1042: sum3 -= *v3-- * tmp0;
1043: x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1044:
1045: break;
1046: case 4 :
1047: sum1 = tmp[row];
1048: sum2 = tmp[row -1];
1049: sum3 = tmp[row -2];
1050: sum4 = tmp[row -3];
1051: v2 = aa + ai[row]-1;
1052: v3 = aa + ai[row -1]-1;
1053: v4 = aa + ai[row -2]-1;
1055: for (j=nz ; j>1; j-=2){
1056: vi -=2;
1057: i0 = vi[2];
1058: i1 = vi[1];
1059: tmp0 = tmps[i0];
1060: tmp1 = tmps[i1];
1061: v1 -= 2;
1062: v2 -= 2;
1063: v3 -= 2;
1064: v4 -= 2;
1065: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1066: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1067: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1068: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1069: }
1070: if (j==1){
1071: tmp0 = tmps[*vi--];
1072: sum1 -= *v1-- * tmp0;
1073: sum2 -= *v2-- * tmp0;
1074: sum3 -= *v3-- * tmp0;
1075: sum4 -= *v4-- * tmp0;
1076: }
1078: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1079: sum2 -= *v2-- * tmp0;
1080: sum3 -= *v3-- * tmp0;
1081: sum4 -= *v4-- * tmp0;
1082: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1083: sum3 -= *v3-- * tmp0;
1084: sum4 -= *v4-- * tmp0;
1085: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1086: sum4 -= *v4-- * tmp0;
1087: x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1088: break;
1089: case 5 :
1090: sum1 = tmp[row];
1091: sum2 = tmp[row -1];
1092: sum3 = tmp[row -2];
1093: sum4 = tmp[row -3];
1094: sum5 = tmp[row -4];
1095: v2 = aa + ai[row]-1;
1096: v3 = aa + ai[row -1]-1;
1097: v4 = aa + ai[row -2]-1;
1098: v5 = aa + ai[row -3]-1;
1099: for (j=nz ; j>1; j-=2){
1100: vi -= 2;
1101: i0 = vi[2];
1102: i1 = vi[1];
1103: tmp0 = tmps[i0];
1104: tmp1 = tmps[i1];
1105: v1 -= 2;
1106: v2 -= 2;
1107: v3 -= 2;
1108: v4 -= 2;
1109: v5 -= 2;
1110: sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
1111: sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
1112: sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
1113: sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
1114: sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
1115: }
1116: if (j==1){
1117: tmp0 = tmps[*vi--];
1118: sum1 -= *v1-- * tmp0;
1119: sum2 -= *v2-- * tmp0;
1120: sum3 -= *v3-- * tmp0;
1121: sum4 -= *v4-- * tmp0;
1122: sum5 -= *v5-- * tmp0;
1123: }
1125: tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--;
1126: sum2 -= *v2-- * tmp0;
1127: sum3 -= *v3-- * tmp0;
1128: sum4 -= *v4-- * tmp0;
1129: sum5 -= *v5-- * tmp0;
1130: tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--;
1131: sum3 -= *v3-- * tmp0;
1132: sum4 -= *v4-- * tmp0;
1133: sum5 -= *v5-- * tmp0;
1134: tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--;
1135: sum4 -= *v4-- * tmp0;
1136: sum5 -= *v5-- * tmp0;
1137: tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--;
1138: sum5 -= *v5-- * tmp0;
1139: x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--;
1140: break;
1141: default:
1142: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1143: }
1144: }
1145: ISRestoreIndices(isrow,&rout);
1146: ISRestoreIndices(iscol,&cout);
1147: VecRestoreArray(bb,&b);
1148: VecRestoreArray(xx,&x);
1149: PetscLogFlops(2*a->nz - A->cmap.n);
1150: return(0);
1151: }
1155: PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B)
1156: {
1157: Mat C = *B;
1158: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data;
1159: IS iscol = b->col,isrow = b->row,isicol = b->icol;
1161: PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i;
1162: PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow;
1163: PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz;
1164: PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj;
1165: PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3;
1166: PetscScalar tmp,*ba = b->a,*aa = a->a,*pv;
1167: PetscReal rs=0.0;
1168: LUShift_Ctx sctx;
1169: PetscInt newshift;
1172: sctx.shift_top = 0;
1173: sctx.nshift_max = 0;
1174: sctx.shift_lo = 0;
1175: sctx.shift_hi = 0;
1177: /* if both shift schemes are chosen by user, only use info->shiftpd */
1178: if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0;
1179: if (info->shiftpd) { /* set sctx.shift_top=max{rs} */
1180: sctx.shift_top = 0;
1181: for (i=0; i<n; i++) {
1182: /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1183: rs = 0.0;
1184: ajtmp = aj + ai[i];
1185: rtmp1 = aa + ai[i];
1186: nz = ai[i+1] - ai[i];
1187: for (j=0; j<nz; j++){
1188: if (*ajtmp != i){
1189: rs += PetscAbsScalar(*rtmp1++);
1190: } else {
1191: rs -= PetscRealPart(*rtmp1++);
1192: }
1193: ajtmp++;
1194: }
1195: if (rs>sctx.shift_top) sctx.shift_top = rs;
1196: }
1197: if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
1198: sctx.shift_top *= 1.1;
1199: sctx.nshift_max = 5;
1200: sctx.shift_lo = 0.;
1201: sctx.shift_hi = 1.;
1202: }
1203: sctx.shift_amount = 0;
1204: sctx.nshift = 0;
1206: ISGetIndices(isrow,&r);
1207: ISGetIndices(iscol,&c);
1208: ISGetIndices(isicol,&ic);
1209: PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);
1210: PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));
1211: ics = ic ;
1212: rtmp2 = rtmp1 + n;
1213: rtmp3 = rtmp2 + n;
1214:
1215: node_max = a->inode.node_count;
1216: ns = a->inode.size ;
1217: if (!ns){
1218: SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information");
1219: }
1221: /* If max inode size > 3, split it into two inodes.*/
1222: /* also map the inode sizes according to the ordering */
1223: PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);
1224: for (i=0,j=0; i<node_max; ++i,++j){
1225: if (ns[i]>3) {
1226: tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */
1227: ++j;
1228: tmp_vec1[j] = ns[i] - tmp_vec1[j-1];
1229: } else {
1230: tmp_vec1[j] = ns[i];
1231: }
1232: }
1233: /* Use the correct node_max */
1234: node_max = j;
1236: /* Now reorder the inode info based on mat re-ordering info */
1237: /* First create a row -> inode_size_array_index map */
1238: PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);
1239: PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);
1240: for (i=0,row=0; i<node_max; i++) {
1241: nodesz = tmp_vec1[i];
1242: for (j=0; j<nodesz; j++,row++) {
1243: nsmap[row] = i;
1244: }
1245: }
1246: /* Using nsmap, create a reordered ns structure */
1247: for (i=0,j=0; i< node_max; i++) {
1248: nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1249: tmp_vec2[i] = nodesz;
1250: j += nodesz;
1251: }
1252: PetscFree(nsmap);
1253: PetscFree(tmp_vec1);
1254: /* Now use the correct ns */
1255: ns = tmp_vec2;
1257: do {
1258: sctx.lushift = PETSC_FALSE;
1259: /* Now loop over each block-row, and do the factorization */
1260: for (i=0,row=0; i<node_max; i++) {
1261: nodesz = ns[i];
1262: nz = bi[row+1] - bi[row];
1263: bjtmp = bj + bi[row];
1265: switch (nodesz){
1266: case 1:
1267: for (j=0; j<nz; j++){
1268: idx = bjtmp[j];
1269: rtmp1[idx] = 0.0;
1270: }
1271:
1272: /* load in initial (unfactored row) */
1273: idx = r[row];
1274: nz_tmp = ai[idx+1] - ai[idx];
1275: ajtmp = aj + ai[idx];
1276: v1 = aa + ai[idx];
1278: for (j=0; j<nz_tmp; j++) {
1279: idx = ics[ajtmp[j]];
1280: rtmp1[idx] = v1[j];
1281: }
1282: rtmp1[ics[r[row]]] += sctx.shift_amount;
1284: prow = *bjtmp++ ;
1285: while (prow < row) {
1286: pc1 = rtmp1 + prow;
1287: if (*pc1 != 0.0){
1288: pv = ba + bd[prow];
1289: pj = nbj + bd[prow];
1290: mul1 = *pc1 * *pv++;
1291: *pc1 = mul1;
1292: nz_tmp = bi[prow+1] - bd[prow] - 1;
1293: PetscLogFlops(2*nz_tmp);
1294: for (j=0; j<nz_tmp; j++) {
1295: tmp = pv[j];
1296: idx = pj[j];
1297: rtmp1[idx] -= mul1 * tmp;
1298: }
1299: }
1300: prow = *bjtmp++ ;
1301: }
1302: pj = bj + bi[row];
1303: pc1 = ba + bi[row];
1305: sctx.pv = rtmp1[row];
1306: rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */
1307: rs = 0.0;
1308: for (j=0; j<nz; j++) {
1309: idx = pj[j];
1310: pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */
1311: if (idx != row) rs += PetscAbsScalar(pc1[j]);
1312: }
1313: sctx.rs = rs;
1314: MatLUCheckShift_inline(info,sctx,row,newshift);
1315: if (newshift == 1) goto endofwhile;
1316: break;
1317:
1318: case 2:
1319: for (j=0; j<nz; j++) {
1320: idx = bjtmp[j];
1321: rtmp1[idx] = 0.0;
1322: rtmp2[idx] = 0.0;
1323: }
1324:
1325: /* load in initial (unfactored row) */
1326: idx = r[row];
1327: nz_tmp = ai[idx+1] - ai[idx];
1328: ajtmp = aj + ai[idx];
1329: v1 = aa + ai[idx];
1330: v2 = aa + ai[idx+1];
1331: for (j=0; j<nz_tmp; j++) {
1332: idx = ics[ajtmp[j]];
1333: rtmp1[idx] = v1[j];
1334: rtmp2[idx] = v2[j];
1335: }
1336: rtmp1[ics[r[row]]] += sctx.shift_amount;
1337: rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1339: prow = *bjtmp++ ;
1340: while (prow < row) {
1341: pc1 = rtmp1 + prow;
1342: pc2 = rtmp2 + prow;
1343: if (*pc1 != 0.0 || *pc2 != 0.0){
1344: pv = ba + bd[prow];
1345: pj = nbj + bd[prow];
1346: mul1 = *pc1 * *pv;
1347: mul2 = *pc2 * *pv;
1348: ++pv;
1349: *pc1 = mul1;
1350: *pc2 = mul2;
1351:
1352: nz_tmp = bi[prow+1] - bd[prow] - 1;
1353: for (j=0; j<nz_tmp; j++) {
1354: tmp = pv[j];
1355: idx = pj[j];
1356: rtmp1[idx] -= mul1 * tmp;
1357: rtmp2[idx] -= mul2 * tmp;
1358: }
1359: PetscLogFlops(4*nz_tmp);
1360: }
1361: prow = *bjtmp++ ;
1362: }
1364: /* Now take care of diagonal 2x2 block. Note: prow = row here */
1365: pc1 = rtmp1 + prow;
1366: pc2 = rtmp2 + prow;
1368: sctx.pv = *pc1;
1369: pj = bj + bi[prow];
1370: rs = 0.0;
1371: for (j=0; j<nz; j++){
1372: idx = pj[j];
1373: if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]);
1374: }
1375: sctx.rs = rs;
1376: MatLUCheckShift_inline(info,sctx,row,newshift);
1377: if (newshift == 1) goto endofwhile;
1379: if (*pc2 != 0.0){
1380: pj = nbj + bd[prow];
1381: mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/
1382: *pc2 = mul2;
1383: nz_tmp = bi[prow+1] - bd[prow] - 1;
1384: for (j=0; j<nz_tmp; j++) {
1385: idx = pj[j] ;
1386: tmp = rtmp1[idx];
1387: rtmp2[idx] -= mul2 * tmp;
1388: }
1389: PetscLogFlops(2*nz_tmp);
1390: }
1391:
1392: pj = bj + bi[row];
1393: pc1 = ba + bi[row];
1394: pc2 = ba + bi[row+1];
1396: sctx.pv = rtmp2[row+1];
1397: rs = 0.0;
1398: rtmp1[row] = 1.0/rtmp1[row];
1399: rtmp2[row+1] = 1.0/rtmp2[row+1];
1400: /* copy row entries from dense representation to sparse */
1401: for (j=0; j<nz; j++) {
1402: idx = pj[j];
1403: pc1[j] = rtmp1[idx];
1404: pc2[j] = rtmp2[idx];
1405: if (idx != row+1) rs += PetscAbsScalar(pc2[j]);
1406: }
1407: sctx.rs = rs;
1408: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1409: if (newshift == 1) goto endofwhile;
1410: break;
1412: case 3:
1413: for (j=0; j<nz; j++) {
1414: idx = bjtmp[j];
1415: rtmp1[idx] = 0.0;
1416: rtmp2[idx] = 0.0;
1417: rtmp3[idx] = 0.0;
1418: }
1419: /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
1420: idx = r[row];
1421: nz_tmp = ai[idx+1] - ai[idx];
1422: ajtmp = aj + ai[idx];
1423: v1 = aa + ai[idx];
1424: v2 = aa + ai[idx+1];
1425: v3 = aa + ai[idx+2];
1426: for (j=0; j<nz_tmp; j++) {
1427: idx = ics[ajtmp[j]];
1428: rtmp1[idx] = v1[j];
1429: rtmp2[idx] = v2[j];
1430: rtmp3[idx] = v3[j];
1431: }
1432: rtmp1[ics[r[row]]] += sctx.shift_amount;
1433: rtmp2[ics[r[row+1]]] += sctx.shift_amount;
1434: rtmp3[ics[r[row+2]]] += sctx.shift_amount;
1436: /* loop over all pivot row blocks above this row block */
1437: prow = *bjtmp++ ;
1438: while (prow < row) {
1439: pc1 = rtmp1 + prow;
1440: pc2 = rtmp2 + prow;
1441: pc3 = rtmp3 + prow;
1442: if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){
1443: pv = ba + bd[prow];
1444: pj = nbj + bd[prow];
1445: mul1 = *pc1 * *pv;
1446: mul2 = *pc2 * *pv;
1447: mul3 = *pc3 * *pv;
1448: ++pv;
1449: *pc1 = mul1;
1450: *pc2 = mul2;
1451: *pc3 = mul3;
1452:
1453: nz_tmp = bi[prow+1] - bd[prow] - 1;
1454: /* update this row based on pivot row */
1455: for (j=0; j<nz_tmp; j++) {
1456: tmp = pv[j];
1457: idx = pj[j];
1458: rtmp1[idx] -= mul1 * tmp;
1459: rtmp2[idx] -= mul2 * tmp;
1460: rtmp3[idx] -= mul3 * tmp;
1461: }
1462: PetscLogFlops(6*nz_tmp);
1463: }
1464: prow = *bjtmp++ ;
1465: }
1467: /* Now take care of diagonal 3x3 block in this set of rows */
1468: /* note: prow = row here */
1469: pc1 = rtmp1 + prow;
1470: pc2 = rtmp2 + prow;
1471: pc3 = rtmp3 + prow;
1473: sctx.pv = *pc1;
1474: pj = bj + bi[prow];
1475: rs = 0.0;
1476: for (j=0; j<nz; j++){
1477: idx = pj[j];
1478: if (idx != row) rs += PetscAbsScalar(rtmp1[idx]);
1479: }
1480: sctx.rs = rs;
1481: MatLUCheckShift_inline(info,sctx,row,newshift);
1482: if (newshift == 1) goto endofwhile;
1484: if (*pc2 != 0.0 || *pc3 != 0.0){
1485: mul2 = (*pc2)/(*pc1);
1486: mul3 = (*pc3)/(*pc1);
1487: *pc2 = mul2;
1488: *pc3 = mul3;
1489: nz_tmp = bi[prow+1] - bd[prow] - 1;
1490: pj = nbj + bd[prow];
1491: for (j=0; j<nz_tmp; j++) {
1492: idx = pj[j] ;
1493: tmp = rtmp1[idx];
1494: rtmp2[idx] -= mul2 * tmp;
1495: rtmp3[idx] -= mul3 * tmp;
1496: }
1497: PetscLogFlops(4*nz_tmp);
1498: }
1499: ++prow;
1501: pc2 = rtmp2 + prow;
1502: pc3 = rtmp3 + prow;
1503: sctx.pv = *pc2;
1504: pj = bj + bi[prow];
1505: rs = 0.0;
1506: for (j=0; j<nz; j++){
1507: idx = pj[j];
1508: if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]);
1509: }
1510: sctx.rs = rs;
1511: MatLUCheckShift_inline(info,sctx,row+1,newshift);
1512: if (newshift == 1) goto endofwhile;
1514: if (*pc3 != 0.0){
1515: mul3 = (*pc3)/(*pc2);
1516: *pc3 = mul3;
1517: pj = nbj + bd[prow];
1518: nz_tmp = bi[prow+1] - bd[prow] - 1;
1519: for (j=0; j<nz_tmp; j++) {
1520: idx = pj[j] ;
1521: tmp = rtmp2[idx];
1522: rtmp3[idx] -= mul3 * tmp;
1523: }
1524: PetscLogFlops(4*nz_tmp);
1525: }
1527: pj = bj + bi[row];
1528: pc1 = ba + bi[row];
1529: pc2 = ba + bi[row+1];
1530: pc3 = ba + bi[row+2];
1532: sctx.pv = rtmp3[row+2];
1533: rs = 0.0;
1534: rtmp1[row] = 1.0/rtmp1[row];
1535: rtmp2[row+1] = 1.0/rtmp2[row+1];
1536: rtmp3[row+2] = 1.0/rtmp3[row+2];
1537: /* copy row entries from dense representation to sparse */
1538: for (j=0; j<nz; j++) {
1539: idx = pj[j];
1540: pc1[j] = rtmp1[idx];
1541: pc2[j] = rtmp2[idx];
1542: pc3[j] = rtmp3[idx];
1543: if (idx != row+2) rs += PetscAbsScalar(pc3[j]);
1544: }
1546: sctx.rs = rs;
1547: MatLUCheckShift_inline(info,sctx,row+2,newshift);
1548: if (newshift == 1) goto endofwhile;
1549: break;
1551: default:
1552: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n");
1553: }
1554: row += nodesz; /* Update the row */
1555: }
1556: endofwhile:;
1557: } while (sctx.lushift);
1558: PetscFree(rtmp1);
1559: PetscFree(tmp_vec2);
1560: ISRestoreIndices(isicol,&ic);
1561: ISRestoreIndices(isrow,&r);
1562: ISRestoreIndices(iscol,&c);
1563: C->factor = FACTOR_LU;
1564: C->assembled = PETSC_TRUE;
1565: if (sctx.nshift) {
1566: if (info->shiftnz) {
1567: PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);
1568: } else if (info->shiftpd) {
1569: PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);
1570: }
1571: }
1572: PetscLogFlops(C->cmap.n);
1573: return(0);
1574: }
1576: /*
1577: Makes a longer coloring[] array and calls the usual code with that
1578: */
1581: PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring)
1582: {
1583: Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data;
1584: PetscErrorCode ierr;
1585: PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row;
1586: PetscInt *colorused,i;
1587: ISColoringValue *newcolor;
1590: PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);
1591: /* loop over inodes, marking a color for each column*/
1592: row = 0;
1593: for (i=0; i<m; i++){
1594: for (j=0; j<ns[i]; j++) {
1595: newcolor[row++] = coloring[i] + j*ncolors;
1596: }
1597: }
1599: /* eliminate unneeded colors */
1600: PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);
1601: PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));
1602: for (i=0; i<n; i++) {
1603: colorused[newcolor[i]] = 1;
1604: }
1606: for (i=1; i<5*ncolors; i++) {
1607: colorused[i] += colorused[i-1];
1608: }
1609: ncolors = colorused[5*ncolors-1];
1610: for (i=0; i<n; i++) {
1611: newcolor[i] = colorused[newcolor[i]];
1612: }
1613: PetscFree(colorused);
1614: ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);
1615: PetscFree(coloring);
1616: return(0);
1617: }
1619: /*
1620: samestructure indicates that the matrix has not changed its nonzero structure so we
1621: do not need to recompute the inodes
1622: */
1625: PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure)
1626: {
1627: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1629: PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size;
1630: PetscTruth flag;
1633: if (!a->inode.use) return(0);
1634: if (a->inode.checked && samestructure) return(0);
1637: m = A->rmap.n;
1638: if (a->inode.size) {ns = a->inode.size;}
1639: else {PetscMalloc((m+1)*sizeof(PetscInt),&ns);}
1641: i = 0;
1642: node_count = 0;
1643: idx = a->j;
1644: ii = a->i;
1645: while (i < m){ /* For each row */
1646: nzx = ii[i+1] - ii[i]; /* Number of nonzeros */
1647: /* Limits the number of elements in a node to 'a->inode.limit' */
1648: for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) {
1649: nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */
1650: if (nzy != nzx) break;
1651: idy += nzx; /* Same nonzero pattern */
1652: PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);
1653: if (!flag) break;
1654: }
1655: ns[node_count++] = blk_size;
1656: idx += blk_size*nzx;
1657: i = j;
1658: }
1659: /* If not enough inodes found,, do not use inode version of the routines */
1660: if (!a->inode.size && m && node_count > 0.9*m) {
1661: PetscFree(ns);
1662: a->inode.node_count = 0;
1663: a->inode.size = PETSC_NULL;
1664: a->inode.use = PETSC_FALSE;
1665: PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);
1666: } else {
1667: A->ops->mult = MatMult_Inode;
1668: A->ops->multadd = MatMultAdd_Inode;
1669: A->ops->solve = MatSolve_Inode;
1670: A->ops->lufactornumeric = MatLUFactorNumeric_Inode;
1671: A->ops->getrowij = MatGetRowIJ_Inode;
1672: A->ops->restorerowij = MatRestoreRowIJ_Inode;
1673: A->ops->getcolumnij = MatGetColumnIJ_Inode;
1674: A->ops->restorecolumnij = MatRestoreColumnIJ_Inode;
1675: A->ops->coloringpatch = MatColoringPatch_Inode;
1676: a->inode.node_count = node_count;
1677: a->inode.size = ns;
1678: PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);
1679: }
1680: return(0);
1681: }
1683: /*
1684: This is really ugly. if inodes are used this replaces the
1685: permutations with ones that correspond to rows/cols of the matrix
1686: rather then inode blocks
1687: */
1690: PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm)
1691: {
1692: PetscErrorCode ierr,(*f)(Mat,IS*,IS*);
1695: PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);
1696: if (f) {
1697: (*f)(A,rperm,cperm);
1698: }
1699: return(0);
1700: }
1705: PetscErrorCode MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm)
1706: {
1707: Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data;
1709: PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count;
1710: PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx;
1711: PetscInt nslim_col,*ns_col;
1712: IS ris = *rperm,cis = *cperm;
1715: if (!a->inode.size) return(0); /* no inodes so return */
1716: if (a->inode.node_count == m) return(0); /* all inodes are of size 1 */
1718: Mat_CreateColInode(A,&nslim_col,&ns_col);
1719: PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);
1720: PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);
1721: permc = permr + m;
1723: ISGetIndices(ris,&ridx);
1724: ISGetIndices(cis,&cidx);
1726: /* Form the inode structure for the rows of permuted matric using inv perm*/
1727: for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i];
1729: /* Construct the permutations for rows*/
1730: for (i=0,row = 0; i<nslim_row; ++i){
1731: indx = ridx[i];
1732: start_val = tns[indx];
1733: end_val = tns[indx + 1];
1734: for (j=start_val; j<end_val; ++j,++row) permr[row]= j;
1735: }
1737: /* Form the inode structure for the columns of permuted matrix using inv perm*/
1738: for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i];
1740: /* Construct permutations for columns */
1741: for (i=0,col=0; i<nslim_col; ++i){
1742: indx = cidx[i];
1743: start_val = tns[indx];
1744: end_val = tns[indx + 1];
1745: for (j = start_val; j<end_val; ++j,++col) permc[col]= j;
1746: }
1748: ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);
1749: ISSetPermutation(*rperm);
1750: ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);
1751: ISSetPermutation(*cperm);
1752:
1753: ISRestoreIndices(ris,&ridx);
1754: ISRestoreIndices(cis,&cidx);
1756: PetscFree(ns_col);
1757: PetscFree(permr);
1758: ISDestroy(cis);
1759: ISDestroy(ris);
1760: PetscFree(tns);
1761: return(0);
1762: }
1767: /*@C
1768: MatInodeGetInodeSizes - Returns the inode information of the Inode matrix.
1770: Collective on Mat
1772: Input Parameter:
1773: . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ
1775: Output Parameter:
1776: + node_count - no of inodes present in the matrix.
1777: . sizes - an array of size node_count,with sizes of each inode.
1778: - limit - the max size used to generate the inodes.
1780: Level: advanced
1782: Notes: This routine returns some internal storage information
1783: of the matrix, it is intended to be used by advanced users.
1784: It should be called after the matrix is assembled.
1785: The contents of the sizes[] array should not be changed.
1786: PETSC_NULL may be passed for information not requested.
1788: .keywords: matrix, seqaij, get, inode
1790: .seealso: MatGetInfo()
1791: @*/
1792: PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1793: {
1794: PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*);
1797: if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix");
1798: PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);
1799: if (f) {
1800: (*f)(A,node_count,sizes,limit);
1801: }
1802: return(0);
1803: }
1808: PetscErrorCode MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit)
1809: {
1810: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data;
1813: if (node_count) *node_count = a->inode.node_count;
1814: if (sizes) *sizes = a->inode.size;
1815: if (limit) *limit = a->inode.limit;
1816: return(0);
1817: }