#define PETSCMAT_DLL /* This file provides high performance routines for the Inode format (compressed sparse row) by taking advantage of rows with identical nonzero structure (I-nodes). */ #include "src/mat/impls/aij/seq/aij.h" #undef __FUNCT__ #define __FUNCT__ "Mat_CreateColInode" static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; PetscFunctionBegin; n = A->cmap.n; m = A->rmap.n; ns_row = a->inode.size; min_mn = (m < n) ? m : n; if (!ns) { for (count=0,i=0; count n) { /* Adjust for the over estimation */ ns_col[i-1] += n - count; } *size = i; *ns = ns_col; PetscFunctionReturn(0); } /* This builds symmetric version of nonzero structure, */ #undef __FUNCT__ #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; PetscFunctionBegin; nslim_row = a->inode.node_count; m = A->rmap.n; n = A->cmap.n; if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); /* Use the row_inode as column_inode */ nslim_col = nslim_row; ns_col = ns_row; /* allocate space for reformated inode structure */ ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); for (i1=0,tns[0]=0; i1data; PetscErrorCode ierr; PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; PetscFunctionBegin; nslim_row = a->inode.node_count; n = A->cmap.n; /* Create The column_inode for this matrix */ ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); /* allocate space for reformated column_inode structure */ ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); for (i1=0,tns[0]=0; i1 0) { /* off-diagonal elemets */ ia[i1+1]++; i2++; /* Start col of next node */ while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} if (nz > 0) i2 = tvc[col]; } } /* shift ia[i] to point to next row */ for (i1=1; i1 0) { ja[work[i1]++] = i2 + oshift; ++i2; while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} if (nz > 0) i2 = tvc[col]; } } ierr = PetscFree(ns_col);CHKERRQ(ierr); ierr = PetscFree(work);CHKERRQ(ierr); ierr = PetscFree(tns);CHKERRQ(ierr); ierr = PetscFree(tvc);CHKERRQ(ierr); PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatGetRowIJ_Inode" static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscFunctionBegin; *n = a->inode.node_count; if (!ia) PetscFunctionReturn(0); if (symmetric) { ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } else { ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatRestoreRowIJ_Inode" static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) { PetscErrorCode ierr; PetscFunctionBegin; if (!ia) PetscFunctionReturn(0); ierr = PetscFree(*ia);CHKERRQ(ierr); ierr = PetscFree(*ja);CHKERRQ(ierr); PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ #undef __FUNCT__ #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; PetscFunctionBegin; nslim_row = a->inode.node_count; n = A->cmap.n; /* Create The column_inode for this matrix */ ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); /* allocate space for reformated column_inode structure */ ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); for (i1=0,tns[0]=0; i1 0) { /* off-diagonal elemets */ /* ia[i1+1]++; */ ia[i2+1]++; i2++; while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} if (nz > 0) i2 = tvc[col]; } } /* shift ia[i] to point to next col */ for (i1=1; i1 0) { /* ja[work[i1]++] = i2 + oshift; */ ja[work[i2]++] = i1 + oshift; i2++; while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} if (nz > 0) i2 = tvc[col]; } } ierr = PetscFree(ns_col);CHKERRQ(ierr); ierr = PetscFree(work);CHKERRQ(ierr); ierr = PetscFree(tns);CHKERRQ(ierr); ierr = PetscFree(tvc);CHKERRQ(ierr); PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatGetColumnIJ_Inode" static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) { PetscErrorCode ierr; PetscFunctionBegin; ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); if (!ia) PetscFunctionReturn(0); if (symmetric) { /* Since the indices are symmetric it does'nt matter */ ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } else { ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); } PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatRestoreColumnIJ_Inode" static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) { PetscErrorCode ierr; PetscFunctionBegin; if (!ia) PetscFunctionReturn(0); ierr = PetscFree(*ia);CHKERRQ(ierr); ierr = PetscFree(*ja);CHKERRQ(ierr); PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ #undef __FUNCT__ #define __FUNCT__ "MatMult_Inode" static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y; PetscErrorCode ierr; PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; #if defined(PETSC_HAVE_PRAGMA_DISJOINT) #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) #endif PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); node_max = a->inode.node_count; ns = a->inode.size; /* Node Size array */ ierr = VecGetArray(xx,&x);CHKERRQ(ierr); ierr = VecGetArray(yy,&y);CHKERRQ(ierr); idx = a->j; v1 = a->a; ii = a->i; for (i = 0,row = 0; i< node_max; ++i){ nsz = ns[i]; n = ii[1] - ii[0]; ii += nsz; sz = n; /* No of non zeros in this row */ /* Switch on the size of Node */ switch (nsz){ /* Each loop in 'case' is unrolled */ case 1 : sum1 = 0; for(n = 0; n< sz-1; n+=2) { i1 = idx[0]; /* The instructions are ordered to */ i2 = idx[1]; /* make the compiler's job easy */ idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; } if (n == sz-1){ /* Take care of the last nonzero */ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; } y[row++]=sum1; break; case 2: sum1 = 0; sum2 = 0; v2 = v1 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; } if (n == sz-1){ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; } y[row++]=sum1; y[row++]=sum2; v1 =v2; /* Since the next block to be processed starts there*/ idx +=sz; break; case 3: sum1 = 0; sum2 = 0; sum3 = 0; v2 = v1 + n; v3 = v2 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; } if (n == sz-1){ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; v1 =v3; /* Since the next block to be processed starts there*/ idx +=2*sz; break; case 4: sum1 = 0; sum2 = 0; sum3 = 0; sum4 = 0; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; } if (n == sz-1){ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; sum4 += *v4++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; y[row++]=sum4; v1 =v4; /* Since the next block to be processed starts there*/ idx +=3*sz; break; case 5: sum1 = 0; sum2 = 0; sum3 = 0; sum4 = 0; sum5 = 0; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; v5 = v4 + n; for (n = 0; nnz - A->rmap.n);CHKERRQ(ierr); PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ /* Almost same code as the MatMult_Inode() */ #undef __FUNCT__ #define __FUNCT__ "MatMultAdd_Inode" static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt; PetscErrorCode ierr; PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; PetscFunctionBegin; if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); node_max = a->inode.node_count; ns = a->inode.size; /* Node Size array */ ierr = VecGetArray(xx,&x);CHKERRQ(ierr); ierr = VecGetArray(yy,&y);CHKERRQ(ierr); if (zz != yy) { ierr = VecGetArray(zz,&z);CHKERRQ(ierr); } else { z = y; } zt = z; idx = a->j; v1 = a->a; ii = a->i; for (i = 0,row = 0; i< node_max; ++i){ nsz = ns[i]; n = ii[1] - ii[0]; ii += nsz; sz = n; /* No of non zeros in this row */ /* Switch on the size of Node */ switch (nsz){ /* Each loop in 'case' is unrolled */ case 1 : sum1 = *zt++; for(n = 0; n< sz-1; n+=2) { i1 = idx[0]; /* The instructions are ordered to */ i2 = idx[1]; /* make the compiler's job easy */ idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; } if(n == sz-1){ /* Take care of the last nonzero */ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; } y[row++]=sum1; break; case 2: sum1 = *zt++; sum2 = *zt++; v2 = v1 + n; for(n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; } if(n == sz-1){ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; } y[row++]=sum1; y[row++]=sum2; v1 =v2; /* Since the next block to be processed starts there*/ idx +=sz; break; case 3: sum1 = *zt++; sum2 = *zt++; sum3 = *zt++; v2 = v1 + n; v3 = v2 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; } if (n == sz-1){ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; v1 =v3; /* Since the next block to be processed starts there*/ idx +=2*sz; break; case 4: sum1 = *zt++; sum2 = *zt++; sum3 = *zt++; sum4 = *zt++; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; for (n = 0; n< sz-1; n+=2) { i1 = idx[0]; i2 = idx[1]; idx += 2; tmp0 = x[i1]; tmp1 = x[i2]; sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; } if (n == sz-1){ tmp0 = x[*idx++]; sum1 += *v1++ * tmp0; sum2 += *v2++ * tmp0; sum3 += *v3++ * tmp0; sum4 += *v4++ * tmp0; } y[row++]=sum1; y[row++]=sum2; y[row++]=sum3; y[row++]=sum4; v1 =v4; /* Since the next block to be processed starts there*/ idx +=3*sz; break; case 5: sum1 = *zt++; sum2 = *zt++; sum3 = *zt++; sum4 = *zt++; sum5 = *zt++; v2 = v1 + n; v3 = v2 + n; v4 = v3 + n; v5 = v4 + n; for (n = 0; nnz);CHKERRQ(ierr); PetscFunctionReturn(0); } /* ----------------------------------------------------------- */ #undef __FUNCT__ #define __FUNCT__ "MatSolve_Inode" PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; IS iscol = a->col,isrow = a->row; PetscErrorCode ierr; PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j; PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1; PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5; PetscFunctionBegin; if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix"); if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); node_max = a->inode.node_count; ns = a->inode.size; /* Node Size array */ ierr = VecGetArray(bb,&b);CHKERRQ(ierr); ierr = VecGetArray(xx,&x);CHKERRQ(ierr); tmp = a->solve_work; ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); /* forward solve the lower triangular */ tmps = tmp ; aa = a_a ; aj = a_j ; ad = a->diag; for (i = 0,row = 0; i< node_max; ++i){ nsz = ns[i]; aii = ai[row]; v1 = aa + aii; vi = aj + aii; nz = ad[row]- aii; switch (nsz){ /* Each loop in 'case' is unrolled */ case 1 : sum1 = b[*r++]; /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ for(j=0; j=0; i--){ nsz = ns[i]; aii = ai[row+1] -1; v1 = aa + aii; vi = aj + aii; nz = aii- ad[row]; switch (nsz){ /* Each loop in 'case' is unrolled */ case 1 : sum1 = tmp[row]; for(j=nz ; j>1; j-=2){ vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; } if (j==1){ tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; } x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; break; case 2 : sum1 = tmp[row]; sum2 = tmp[row -1]; v2 = aa + ai[row]-1; for (j=nz ; j>1; j-=2){ vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; } if (j==1){ tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; break; case 3 : sum1 = tmp[row]; sum2 = tmp[row -1]; sum3 = tmp[row -2]; v2 = aa + ai[row]-1; v3 = aa + ai[row -1]-1; for (j=nz ; j>1; j-=2){ vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; v3 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; sum3 -= v3[2] * tmp0 + v3[1] * tmp1; } if (j==1){ tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; sum3 -= *v3-- * tmp0; x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; break; case 4 : sum1 = tmp[row]; sum2 = tmp[row -1]; sum3 = tmp[row -2]; sum4 = tmp[row -3]; v2 = aa + ai[row]-1; v3 = aa + ai[row -1]-1; v4 = aa + ai[row -2]-1; for (j=nz ; j>1; j-=2){ vi -=2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; v3 -= 2; v4 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; sum3 -= v3[2] * tmp0 + v3[1] * tmp1; sum4 -= v4[2] * tmp0 + v4[1] * tmp1; } if (j==1){ tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; sum4 -= *v4-- * tmp0; x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; break; case 5 : sum1 = tmp[row]; sum2 = tmp[row -1]; sum3 = tmp[row -2]; sum4 = tmp[row -3]; sum5 = tmp[row -4]; v2 = aa + ai[row]-1; v3 = aa + ai[row -1]-1; v4 = aa + ai[row -2]-1; v5 = aa + ai[row -3]-1; for (j=nz ; j>1; j-=2){ vi -= 2; i0 = vi[2]; i1 = vi[1]; tmp0 = tmps[i0]; tmp1 = tmps[i1]; v1 -= 2; v2 -= 2; v3 -= 2; v4 -= 2; v5 -= 2; sum1 -= v1[2] * tmp0 + v1[1] * tmp1; sum2 -= v2[2] * tmp0 + v2[1] * tmp1; sum3 -= v3[2] * tmp0 + v3[1] * tmp1; sum4 -= v4[2] * tmp0 + v4[1] * tmp1; sum5 -= v5[2] * tmp0 + v5[1] * tmp1; } if (j==1){ tmp0 = tmps[*vi--]; sum1 -= *v1-- * tmp0; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; } tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; sum2 -= *v2-- * tmp0; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; sum3 -= *v3-- * tmp0; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; sum4 -= *v4-- * tmp0; sum5 -= *v5-- * tmp0; tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; sum5 -= *v5-- * tmp0; x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; break; default: SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); } } ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr); PetscFunctionReturn(0); } #undef __FUNCT__ #define __FUNCT__ "MatLUFactorNumeric_Inode" PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B) { Mat C = *B; Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; IS iscol = b->col,isrow = b->row,isicol = b->icol; PetscErrorCode ierr; PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i; PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3; PetscScalar tmp,*ba = b->a,*aa = a->a,*pv; PetscReal rs=0.0; LUShift_Ctx sctx; PetscInt newshift; PetscFunctionBegin; sctx.shift_top = 0; sctx.nshift_max = 0; sctx.shift_lo = 0; sctx.shift_hi = 0; /* if both shift schemes are chosen by user, only use info->shiftpd */ if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ sctx.shift_top = 0; for (i=0; isctx.shift_top) sctx.shift_top = rs; } if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; sctx.shift_top *= 1.1; sctx.nshift_max = 5; sctx.shift_lo = 0.; sctx.shift_hi = 1.; } sctx.shift_amount = 0; sctx.nshift = 0; ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); ics = ic ; rtmp2 = rtmp1 + n; rtmp3 = rtmp2 + n; node_max = a->inode.node_count; ns = a->inode.size ; if (!ns){ SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); } /* If max inode size > 3, split it into two inodes.*/ /* also map the inode sizes according to the ordering */ ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); for (i=0,j=0; i3) { tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ ++j; tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; } else { tmp_vec1[j] = ns[i]; } } /* Use the correct node_max */ node_max = j; /* Now reorder the inode info based on mat re-ordering info */ /* First create a row -> inode_size_array_index map */ ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); for (i=0,row=0; i ba */ if (idx != row) rs += PetscAbsScalar(pc1[j]); } sctx.rs = rs; ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); if (newshift == 1) goto endofwhile; break; case 2: for (j=0; jfactor = FACTOR_LU; C->assembled = PETSC_TRUE; if (sctx.nshift) { if (info->shiftnz) { ierr = PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); } else if (info->shiftpd) { ierr = PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); } } ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr); PetscFunctionReturn(0); } /* Makes a longer coloring[] array and calls the usual code with that */ #undef __FUNCT__ #define __FUNCT__ "MatColoringPatch_Inode" PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; PetscErrorCode ierr; PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row; PetscInt *colorused,i; ISColoringValue *newcolor; PetscFunctionBegin; ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); /* loop over inodes, marking a color for each column*/ row = 0; for (i=0; icomm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); ierr = PetscFree(coloring);CHKERRQ(ierr); PetscFunctionReturn(0); } /* samestructure indicates that the matrix has not changed its nonzero structure so we do not need to recompute the inodes */ #undef __FUNCT__ #define __FUNCT__ "Mat_CheckInode" PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; PetscTruth flag; PetscFunctionBegin; if (!a->inode.use) PetscFunctionReturn(0); if (a->inode.checked && samestructure) PetscFunctionReturn(0); m = A->rmap.n; if (a->inode.size) {ns = a->inode.size;} else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} i = 0; node_count = 0; idx = a->j; ii = a->i; while (i < m){ /* For each row */ nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ /* Limits the number of elements in a node to 'a->inode.limit' */ for (j=i+1,idy=idx,blk_size=1; jinode.limit; ++j,++blk_size) { nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ if (nzy != nzx) break; idy += nzx; /* Same nonzero pattern */ ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); if (!flag) break; } ns[node_count++] = blk_size; idx += blk_size*nzx; i = j; } /* If not enough inodes found,, do not use inode version of the routines */ if (!a->inode.size && m && node_count > 0.9*m) { ierr = PetscFree(ns);CHKERRQ(ierr); a->inode.node_count = 0; a->inode.size = PETSC_NULL; a->inode.use = PETSC_FALSE; ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); } else { A->ops->mult = MatMult_Inode; A->ops->multadd = MatMultAdd_Inode; A->ops->solve = MatSolve_Inode; A->ops->lufactornumeric = MatLUFactorNumeric_Inode; A->ops->getrowij = MatGetRowIJ_Inode; A->ops->restorerowij = MatRestoreRowIJ_Inode; A->ops->getcolumnij = MatGetColumnIJ_Inode; A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; A->ops->coloringpatch = MatColoringPatch_Inode; a->inode.node_count = node_count; a->inode.size = ns; ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); } PetscFunctionReturn(0); } /* This is really ugly. if inodes are used this replaces the permutations with ones that correspond to rows/cols of the matrix rather then inode blocks */ #undef __FUNCT__ #define __FUNCT__ "MatInodeAdjustForInodes" PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) { PetscErrorCode ierr,(*f)(Mat,IS*,IS*); PetscFunctionBegin; ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); if (f) { ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); } PetscFunctionReturn(0); } EXTERN_C_BEGIN #undef __FUNCT__ #define __FUNCT__ "MatAdjustForInodes_Inode" PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) { Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; PetscErrorCode ierr; PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; PetscInt nslim_col,*ns_col; IS ris = *rperm,cis = *cperm; PetscFunctionBegin; if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); permc = permr + m; ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); /* Form the inode structure for the rows of permuted matric using inv perm*/ for (i=0,tns[0]=0; iassembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); if (f) { ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); } PetscFunctionReturn(0); } EXTERN_C_BEGIN #undef __FUNCT__ #define __FUNCT__ "MatInodeGetInodeSizes_Inode" PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscFunctionBegin; if (node_count) *node_count = a->inode.node_count; if (sizes) *sizes = a->inode.size; if (limit) *limit = a->inode.limit; PetscFunctionReturn(0); } EXTERN_C_END