Actual source code: csrperm.c
  1: #define PETSCMAT_DLL

  3: /*
  4:   Defines basic operations for the MATSEQCSRPERM matrix class.
  5:   This class is derived from the MATSEQAIJ class and retains the 
  6:   compressed row storage (aka Yale sparse matrix format) but augments 
  7:   it with some permutation information that enables some operations 
  8:   to be more vectorizable.  A physically rearranged copy of the matrix 
  9:   may be stored if the user desires.

 11:   Eventually a variety of permutations may be supported.
 12: */

 14:  #include src/mat/impls/aij/seq/aij.h

 16: #define NDIM 512
 17:     /* NDIM specifies how many rows at a time we should work with when 
 18:      * performing the vectorized mat-vec.  This depends on various factors 
 19:      * such as vector register length, etc., and I really need to add a 
 20:      * way for the user (or the library) to tune this.  I'm setting it to  
 21:      * 512 for now since that is what Ed D'Azevedo was using in his Fortran 
 22:      * routines. */

 24: typedef struct {
 25:   PetscInt  ngroup;
 26:   PetscInt *xgroup;
 27:     /* Denotes where groups of rows with same number of nonzeros 
 28:      * begin and end, i.e., xgroup[i] gives us the position in iperm[] 
 29:      * where the ith group begins. */
 30:   PetscInt *nzgroup; /*  how many nonzeros each row that is a member of group i has. */
 31:   PetscInt *iperm;  /* The permutation vector. */

 33:   /* Flag that indicates whether we need to clean up permutation 
 34:    * information during the MatDestroy. */
 35:   PetscTruth CleanUpCSRPERM;

 37:   /* Some of this stuff is for Ed's recursive triangular solve.
 38:    * I'm not sure what I need yet. */
 39:   PetscInt blocksize;
 40:   PetscInt nstep;
 41:   PetscInt *jstart_list;
 42:   PetscInt *jend_list;
 43:   PetscInt *action_list;
 44:   PetscInt *ngroup_list;
 45:   PetscInt **ipointer_list;
 46:   PetscInt **xgroup_list;
 47:   PetscInt **nzgroup_list;
 48:   PetscInt **iperm_list;

 50:   /* We need to keep a pointer to MatAssemblyEnd_SeqAIJ because we 
 51:    * actually want to call this function from within the 
 52:    * MatAssemblyEnd_SeqCSRPERM function.  Similarly, we also need 
 53:    * MatDestroy_SeqAIJ and MatDuplicate_SeqAIJ. */
 54:   PetscErrorCode (*AssemblyEnd_SeqAIJ)(Mat,MatAssemblyType);
 55:   PetscErrorCode (*MatDestroy_SeqAIJ)(Mat);
 56:   PetscErrorCode (*MatDuplicate_SeqAIJ)(Mat,MatDuplicateOption,Mat*);
 57: } Mat_SeqCSRPERM;

 62: PetscErrorCode  MatConvert_SeqCSRPERM_SeqAIJ(Mat A,MatType type,MatReuse reuse,Mat *newmat)
 63: {
 64:   /* This routine is only called to convert a MATCSRPERM to its base PETSc type, */
 65:   /* so we will ignore 'MatType type'. */
 67:   Mat            B = *newmat;
 68:   Mat_SeqCSRPERM *csrperm=(Mat_SeqCSRPERM*)A->spptr;

 71:   if (reuse == MAT_INITIAL_MATRIX) {
 72:     MatDuplicate(A,MAT_COPY_VALUES,&B);
 73:   }

 75:   /* Reset the original function pointers. */
 76:   B->ops->assemblyend = csrperm->AssemblyEnd_SeqAIJ;
 77:   B->ops->destroy   = csrperm->MatDestroy_SeqAIJ;
 78:   B->ops->duplicate = csrperm->MatDuplicate_SeqAIJ;

 80:   /* Free everything in the Mat_SeqCSRPERM data structure. 
 81:    * We don't free the Mat_SeqCSRPERM struct itself, as this will 
 82:    * cause problems later when MatDestroy() tries to free it. */
 83:   if(csrperm->CleanUpCSRPERM) {
 84:     PetscFree(csrperm->xgroup);
 85:     PetscFree(csrperm->nzgroup);
 86:     PetscFree(csrperm->iperm);
 87:   }

 89:   /* Change the type of B to MATSEQAIJ. */
 90:   PetscObjectChangeTypeName( (PetscObject)B, MATSEQAIJ);
 91: 
 92:   *newmat = B;
 93:   return(0);
 94: }

 99: PetscErrorCode MatDestroy_SeqCSRPERM(Mat A)
100: {
102:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM *) A->spptr;

104:   /* We are going to convert A back into a SEQAIJ matrix, since we are 
105:    * eventually going to use MatDestroy_SeqAIJ() to destroy everything 
106:    * that is not specific to CSRPERM.
107:    * In preparation for this, reset the operations pointers in A to 
108:    * their SeqAIJ versions. */
109:   A->ops->assemblyend = csrperm->AssemblyEnd_SeqAIJ;
110:     /* I suppose I don't actually need to point A->ops->assemblyend back 
111:      * to the right thing, but I do it anyway for completeness. */
112:   A->ops->destroy   = csrperm->MatDestroy_SeqAIJ;
113:   A->ops->duplicate = csrperm->MatDuplicate_SeqAIJ;

115:   /* Free everything in the Mat_SeqCSRPERM data structure. 
116:    * Note that we don't need to free the Mat_SeqCSRPERM struct 
117:    * itself, as MatDestroy() will do so. */
118:   if(csrperm->CleanUpCSRPERM) {
119:     PetscFree(csrperm->xgroup);
120:     PetscFree(csrperm->nzgroup);
121:     PetscFree(csrperm->iperm);
122:   }

124:   /* Change the type of A back to SEQAIJ and use MatDestroy_SeqAIJ() 
125:    * to destroy everything that remains. */
126:   PetscObjectChangeTypeName( (PetscObject)A, MATSEQAIJ);
127:   /* Note that I don't call MatSetType().  I believe this is because that 
128:    * is only to be called when *building* a matrix.  I could be wrong, but 
129:    * that is how things work for the SuperLU matrix class. */
130:   (*A->ops->destroy)(A);
131:   return(0);
132: }

134: PetscErrorCode MatDuplicate_SeqCSRPERM(Mat A, MatDuplicateOption op, Mat *M)
135: {
137:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM *) A->spptr;
138:   Mat_SeqCSRPERM *csrperm_dest = (Mat_SeqCSRPERM *) (*M)->spptr;

141:   (*csrperm->MatDuplicate_SeqAIJ)(A,op,M);
142:   PetscMemcpy((*M)->spptr,csrperm,sizeof(Mat_SeqCSRPERM));
143:   /* Allocate space for, and copy the grouping and permutation info. 
144:    * I note that when the groups are initially determined in 
145:    * SeqCSRPERM_create_perm, xgroup and nzgroup may be sized larger than 
146:    * necessary.  But at this point, we know how large they need to be, and 
147:    * allocate only the necessary amount of memory.  So the duplicated matrix 
148:    * may actually use slightly less storage than the original! */
149:   PetscMalloc(A->rmap.n*sizeof(PetscInt), csrperm_dest->iperm);
150:   PetscMalloc((csrperm->ngroup+1)*sizeof(PetscInt), csrperm_dest->xgroup);
151:   PetscMalloc((csrperm->ngroup)*sizeof(PetscInt), csrperm_dest->nzgroup);
152:   PetscMemcpy(csrperm_dest->iperm,csrperm->iperm,sizeof(PetscInt)*A->rmap.n);
153:   PetscMemcpy(csrperm_dest->xgroup,csrperm->xgroup,sizeof(PetscInt)*(csrperm->ngroup+1));
154:   PetscMemcpy(csrperm_dest->nzgroup,csrperm->nzgroup,sizeof(PetscInt)*csrperm->ngroup);
155:   return(0);
156: }

160: PetscErrorCode SeqCSRPERM_create_perm(Mat A)
161: {
162:   PetscInt   m;  /* Number of rows in the matrix. */
163:   Mat_SeqAIJ *a = (Mat_SeqAIJ *)(A)->data;
164:   PetscInt   *ia;  /* From the CSR representation; points to the beginning  of each row. */
165:   PetscInt   maxnz; /* Maximum number of nonzeros in any row. */
166:   PetscInt   *rows_in_bucket;
167:     /* To construct the permutation, we sort each row into one of maxnz 
168:      * buckets based on how many nonzeros are in the row. */
169:   PetscInt   nz;
170:   PetscInt   *nz_in_row;  /* the number of nonzero elements in row k. */
171:   PetscInt   *ipnz;
172:     /* When constructing the iperm permutation vector, 
173:      * ipnz[nz] is used to point to the next place in the permutation vector 
174:      * that a row with nz nonzero elements should be placed.*/
175:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM*) A->spptr;
176:     /* Points to the MATSEQCSRPERM-specific data in the matrix B. */
178:   PetscInt       i, ngroup, istart, ipos;

180:   /* I really ought to put something in here to check if B is of 
181:    * type MATSEQCSRPERM and return an error code if it is not.
182:    * Come back and do this! */
183: 
184:   m  = A->rmap.n;
185:   ia = a->i;
186: 
187:   /* Allocate the arrays that will hold the permutation vector. */
188:   PetscMalloc(m*sizeof(PetscInt), &csrperm->iperm);

190:   /* Allocate some temporary work arrays that will be used in 
191:    * calculating the permuation vector and groupings. */
192:   PetscMalloc((m+1)*sizeof(PetscInt), &rows_in_bucket);
193:   PetscMalloc((m+1)*sizeof(PetscInt), &ipnz);
194:   PetscMalloc(m*sizeof(PetscInt), &nz_in_row);

196:   /* Now actually figure out the permutation and grouping. */

198:   /* First pass: Determine number of nonzeros in each row, maximum 
199:    * number of nonzeros in any row, and how many rows fall into each  
200:    * "bucket" of rows with same number of nonzeros. */
201:   maxnz = 0;
202:   for (i=0; i<m; i++) {
203:     nz_in_row[i] = ia[i+1]-ia[i];
204:     if (nz_in_row[i] > maxnz) maxnz = nz_in_row[i];
205:   }

207:   for (i=0; i<=maxnz; i++) {
208:     rows_in_bucket[i] = 0;
209:   }
210:   for (i=0; i<m; i++) {
211:     nz = nz_in_row[i];
212:     rows_in_bucket[nz]++;
213:   }

215:   /* Allocate space for the grouping info.  There will be at most (maxnz + 1) 
216:    * groups.  (It is maxnz + 1 instead of simply maxnz because there may be 
217:    * rows with no nonzero elements.)  If there are (maxnz + 1) groups, 
218:    * then xgroup[] must consist of (maxnz + 2) elements, since the last 
219:    * element of xgroup will tell us where the (maxnz + 1)th group ends.
220:    * We allocate space for the maximum number of groups; 
221:    * that is potentially a little wasteful, but not too much so.  
222:    * Perhaps I should fix it later. */
223:   PetscMalloc((maxnz+2)*sizeof(PetscInt), &csrperm->xgroup);
224:   PetscMalloc((maxnz+1)*sizeof(PetscInt), &csrperm->nzgroup);

226:   /* Second pass.  Look at what is in the buckets and create the groupings.
227:    * Note that it is OK to have a group of rows with no non-zero values. */
228:   ngroup = 0;
229:   istart = 0;
230:   for (i=0; i<=maxnz; i++) {
231:     if (rows_in_bucket[i] > 0) {
232:       csrperm->nzgroup[ngroup] = i;
233:       csrperm->xgroup[ngroup] = istart;
234:       ngroup++;
235:       istart += rows_in_bucket[i];
236:     }
237:   }

239:   csrperm->xgroup[ngroup] = istart;
240:   csrperm->ngroup = ngroup;

242:   /* Now fill in the permutation vector iperm. */
243:   ipnz[0] = 0;
244:   for (i=0; i<maxnz; i++) {
245:     ipnz[i+1] = ipnz[i] + rows_in_bucket[i];
246:   }

248:   for (i=0; i<m; i++) {
249:     nz = nz_in_row[i];
250:     ipos = ipnz[nz];
251:     csrperm->iperm[ipos] = i;
252:     ipnz[nz]++;
253:   }

255:   /* Clean up temporary work arrays. */
256:   PetscFree(rows_in_bucket);
257:   PetscFree(ipnz);
258:   PetscFree(nz_in_row);

260:   /* Since we've allocated some memory to hold permutation info, 
261:    * flip the CleanUpCSRPERM flag to true. */
262:   csrperm->CleanUpCSRPERM = PETSC_TRUE;
263:   return(0);
264: }


269: PetscErrorCode MatAssemblyEnd_SeqCSRPERM(Mat A, MatAssemblyType mode)
270: {
272:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM*) A->spptr;
273:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;

275:   if (mode == MAT_FLUSH_ASSEMBLY) return(0);
276: 
277:   /* Since a MATSEQCSRPERM matrix is really just a MATSEQAIJ with some 
278:    * extra information, call the AssemblyEnd routine for a MATSEQAIJ. 
279:    * I'm not sure if this is the best way to do this, but it avoids 
280:    * a lot of code duplication.
281:    * I also note that currently MATSEQCSRPERM doesn't know anything about 
282:    * the Mat_CompressedRow data structure that SeqAIJ now uses when there 
283:    * are many zero rows.  If the SeqAIJ assembly end routine decides to use 
284:    * this, this may break things.  (Don't know... haven't looked at it.) */
285:   a->inode.use = PETSC_FALSE;
286:   (*csrperm->AssemblyEnd_SeqAIJ)(A, mode);

288:   /* Now calculate the permutation and grouping information. */
289:   SeqCSRPERM_create_perm(A);
290:   return(0);
291: }

293:  #include src/inline/dot.h

297: PetscErrorCode MatMult_SeqCSRPERM(Mat A,Vec xx,Vec yy)
298: {
299:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
300:   PetscScalar    *x,*y,*aa;
302:   PetscInt       *aj,*ai;
303: #if !(defined(PETSC_USE_FORTRAN_KERNEL_MULTCSRPERM) && defined(notworking))
304:   PetscInt       i,j;
305: #endif

307:   /* Variables that don't appear in MatMult_SeqAIJ. */
308:   Mat_SeqCSRPERM *csrperm = (Mat_SeqCSRPERM *) A->spptr;
309:   PetscInt       *iperm;  /* Points to the permutation vector. */
310:   PetscInt       *xgroup;
311:     /* Denotes where groups of rows with same number of nonzeros 
312:      * begin and end in iperm. */
313:   PetscInt *nzgroup;
314:   PetscInt ngroup;
315:   PetscInt igroup;
316:   PetscInt jstart,jend;
317:     /* jstart is used in loops to denote the position in iperm where a 
318:      * group starts; jend denotes the position where it ends.
319:      * (jend + 1 is where the next group starts.) */
320:   PetscInt iold,nz;
321:   PetscInt istart,iend,isize;
322:   PetscInt ipos;
323:   PetscScalar yp[NDIM];
324:   PetscInt    ip[NDIM]; /* yp[] and ip[] are treated as vector "registers" for performing the mat-vec. */

326: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
327: #pragma disjoint(*x,*y,*aa)
328: #endif

331:   VecGetArray(xx,&x);
332:   VecGetArray(yy,&y);
333:   aj  = a->j;    /* aj[k] gives column index for element aa[k]. */
334:   aa  = a->a;  /* Nonzero elements stored row-by-row. */
335:   ai  = a->i;   /* ai[k] is the position in aa and aj where row k starts. */

337:   /* Get the info we need about the permutations and groupings. */
338:   iperm  = csrperm->iperm;
339:   ngroup = csrperm->ngroup;
340:   xgroup = csrperm->xgroup;
341:   nzgroup = csrperm->nzgroup;
342: 
343: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTCSRPERM) && defined(notworking)
344:   fortranmultcsrperm_(&m,x,ii,aj,aa,y);
345: #else

347:   for (igroup=0; igroup<ngroup; igroup++) {
348:     jstart = xgroup[igroup];
349:     jend   = xgroup[igroup+1] - 1;
350:     nz     = nzgroup[igroup];

352:     /* Handle the special cases where the number of nonzeros per row 
353:      * in the group is either 0 or 1. */
354:     if (nz == 0) {
355:       for (i=jstart; i<=jend; i++) {
356:         y[iperm[i]] = 0.0;
357:       }
358:     } else if (nz == 1) {
359:       for (i=jstart; i<=jend; i++) {
360:         iold = iperm[i];
361:         ipos = ai[iold];
362:         y[iold] = aa[ipos] * x[aj[ipos]];
363:       }
364:     } else {
365: 
366:       /* We work our way through the current group in chunks of NDIM rows 
367:        * at a time. */

369:       for (istart=jstart; istart<=jend; istart+=NDIM) {
370:         /* Figure out where the chunk of 'isize' rows ends in iperm.
371:          * 'isize may of course be less than NDIM for the last chunk. */
372:         iend = istart + (NDIM - 1);
373:         if (iend > jend) { iend = jend; }
374:         isize = iend - istart + 1;

376:         /* Initialize the yp[] array that will be used to hold part of 
377:          * the permuted results vector, and figure out where in aa each 
378:          * row of the chunk will begin. */
379:         for (i=0; i<isize; i++) {
380:           iold = iperm[istart + i];
381:             /* iold is a row number from the matrix A *before* reordering. */
382:           ip[i] = ai[iold];
383:             /* ip[i] tells us where the ith row of the chunk begins in aa. */
384:           yp[i] = (PetscScalar) 0.0;
385:         }

387:         /* If the number of zeros per row exceeds the number of rows in 
388:          * the chunk, we should vectorize along nz, that is, perform the 
389:          * mat-vec one row at a time as in the usual CSR case. */
390:         if (nz > isize) {
391: #if defined(PETSC_HAVE_CRAYC)
392: #pragma _CRI preferstream
393: #endif
394:           for (i=0; i<isize; i++) {
395: #if defined(PETSC_HAVE_CRAYC)
396: #pragma _CRI prefervector
397: #endif
398:             for (j=0; j<nz; j++) {
399:               ipos = ip[i] + j;
400:               yp[i] += aa[ipos] * x[aj[ipos]];
401:             }
402:           }
403:         } else {
404:         /* Otherwise, there are enough rows in the chunk to make it 
405:          * worthwhile to vectorize across the rows, that is, to do the 
406:          * matvec by operating with "columns" of the chunk. */
407:           for (j=0; j<nz; j++) {
408:             for(i=0; i<isize; i++) {
409:               ipos = ip[i] + j;
410:               yp[i] += aa[ipos] * x[aj[ipos]];
411:             }
412:           }
413:         }

415: #if defined(PETSC_HAVE_CRAYC)
416: #pragma _CRI ivdep
417: #endif
418:         /* Put results from yp[] into non-permuted result vector y. */
419:         for (i=0; i<isize; i++) {
420:           y[iperm[istart+i]] = yp[i];
421:         }
422:       } /* End processing chunk of isize rows of a group. */
423:     } /* End handling matvec for chunk with nz > 1. */
424:   } /* End loop over igroup. */
425: #endif
426:   PetscLogFlops(2*a->nz - A->rmap.n);
427:   VecRestoreArray(xx,&x);
428:   VecRestoreArray(yy,&y);
429:   return(0);
430: }


433: /* MatMultAdd_SeqCSRPERM() calculates yy = ww + A * xx.
434:  * Note that the names I used to designate the vectors differs from that 
435:  * used in MatMultAdd_SeqAIJ().  I did this to keep my notation consistent 
436:  * with the MatMult_SeqCSRPERM() routine, which is very similar to this one. */
437: /*
438:     I hate having virtually identical code for the mult and the multadd!!!
439: */
442: PetscErrorCode MatMultAdd_SeqCSRPERM(Mat A,Vec xx,Vec ww,Vec yy)
443: {
444:   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
445:   PetscScalar    *x,*y,*w,*aa;
447:   PetscInt       *aj,*ai;
448: #if !defined(PETSC_USE_FORTRAN_KERNEL_MULTADDCSRPERM)
449:   PetscInt       i,j;
450: #endif

452:   /* Variables that don't appear in MatMultAdd_SeqAIJ. */
453:   Mat_SeqCSRPERM *csrperm;
454:   PetscInt *iperm;  /* Points to the permutation vector. */
455:   PetscInt *xgroup;
456:     /* Denotes where groups of rows with same number of nonzeros 
457:      * begin and end in iperm. */
458:   PetscInt *nzgroup;
459:   PetscInt ngroup;
460:   PetscInt igroup;
461:   PetscInt jstart,jend;
462:     /* jstart is used in loops to denote the position in iperm where a 
463:      * group starts; jend denotes the position where it ends.
464:      * (jend + 1 is where the next group starts.) */
465:   PetscInt iold,nz;
466:   PetscInt istart,iend,isize;
467:   PetscInt ipos;
468:   PetscScalar yp[NDIM];
469:   PetscInt ip[NDIM];
470:     /* yp[] and ip[] are treated as vector "registers" for performing 
471:      * the mat-vec. */

473: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
474: #pragma disjoint(*x,*y,*aa)
475: #endif

478:   VecGetArray(xx,&x);
479:   VecGetArray(yy,&y);
480:   if (yy != ww) {
481:     VecGetArray(ww,&w);
482:   } else {
483:     w = y;
484:   }

486:   aj  = a->j;  /* aj[k] gives column index for element aa[k]. */
487:   aa  = a->a;  /* Nonzero elements stored row-by-row. */
488:   ai  = a->i;  /* ai[k] is the position in aa and aj where row k starts. */

490:   /* Get the info we need about the permutations and groupings. */
491:   csrperm = (Mat_SeqCSRPERM *) A->spptr;
492:   iperm = csrperm->iperm;
493:   ngroup = csrperm->ngroup;
494:   xgroup = csrperm->xgroup;
495:   nzgroup = csrperm->nzgroup;
496: 
497: #if defined(PETSC_USE_FORTRAN_KERNEL_MULTADDCSRPERM)
498:   fortranmultaddcsrperm_(&m,x,ii,aj,aa,y,w);
499: #else

501:   for(igroup=0; igroup<ngroup; igroup++) {
502:     jstart = xgroup[igroup];
503:     jend = xgroup[igroup+1] - 1;

505:     nz = nzgroup[igroup];

507:     /* Handle the special cases where the number of nonzeros per row 
508:      * in the group is either 0 or 1. */
509:     if(nz == 0) {
510:       for(i=jstart; i<=jend; i++) {
511:         iold = iperm[i];
512:         y[iold] = w[iold];
513:       }
514:     }
515:     else if(nz == 1) {
516:       for(i=jstart; i<=jend; i++) {
517:         iold = iperm[i];
518:         ipos = ai[iold];
519:         y[iold] = w[iold] + aa[ipos] * x[aj[ipos]];
520:       }
521:     }
522:     /* For the general case: */
523:     else {
524: 
525:       /* We work our way through the current group in chunks of NDIM rows 
526:        * at a time. */

528:       for(istart=jstart; istart<=jend; istart+=NDIM) {
529:         /* Figure out where the chunk of 'isize' rows ends in iperm.
530:          * 'isize may of course be less than NDIM for the last chunk. */
531:         iend = istart + (NDIM - 1);
532:         if(iend > jend) { iend = jend; }
533:         isize = iend - istart + 1;

535:         /* Initialize the yp[] array that will be used to hold part of 
536:          * the permuted results vector, and figure out where in aa each 
537:          * row of the chunk will begin. */
538:         for(i=0; i<isize; i++) {
539:           iold = iperm[istart + i];
540:             /* iold is a row number from the matrix A *before* reordering. */
541:           ip[i] = ai[iold];
542:             /* ip[i] tells us where the ith row of the chunk begins in aa. */
543:           yp[i] = w[iold];
544:         }

546:         /* If the number of zeros per row exceeds the number of rows in 
547:          * the chunk, we should vectorize along nz, that is, perform the 
548:          * mat-vec one row at a time as in the usual CSR case. */
549:         if(nz > isize) {
550: #if defined(PETSC_HAVE_CRAYC)
551: #pragma _CRI preferstream
552: #endif
553:           for(i=0; i<isize; i++) {
554: #if defined(PETSC_HAVE_CRAYC)
555: #pragma _CRI prefervector
556: #endif
557:             for(j=0; j<nz; j++) {
558:               ipos = ip[i] + j;
559:               yp[i] += aa[ipos] * x[aj[ipos]];
560:             }
561:           }
562:         }
563:         /* Otherwise, there are enough rows in the chunk to make it 
564:          * worthwhile to vectorize across the rows, that is, to do the 
565:          * matvec by operating with "columns" of the chunk. */
566:         else {
567:           for(j=0; j<nz; j++) {
568:             for(i=0; i<isize; i++) {
569:               ipos = ip[i] + j;
570:               yp[i] += aa[ipos] * x[aj[ipos]];
571:             }
572:           }
573:         }

575: #if defined(PETSC_HAVE_CRAYC)
576: #pragma _CRI ivdep
577: #endif
578:         /* Put results from yp[] into non-permuted result vector y. */
579:         for(i=0; i<isize; i++) {
580:           y[iperm[istart+i]] = yp[i];
581:         }
582:       } /* End processing chunk of isize rows of a group. */
583: 
584:     } /* End handling matvec for chunk with nz > 1. */
585:   } /* End loop over igroup. */

587: #endif
588:   PetscLogFlops(2*a->nz - A->rmap.n);
589:   VecRestoreArray(xx,&x);
590:   VecRestoreArray(yy,&y);
591:   if (yy != ww) {
592:     VecRestoreArray(ww,&w);
593:   }
594:   return(0);
595: }


598: /* MatConvert_SeqAIJ_SeqCSRPERM converts a SeqAIJ matrix into a 
599:  * SeqCSRPERM matrix.  This routine is called by the MatCreate_SeqCSRPERM() 
600:  * routine, but can also be used to convert an assembled SeqAIJ matrix 
601:  * into a SeqCSRPERM one. */
605: PetscErrorCode  MatConvert_SeqAIJ_SeqCSRPERM(Mat A,MatType type,MatReuse reuse,Mat *newmat)
606: {
608:   Mat            B = *newmat;
609:   Mat_SeqCSRPERM *csrperm;

612:   if (reuse == MAT_INITIAL_MATRIX) {
613:     MatDuplicate(A,MAT_COPY_VALUES,&B);
614:   }

616:   PetscNew(Mat_SeqCSRPERM,&csrperm);
617:   B->spptr = (void *) csrperm;

619:   csrperm->AssemblyEnd_SeqAIJ  = A->ops->assemblyend;
620:   csrperm->MatDestroy_SeqAIJ   = A->ops->destroy;
621:   csrperm->MatDuplicate_SeqAIJ = A->ops->duplicate;

623:   /* Set function pointers for methods that we inherit from AIJ but override. */
624:   B->ops->duplicate   = MatDuplicate_SeqCSRPERM;
625:   B->ops->assemblyend = MatAssemblyEnd_SeqCSRPERM;
626:   B->ops->destroy     = MatDestroy_SeqCSRPERM;
627:   B->ops->mult        = MatMult_SeqCSRPERM;
628:   B->ops->multadd     = MatMultAdd_SeqCSRPERM;

630:   /* If A has already been assembled, compute the permutation. */
631:   if (A->assembled == PETSC_TRUE) {
632:     SeqCSRPERM_create_perm(B);
633:   }
634: 
635:   PetscObjectComposeFunctionDynamic((PetscObject)B,"MatConvert_seqcsrperm_seqaij_C",
636:                                            "MatConvert_SeqCSRPERM_SeqAIJ",MatConvert_SeqCSRPERM_SeqAIJ);

638:   PetscObjectChangeTypeName((PetscObject)B,MATSEQCSRPERM);
639:   *newmat = B;
640:   return(0);
641: }


647: /*@C
648:    MatCreateSeqCSRPERM - Creates a sparse matrix of type SEQCSRPERM.
649:    This type inherits from AIJ, but calculates some additional permutation 
650:    information that is used to allow better vectorization of some 
651:    operations.  At the cost of increased storage, the AIJ formatted 
652:    matrix can be copied to a format in which pieces of the matrix are 
653:    stored in ELLPACK format, allowing the vectorized matrix multiply 
654:    routine to use stride-1 memory accesses.  As with the AIJ type, it is 
655:    important to preallocate matrix storage in order to get good assembly 
656:    performance.
657:    
658:    Collective on MPI_Comm

660:    Input Parameters:
661: +  comm - MPI communicator, set to PETSC_COMM_SELF
662: .  m - number of rows
663: .  n - number of columns
664: .  nz - number of nonzeros per row (same for all rows)
665: -  nnz - array containing the number of nonzeros in the various rows 
666:          (possibly different for each row) or PETSC_NULL

668:    Output Parameter:
669: .  A - the matrix 

671:    Notes:
672:    If nnz is given then nz is ignored

674:    Level: intermediate

676: .keywords: matrix, cray, sparse, parallel

678: .seealso: MatCreate(), MatCreateMPICSRPERM(), MatSetValues()
679: @*/
680: PetscErrorCode  MatCreateSeqCSRPERM(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
681: {

685:   MatCreate(comm,A);
686:   MatSetSizes(*A,m,n,m,n);
687:   MatSetType(*A,MATSEQCSRPERM);
688:   MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);
689:   return(0);
690: }


696: PetscErrorCode  MatCreate_SeqCSRPERM(Mat A)
697: {

701:   MatSetType(A,MATSEQAIJ);
702:   MatConvert_SeqAIJ_SeqCSRPERM(A,MATSEQCSRPERM,MAT_REUSE_MATRIX,&A);
703:   return(0);
704: }