Actual source code: ex2.c

  2: static char help[] = "Reads a simple unstructured grid from a file. Partitions it,\n\
  3: and distributes the grid data accordingly\n\n";

  5: /*T
  6:    Concepts: Mat^partitioning a matrix;
  7:    Processors: n
  8: T*/

 10: /*
 11:     Updates of this example MAY be found at 
 12:        http://www.mcs.anl.gov/petsc/src/dm/ao/examples/tutorials/ex2.c

 14:     This is a very basic, even crude, example of managing an unstructured
 15:     grid in parallel.

 17:     This particular code is for a Galerkin-style finite element method. 

 19:     After the calls below, each processor will have 
 20:       1) a list of elements it "owns"; for each "owned" element it will have the global
 21:          numbering of its three vertices, stored in gdata->ele;
 22:       2) a list of vertices it "owns"; for each "owned" vertex it will have the x and y
 23:          coordinates, stored in gdata->vert.

 25:     It will not have 
 26:       1) a list of ghost elements (since they are not needed for traditional
 27:          Galerkin-style finite element methods). For various finite volume methods
 28:          you may need the ghost element lists; these may be generated using the element
 29:          neighbor information given in the file database (a sketch follows DataPartitionElements() below).

 31:     To compute the local element stiffness matrix and load vector, each processor
 32:     will need the vertex coordinates for all of the vertices of the locally
 33:     "owned" elements.  This data could be obtained by doing the appropriate vector
 34:     scatter on the data stored in gdata->vert; the example itself does not demonstrate this, but a sketch of such a scatter follows DataMoveVertices() below.

 36:     Clearly writing a complete parallel unstructured grid code with PETSc is still
 37:     a good deal of work and requires a lot of application level coding.  BUT, at least
 38:     PETSc can manage all of the nonlinear and linear solvers (including matrix assembly 
 39:     etc.), which allows the programmer to concentrate his or her efforts on managing 
 40:     the unstructured grid. The PETSc team is developing additional library objects
 41:     to help manage parallel unstructured grid computations.  Unfortunately we have 
 42:     not had time to complete these yet, so the application programmer still must
 43:     manage much of the parallel grid manipulation as indicated below.

 45: */

 47: /* 
 48:   Include "petscmat.h" so that we can use matrices.
 49:   automatically includes:
 50:      petsc.h       - base PETSc routines   petscvec.h    - vectors
 51:      petscsys.h    - system routines       petscmat.h    - matrices
 52:      petscis.h     - index sets            petscviewer.h - viewers               

 54:   Including "petscao.h" allows use of the AO (application ordering) commands,
 55:   used below for renumbering the vertex numbers after the partitioning.

 57:   Include "petscbt.h" for managing logical bit arrays that are used to
 58:   conserve space. Note that the code does use order-N bit arrays on each
 59:   processor, so it is theoretically not scalable; however, even with 64 million
 60:   vertices the bit array temporarily requires only 8 megabytes of memory
 61:   (one bit per vertex), so one can still run very large problems with this
 62:   approach, since the bit arrays are freed before the vectors and matrices
 63:   are created.
 64: */
 65:  #include "petscmat.h"
 66:  #include "petscao.h"
 67:  #include "petscbt.h"

 69: /* 
 70:     This is the user-defined grid data context 
 71: */
 72: typedef struct {
 73:   PetscInt    n_vert,n_ele;
 74:   PetscInt    mlocal_vert,mlocal_ele;
 75:   PetscInt    *ele;
 76:   PetscScalar *vert;
 77:   PetscInt    *ia,*ja;
 78:   IS     isnewproc;
 79:   PetscInt    *localvert,nlocal; /* used to stash temporarily old global vertex number of new vertex */
 80: } GridData;

 82: /*
 83:    Variables on all processors:
 84:       n_vert          - total number of vertices
 85:       mlocal_vert     - number of vertices on this processor
 86:       vert            - x,y coordinates of local vertices

 88:       n_ele           - total number of elements
 89:       mlocal_ele      - number of elements on this processor
 90:       ele             - vertices of elements on this processor

 92:       ia, ja          - adjacency graph of elements (for partitioning)
 93:     
 94:    Variables on processor 0 during data reading from file:
 95:       mmlocal_vert[i] - number of vertices on each processor
 96:       tmpvert         - x,y coordinates of vertices on any processor (as read in)

 98:       mmlocal_ele[i]  - number of elements on each processor

100:       tmpia, tmpja    - adjacency graph of elements for other processors

102:    Notes:
103:    The code below has a great deal of IO (print statements). This is to allow one to track 
104:    the renumbering and movement of data among processors. In an actual 
105:    production run, IO of this type would be deactivated.

107:    To use the ParMETIS partitioner, run with the option -mat_partitioning_type parmetis;
108:    otherwise it defaults to the initial element partitioning induced when the data 
109:    is read in.

111:    To understand the parallel performance of this type of code, it is important
112:    to profile the time spent in various events in the code; running with the
113:    option -log_summary will indicate how much time is spent in the routines 
114:    below.   Of course, for very small problems, such as the sample grid used here,
115:    the profiling results are meaningless.
116: */
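
/*
   Example invocation (illustrative only; the executable name and the MPI launcher
   depend on the local build and environment):

       mpiexec -n 4 ./ex2 -mat_partitioning_type parmetis -log_summary

   This runs on four processes, selects the ParMETIS partitioner instead of the
   default partitioning, and prints a profiling summary at the end of the run.
*/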


127: int main(int argc,char **args)
128: {
130:   PetscEvent     READ_EVENT,PARTITION_ELEMENT_EVENT,MOVE_ELEMENT_EVENT;
131:   PetscEvent     PARTITION_VERTEX_EVENT,MOVE_VERTEX_EVENT;
132:   GridData       gdata;

134:   PetscInitialize(&argc,&args,(char *)0,help);


143:   DataRead(&gdata);
146:   DataPartitionElements(&gdata);
149:   DataMoveElements(&gdata);
152:   DataPartitionVertices(&gdata);
155:   DataMoveVertices(&gdata);
157:   DataDestroy(&gdata);

159:   PetscFinalize();
160:   return 0;
161: }


166: /*
167:      Reads in the grid data from a file; each processor is naively 
168:   assigned a continuous chunk of vertex and element data. Later the data
169:   will be partitioned and moved to the appropriate processor.
170: */
171: PetscErrorCode DataRead(GridData *gdata)
172: {
173:   PetscMPIInt    rank,size;
174:   PetscInt       n_vert,*mmlocal_vert,mlocal_vert,i,*ia,*ja,cnt,j;
175:   PetscInt       mlocal_ele,*mmlocal_ele,*ele,*tmpele,n_ele,net,a1,a2,a3;
176:   PetscInt       *iatmp,*jatmp;
178:   char           msg[128];
179:   PetscScalar    *vert,*tmpvert;
180:   MPI_Status     status;

183:   /*
184:      Processor 0 opens the file, reads in chunks of data and sends a portion off to
185:    each other processor.

187:      Note: For a truly scalable IO portion of the code, one would store
188:    the grid data in a binary file and use MPI-IO commands to have each
189:    processor read in the parts that it needs (a sketch follows this routine).
190:    However, in most circumstances involving up to, say, a million nodes and
191:    100 processors the approach here is fine.
192:   */
193:   MPI_Comm_size(PETSC_COMM_WORLD,&size);
194:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

196:   if (!rank) {
197:     FILE *fd;
198:     fd = fopen("usgdata","r"); if (!fd) SETERRQ(1,"Cannot open grid file");

200:     /* read in number of vertices */
201:     fgets(msg,128,fd);
202:     printf("File msg:%s",msg);
203:     fscanf(fd,"Number Vertices = %d\n",&n_vert);
204:     printf("Number of grid vertices %d\n",n_vert);

206:     /* broadcast number of vertices to all processors */
207:     MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
208:     mlocal_vert  = n_vert/size + ((n_vert % size) > 0);

210:     /* 
211:       Allocate enough room for the first processor to keep track of how many
212:       vertices are assigned to each processor. Vertices are split as evenly as
213:       possible among all processors (e.g., 10 vertices on 4 processors gives 3,3,2,2).
214:     */
215:     PetscMalloc(size*sizeof(PetscInt),&mmlocal_vert);
216:     for (i=0; i<size; i++) {
217:       mmlocal_vert[i] = n_vert/size + ((n_vert % size) > i);
218:       printf("Processor %d assigned %d vertices\n",i,mmlocal_vert[i]);
219:     }

221:     /*
222:        Read in vertices assigned to first processor
223:     */
224:     PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&vert);
225:     printf("Vertices assigned to processor 0\n");
226:     for (i=0; i<mlocal_vert; i++) {
227:       fscanf(fd,"%d %lf %lf\n",&cnt,vert+2*i,vert+2*i+1);
228:       printf("%d %g %g\n",cnt,PetscRealPart(vert[2*i]),PetscRealPart(vert[2*i+1]));
229:     }

231:     /* 
232:        Read in vertices for all the other processors 
233:     */
234:     PetscMalloc(2*mmlocal_vert[0]*sizeof(PetscScalar),&tmpvert);
235:     for (j=1; j<size; j++) {
236:       printf("Vertices assigned to processor %d\n",j);
237:       for (i=0; i<mmlocal_vert[j]; i++) {
238:         fscanf(fd,"%d %lf %lf\n",&cnt,tmpvert+2*i,tmpvert+2*i+1);
239:         printf("%d %g %g\n",cnt,PetscRealPart(tmpvert[2*i]),PetscRealPart(tmpvert[2*i+1]));
240:       }
241:       MPI_Send(tmpvert,2*mmlocal_vert[j],MPIU_SCALAR,j,0,PETSC_COMM_WORLD);
242:     }
243:     PetscFree(tmpvert);
244:     PetscFree(mmlocal_vert);

246:     fscanf(fd,"Number Elements = %d\n",&n_ele);
247:     printf("Number of grid elements %d\n",n_ele);

249:     /* 
250:        Broadcast number of elements to all processors
251:     */
252:     MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
253:     mlocal_ele  = n_ele/size + ((n_ele % size) > 0);

255:     /* 
256:       Allocate enough room for the first processor to keep track of how many 
257:       elements are assigned to each processor.
258:     */
259:     PetscMalloc(size*sizeof(PetscInt),&mmlocal_ele);
260:     for (i=0; i<size; i++) {
261:       mmlocal_ele[i] = n_ele/size + ((n_ele % size) > i);
262:       printf("Processor %d assigned %d elements\n",i,mmlocal_ele[i]);
263:     }
264: 
265:     /*
266:         read in element information for the first processor
267:     */
268:     PetscMalloc(3*mmlocal_ele[0]*sizeof(PetscInt),&ele);
269:     printf("Elements assigned to processor 0\n");
270:     for (i=0; i<mlocal_ele; i++) {
271:       fscanf(fd,"%d %d %d %d\n",&cnt,ele+3*i,ele+3*i+1,ele+3*i+2);
272:       printf("%d %d %d %d\n",cnt,ele[3*i],ele[3*i+1],ele[3*i+2]);
273:     }

275:     /* 
276:        Read in elements for all the other processors 
277:     */
278:     PetscMalloc(3*mmlocal_ele[0]*sizeof(PetscInt),&tmpele);
279:     for (j=1; j<size; j++) {
280:       printf("Elements assigned to processor %d\n",j);
281:       for (i=0; i<mmlocal_ele[j]; i++) {
282:         fscanf(fd,"%d %d %d %d\n",&cnt,tmpele+3*i,tmpele+3*i+1,tmpele+3*i+2);
283:         printf("%d %d %d %d\n",cnt,tmpele[3*i],tmpele[3*i+1],tmpele[3*i+2]);
284:       }
285:       MPI_Send(tmpele,3*mmlocal_ele[j],MPI_INT,j,0,PETSC_COMM_WORLD);
286:     }
287:     PetscFree(tmpele);

289:     /* 
290:          Read in element neighbors for processor 0.
291:          We don't know how many entries in ja[] to allocate, so we allocate
292:        3*(the number of local elements), which is the maximum it could be
293:     */
294:     PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&ia);
295:     PetscMalloc((3*mlocal_ele+1)*sizeof(PetscInt),&ja);
296:     net   = 0;
297:     ia[0] = 0;
298:     printf("Element neighbors on processor 0\n");
299:     fgets(msg,128,fd);
300:     for (i=0; i<mlocal_ele; i++) {
301:       fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
302:       printf("%d %d %d %d\n",cnt,a1,a2,a3);
303:       if (a1 >= 0) {ja[net++] = a1;}
304:       if (a2 >= 0) {ja[net++] = a2;}
305:       if (a3 >= 0) {ja[net++] = a3;}
306:       ia[i+1] = net;
307:     }

309:     printf("ia values for processor 0\n");
310:     for (i=0; i<mlocal_ele+1; i++) {
311:       printf("%d ",ia[i]);
312:     }
313:     printf("\n");
314:     printf("ja values for processor 0\n");
315:     for (i=0; i<ia[mlocal_ele]; i++) {
316:       printf("%d ",ja[i]);
317:     }
318:     printf("\n");

320:     /*
321:        Read in element neighbor information for all other processors
322:     */
323:     PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&iatmp);
324:     PetscMalloc((3*mlocal_ele+1)*sizeof(PetscInt),&jatmp);
325:     for (j=1; j<size; j++) {
326:       net   = 0;
327:       iatmp[0] = 0;
328:       printf("Element neighbors on processor %d\n",j);
329:       for (i=0; i<mmlocal_ele[j]; i++) {
330:         fscanf(fd,"%d %d %d %d\n",&cnt,&a1,&a2,&a3);
331:         printf("%d %d %d %d\n",cnt,a1,a2,a3);
332:         if (a1 >= 0) {jatmp[net++] = a1;}
333:         if (a2 >= 0) {jatmp[net++] = a2;}
334:         if (a3 >= 0) {jatmp[net++] = a3;}
335:         iatmp[i+1] = net;
336:       }

338:       printf("ia values for processor %d\n",j);
339:       for (i=0; i<mmlocal_ele[j]+1; i++) {
340:         printf("%d ",iatmp[i]);
341:       }
342:       printf("\n");
343:       printf("ja values for processor %d\n",j);
344:       for (i=0; i<iatmp[mmlocal_ele[j]]; i++) {
345:         printf("%d ",jatmp[i]);
346:       }
347:       printf("\n");

349:       /* send graph off to appropriate processor */
350:       MPI_Send(iatmp,mmlocal_ele[j]+1,MPI_INT,j,0,PETSC_COMM_WORLD);
351:       MPI_Send(jatmp,iatmp[mmlocal_ele[j]],MPI_INT,j,0,PETSC_COMM_WORLD);
352:     }
353:     PetscFree(iatmp);
354:     PetscFree(jatmp);
355:     PetscFree(mmlocal_ele);

357:     fclose(fd);
358:   } else {
359:     /*
360:         We are not the zeroth processor, so we do not open the file;
361:       rather, we wait for processor 0 to send us our data.
362:     */

364:     /* receive total number of vertices */
365:     MPI_Bcast(&n_vert,1,MPI_INT,0,PETSC_COMM_WORLD);
366:     mlocal_vert = n_vert/size + ((n_vert % size) > rank);

368:     /* receive vertices */
369:     PetscMalloc(2*(mlocal_vert+1)*sizeof(PetscScalar),&vert);
370:     MPI_Recv(vert,2*mlocal_vert,MPIU_SCALAR,0,0,PETSC_COMM_WORLD,&status);

372:     /* receive total number of elements */
373:     MPI_Bcast(&n_ele,1,MPI_INT,0,PETSC_COMM_WORLD);
374:     mlocal_ele = n_ele/size + ((n_ele % size) > rank);

376:     /* receive elements */
377:     PetscMalloc(3*(mlocal_ele+1)*sizeof(PetscInt),&ele);
378:     MPI_Recv(ele,3*mlocal_ele,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

380:     /* receive element adjacency graph */
381:     PetscMalloc((mlocal_ele+1)*sizeof(PetscInt),&ia);
382:     MPI_Recv(ia,mlocal_ele+1,MPI_INT,0,0,PETSC_COMM_WORLD,&status);

384:     PetscMalloc((ia[mlocal_ele]+1)*sizeof(PetscInt),&ja);
385:     MPI_Recv(ja,ia[mlocal_ele],MPI_INT,0,0,PETSC_COMM_WORLD,&status);
386:   }

388:   gdata->n_vert      = n_vert;
389:   gdata->n_ele       = n_ele;
390:   gdata->mlocal_vert = mlocal_vert;
391:   gdata->mlocal_ele  = mlocal_ele;
392:   gdata->ele         = ele;
393:   gdata->vert        = vert;

395:   gdata->ia          = ia;
396:   gdata->ja          = ja;

398:   return(0);
399: }
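
/*
    A minimal sketch of the "truly scalable" input alternative mentioned at the top of
  DataRead(). It assumes the grid vertices are stored in a binary file "usgdata.bin"
  whose layout is one int (the number of vertices) followed by the x,y coordinates of
  all vertices as doubles, and that PetscScalar is a real double (as the fscanf("%lf")
  calls above already assume). The file name, layout, and function name are
  illustrative assumptions, not part of the original example; error checking is omitted.
*/
PetscErrorCode DataReadVerticesMPIIO(PetscScalar **vert,PetscInt *mlocal_vert)
{
  MPI_File    fh;
  MPI_Offset  disp;
  PetscMPIInt rank,size;
  int         n_vert,rem,nloc,start;

  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  MPI_Comm_size(PETSC_COMM_WORLD,&size);

  /* all processes open the file collectively and read the vertex count from the header */
  MPI_File_open(PETSC_COMM_WORLD,"usgdata.bin",MPI_MODE_RDONLY,MPI_INFO_NULL,&fh);
  MPI_File_read_at_all(fh,0,&n_vert,1,MPI_INT,MPI_STATUS_IGNORE);

  /* same block distribution as DataRead(): the first n_vert%size ranks get one extra vertex */
  rem   = n_vert % size;
  nloc  = n_vert/size + (rem > rank);
  start = rank*(n_vert/size) + (rank < rem ? rank : rem);

  /* each process reads only its own contiguous chunk of x,y coordinates */
  PetscMalloc(2*(nloc+1)*sizeof(PetscScalar),vert);
  disp = (MPI_Offset)sizeof(int) + 2*(MPI_Offset)start*(MPI_Offset)sizeof(double);
  MPI_File_read_at_all(fh,disp,*vert,2*nloc,MPI_DOUBLE,MPI_STATUS_IGNORE);
  MPI_File_close(&fh);

  *mlocal_vert = nloc;
  return(0);
}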


404: /*
405:          Given the grid data spread across the processors, this routine determines a
406:    new partitioning of the CELLS (elements) to reduce the number of cut edges between
407:    cells (elements).
408: */
409: PetscErrorCode DataPartitionElements(GridData *gdata)
410: {
411:   Mat             Adj;                /* adjacency matrix */
412:   PetscInt        *ia,*ja;
413:   PetscInt        mlocal_ele,n_ele;
414:   PetscErrorCode  ierr;
415:   MatPartitioning part;
416:   IS              isnewproc;

419:   n_ele       = gdata->n_ele;
420:   mlocal_ele  = gdata->mlocal_ele;

422:   ia          = gdata->ia;
423:   ja          = gdata->ja;

425:   /*
426:       Create the adjacency graph matrix
427:   */
428:   MatCreateMPIAdj(PETSC_COMM_WORLD,mlocal_ele,n_ele,ia,ja,PETSC_NULL,&Adj);

430:   /*
431:       Create the partioning object
432:   */
433:   MatPartitioningCreate(PETSC_COMM_WORLD,&part);
434:   MatPartitioningSetAdjacency(part,Adj);
435:   MatPartitioningSetFromOptions(part);
436:   MatPartitioningApply(part,&isnewproc);
437:   MatPartitioningDestroy(part);

439:   /*
440:        isnewproc - indicates for each local element the new processor it is assigned to
441:   */
442:   PetscPrintf(PETSC_COMM_WORLD,"New processor assignment for each element\n");
443:   ISView(isnewproc,PETSC_VIEWER_STDOUT_WORLD);
444:   gdata->isnewproc = isnewproc;

446:   /*
447:       Free the adjacency graph data structures
448:   */
449:   MatDestroy(Adj);


452:   return(0);
453: }
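
/*
    A minimal sketch of building a ghost (off-processor neighbor) element list from the
  element adjacency information read in by DataRead(), as mentioned in the header
  comment at the top of the file. The function name is an illustrative assumption; it
  uses gdata->ia and gdata->ja, so it would have to be called before DataPartitionElements()
  hands those arrays to MatCreateMPIAdj(). Duplicate entries are not removed and error
  checking is omitted.
*/
PetscErrorCode DataListGhostElements(GridData *gdata,PetscInt *nghost,PetscInt **ghosts)
{
  PetscInt i,rstart,rend,cnt = 0,mlocal_ele = gdata->mlocal_ele;
  PetscInt *ia = gdata->ia,*ja = gdata->ja;

  /* the global numbers of locally owned elements form the contiguous range [rstart,rend) */
  MPI_Scan(&mlocal_ele,&rend,1,MPI_INT,MPI_SUM,PETSC_COMM_WORLD);
  rstart = rend - mlocal_ele;

  /* any neighbor whose global number lies outside the local range is a ghost element */
  PetscMalloc((ia[mlocal_ele]+1)*sizeof(PetscInt),ghosts);
  for (i=0; i<ia[mlocal_ele]; i++) {
    if (ja[i] < rstart || ja[i] >= rend) (*ghosts)[cnt++] = ja[i];
  }
  *nghost = cnt;
  return(0);
}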

457: /*
458:       Moves the grid element data to be on the correct processor for the new
459:    element partitioning.
460: */
461: PetscErrorCode DataMoveElements(GridData *gdata)
462: {
464:   PetscInt       *counts,i,*idx;
465:   PetscMPIInt    rank,size;
466:   Vec            vele,veleold;
467:   PetscScalar    *array;
468:   IS             isscat,isnum;
469:   VecScatter     vecscat;


473:   MPI_Comm_size(PETSC_COMM_WORLD,&size);
474:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

476:   /* 
477:       Determine how many elements are assigned to each processor 
478:   */
479:   PetscMalloc(size*sizeof(PetscInt),&counts);
480:   ISPartitioningCount(gdata->isnewproc,counts);

482:   /* 
483:      Create a vector to contain the newly ordered element information 

485:      Note: we use vectors to communicate this data since we can use the powerful
486:      VecScatter routines to get the data to the correct location. This is a little
487:      wasteful since the vectors hold double-precision numbers instead of integers,
488:      but since this is just a setup phase of the entire numerical computation that
489:      is called only once, it is not a measurable performance bottleneck.
490:   */
491:   VecCreate(PETSC_COMM_WORLD,&vele);
492:   VecSetSizes(vele,3*counts[rank],PETSC_DECIDE);
493:   VecSetFromOptions(vele);

495:   /* 
496:       Create an index set from the isnewproc index set to indicate the mapping TO 
497:   */
498:   ISPartitioningToNumbering(gdata->isnewproc,&isnum);
499:   ISDestroy(gdata->isnewproc);
500:   /* 
501:       There are three data items per cell (element): the global numbers of its three
502:     vertices (we convert them to double to use the scatter). One can think
503:     of the vectors as having a block size of 3; then there is one index in idx[] for each element.
504:   */
505:   ISGetIndices(isnum,&idx);
506:   for (i=0; i<gdata->mlocal_ele; i++) {
507:     idx[i] *= 3;
508:   }
509:   ISCreateBlock(PETSC_COMM_WORLD,3,gdata->mlocal_ele,idx,&isscat);
510:   ISRestoreIndices(isnum,&idx);
511:   ISDestroy(isnum);

513:   /* 
514:      Create a vector to contain the original vertex information for each element 
515:   */
516:   VecCreateSeq(PETSC_COMM_SELF,3*gdata->mlocal_ele,&veleold);
517:   VecGetArray(veleold,&array);
518:   for (i=0; i<3*gdata->mlocal_ele; i++) {
519:     array[i] = gdata->ele[i];
520:   }
521:   VecRestoreArray(veleold,&array);
522: 
523:   /* 
524:      Scatter the element vertex information (still in the original vertex ordering) to the correct processor
525:   */
526:   VecScatterCreate(veleold,PETSC_NULL,vele,isscat,&vecscat);
527:   ISDestroy(isscat);
528:   VecScatterBegin(veleold,vele,INSERT_VALUES,SCATTER_FORWARD,vecscat);
529:   VecScatterEnd(veleold,vele,INSERT_VALUES,SCATTER_FORWARD,vecscat);
530:   VecScatterDestroy(vecscat);
531:   VecDestroy(veleold);

533:   /* 
534:      Put the element vertex data into a new allocation of gdata->ele
535:   */
536:   PetscFree(gdata->ele);
537:   gdata->mlocal_ele = counts[rank];
538:   PetscFree(counts);
539:   PetscMalloc(3*gdata->mlocal_ele*sizeof(PetscInt),&gdata->ele);
540:   VecGetArray(vele,&array);
541:   for (i=0; i<3*gdata->mlocal_ele; i++) {
542:     gdata->ele[i] = (int)PetscRealPart(array[i]);
543:   }
544:   VecRestoreArray(vele,&array);
545:   VecDestroy(vele);

547:   PetscPrintf(PETSC_COMM_WORLD,"Old vertex numbering in new element ordering\n");
548:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
549:   for (i=0; i<gdata->mlocal_ele; i++) {
550:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
551:                             gdata->ele[3*i+2]);
552:   }
553:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

555:   return(0);
556: }

560: /*
561:          Given the newly partitioned cells (elements), this routine partitions the 
562:      vertices.

564:      The code is not completely scalable since it
565:      1) requires O(n_vert) bits of memory per processor,
566:      2) uses O(size) stages of communication, each of size O(n_vert) bits,
567:      3) is sequential (each processor marks its vertices ONLY after all processors
568:         to the left have marked theirs), and
569:      4) does O(n_vert) work on the last processor.

571:      The algorithm also does not take advantage of vertices that are "interior" to a
572:      processor's elements (that is, not contained in any element on another processor).
573:      A better algorithm would first have all processors determine their "interior" vertices and
574:      make sure they are retained on that processor before listing "boundary" vertices (a sketch of the interior-vertex test follows this routine).

576:      The algorithm is:
577:      a) each processor waits for a message from the left containing the mask of all marked vertices;
578:      b) it loops over all local elements, generating a list of vertices it will
579:         claim (not claiming ones that have already been marked in the bit array);
580:         it claims at most n_vert/size vertices;
581:      c) it sends the mask to the right.

583:      This is a quick-and-dirty implementation; it should work fine for many problems,
584:      but will need to be replaced once profiling shows that it takes a large amount of
585:      time. An advantage is that it requires no searching or sorting.
586:      
587: */
588: PetscErrorCode DataPartitionVertices(GridData *gdata)
589: {
590:   PetscInt       n_vert = gdata->n_vert,*localvert;
591:   PetscInt       mlocal_ele = gdata->mlocal_ele,*ele = gdata->ele,i,j,nlocal = 0,nmax;
592:   PetscMPIInt    rank,size;
594:   PetscBT        mask;
595:   MPI_Status     status;

598:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
599:   MPI_Comm_size(PETSC_COMM_WORLD,&size);

601:   /*
602:       Allocate space to store the bit array indicating which vertices have been marked
603:   */
604:   PetscBTCreate(n_vert,mask);

606:   /*
607:      All processors except the last can have a maximum of n_vert/size vertices assigned
608:      (because the last processor needs to handle any leftovers)
609:   */
610:   nmax = n_vert/size;
611:   if (rank == size-1) {
612:     nmax = n_vert;
613:   }

615:   /* 
616:      Receive list of marked vertices from left 
617:   */
618:   if (rank) {
619:     MPI_Recv(mask,PetscBTLength(n_vert),MPI_CHAR,rank-1,0,PETSC_COMM_WORLD,&status);
620:   }

622:   if (rank == size-1) {
623:     /* last processor gets all the rest */
624:     for (i=0; i<n_vert; i++) {
625:       if (!PetscBTLookup(mask,i)) {
626:         nlocal++;
627:       }
628:     }
629:     nmax = nlocal;
630:   }

632:   /* 
633:      Now that we know how many are local, allocate enough space for them and mark them
634:   */
635:   PetscMalloc((nmax+1)*sizeof(PetscInt),&localvert);

637:   /* generate local list and fill in mask */
638:   nlocal = 0;
639:   if (rank < size-1) {
640:     /* count my vertices */
641:     for (i=0; i<mlocal_ele; i++) {
642:       for (j=0; j<3; j++) {
643:         if (!PetscBTLookupSet(mask,ele[3*i+j])) {
644:           localvert[nlocal++] = ele[3*i+j];
645:           if (nlocal >= nmax) goto foundenough2;
646:         }
647:       }
648:     }
649:     foundenough2:;
650:   } else {
651:     /* last processor gets all the rest */
652:     for (i=0; i<n_vert; i++) {
653:       if (!PetscBTLookup(mask,i)) {
654:         localvert[nlocal++] = i;
655:       }
656:     }
657:   }
658:   /* 
659:       Send bit mask on to next processor
660:   */
661:   if (rank < size-1) {
662:     MPI_Send(mask,PetscBTLength(n_vert),MPI_CHAR,rank+1,0,PETSC_COMM_WORLD);
663:   }
664:   PetscBTDestroy(mask);

666:   gdata->localvert = localvert;
667:   gdata->nlocal    = nlocal;

669:   /* print lists of owned vertices */
670:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Number vertices assigned %d\n",rank,nlocal);
671:   PetscSynchronizedFlush(PETSC_COMM_WORLD);
672:   PetscIntView(nlocal,localvert,PETSC_VIEWER_STDOUT_WORLD);

674:   return(0);
675: }
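
/*
    A minimal sketch of the "interior" vertex test mentioned in the comment above:
  each processor marks the vertices touched by its local elements, and a global
  reduction counts how many processors touch each vertex; a vertex touched here and
  nowhere else is interior to this processor. The function name is an illustrative
  assumption; it would be called at the same point as DataPartitionVertices(), while
  gdata->ele still holds the old global vertex numbers. Like the routine above it
  uses O(n_vert) storage per processor; error checking is omitted.
*/
PetscErrorCode DataFindInteriorVertices(GridData *gdata,PetscInt *ninterior,PetscInt **interior)
{
  PetscInt n_vert = gdata->n_vert,mlocal_ele = gdata->mlocal_ele,*ele = gdata->ele;
  PetscInt i,j,cnt = 0,*touched,*owners;

  PetscMalloc(n_vert*sizeof(PetscInt),&touched);
  PetscMalloc(n_vert*sizeof(PetscInt),&owners);
  PetscMemzero(touched,n_vert*sizeof(PetscInt));

  /* mark every vertex that appears in a locally owned element */
  for (i=0; i<mlocal_ele; i++) {
    for (j=0; j<3; j++) touched[ele[3*i+j]] = 1;
  }

  /* owners[v] = number of processors whose local elements touch vertex v */
  MPI_Allreduce(touched,owners,n_vert,MPI_INT,MPI_SUM,PETSC_COMM_WORLD);

  /* a vertex touched locally and by no other processor is interior to this processor */
  PetscMalloc((n_vert+1)*sizeof(PetscInt),interior);
  for (i=0; i<n_vert; i++) {
    if (touched[i] && owners[i] == 1) (*interior)[cnt++] = i;
  }
  *ninterior = cnt;

  PetscFree(touched);
  PetscFree(owners);
  return(0);
}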

679: /*
680:      Given the partitioning of the vertices, this routine renumbers the element vertex lists for the
681:      new vertex numbering and moves the vertex coordinate values to the correct processor.
682: */
683: PetscErrorCode DataMoveVertices(GridData *gdata)
684: {
685:   AO             ao;
687:   PetscMPIInt    rank;
688:   PetscInt       i;
689:   Vec            vert,overt;
690:   VecScatter     vecscat;
691:   IS             isscat;
692:   PetscScalar    *avert;

695:   MPI_Comm_rank(PETSC_COMM_WORLD,&rank);

697:   /* ---------------------------------------------------------------------
698:       Create a global reordering of the vertex numbers
699:   */
700:   AOCreateBasic(PETSC_COMM_WORLD,gdata->nlocal,gdata->localvert,PETSC_NULL,&ao);

702:   /*
703:      Change the element vertex information to the new vertex numbering
704:   */
705:   AOApplicationToPetsc(ao,3*gdata->mlocal_ele,gdata->ele);
706:   PetscPrintf(PETSC_COMM_WORLD,"New vertex numbering in new element ordering\n");
707:   PetscSynchronizedPrintf(PETSC_COMM_WORLD,"Processor %d\n",rank);
708:   for (i=0; i<gdata->mlocal_ele; i++) {
709:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"%d %d %d %d\n",i,gdata->ele[3*i],gdata->ele[3*i+1],
710:                             gdata->ele[3*i+2]);
711:   }
712:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

714:   /*
715:      Destroy the AO that is no longer needed
716:   */
717:   AODestroy(ao);

719:   /* --------------------------------------------------------------------
720:       Finally, ship the vertex coordinate information to its owning process.
721:       Note that we do this in a way very similar to what was done for the element info.
722:   */
723:   /* create a vector to contain the newly ordered vertex information */
724:   VecCreateSeq(PETSC_COMM_SELF,2*gdata->nlocal,&vert);

726:   /* create a vector to contain the old ordered vertex information */
727:   VecCreateMPIWithArray(PETSC_COMM_WORLD,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,
728:                                &overt);

730:   /* 
731:       There are two data items per vertex, the x and y coordinates (i.e. one can think
732:     of the vectors as having a block size of 2, and there is one index in localvert[] for each block)
733:   */
734:   for (i=0; i<gdata->nlocal; i++) gdata->localvert[i] *= 2;
735:   ISCreateBlock(PETSC_COMM_WORLD,2,gdata->nlocal,gdata->localvert,&isscat);
736:   PetscFree(gdata->localvert);

738:   /* 
739:       Scatter the vertex coordinate information to the correct processor
740:   */
741:   VecScatterCreate(overt,isscat,vert,PETSC_NULL,&vecscat);
742:   ISDestroy(isscat);
743:   VecScatterBegin(overt,vert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
744:   VecScatterEnd(overt,vert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
745:   VecScatterDestroy(vecscat);

747:   VecDestroy(overt);
748:   PetscFree(gdata->vert);
749: 
750:   /*
751:         Put the resulting vertex coordinates into the gdata->vert array
752:   */
753:   PetscMalloc(2*gdata->nlocal*sizeof(PetscScalar),&gdata->vert);
754:   VecGetArray(vert,&avert);
755:   PetscMemcpy(gdata->vert,avert,2*gdata->nlocal*sizeof(PetscScalar));
756:   VecRestoreArray(vert,&avert);
757:   gdata->mlocal_vert = gdata->nlocal;
758:   VecDestroy(vert);

760:   PetscPrintf(PETSC_COMM_WORLD,"Vertex coordinates in new numbering\n");
761:   for (i=0; i<gdata->mlocal_vert; i++) {
762:     PetscSynchronizedPrintf(PETSC_COMM_WORLD,"(%g,%g)\n",PetscRealPart(gdata->vert[2*i]),PetscRealPart(gdata->vert[2*i+1]));
763:   }
764:   PetscSynchronizedFlush(PETSC_COMM_WORLD);

766:   return(0);
767: }
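
/*
    A minimal sketch of the vector scatter mentioned in the header comment at the top
  of the file: after DataMoveVertices() has been called, it gathers the coordinates of
  all vertices of the locally owned elements (including those owned by other
  processors). The function name and the destination layout (3 vertices x 2 coordinates
  per local element, with shared vertices duplicated) are illustrative assumptions;
  error checking is omitted. The caller would destroy *lvert when done.
*/
PetscErrorCode DataGatherElementCoordinates(GridData *gdata,Vec *lvert)
{
  Vec        overt;
  IS         isscat;
  VecScatter vecscat;
  PetscInt   i,*idx,mlocal_ele = gdata->mlocal_ele;

  /* parallel vector sharing the storage of the vertex coordinates (new numbering) */
  VecCreateMPIWithArray(PETSC_COMM_WORLD,2*gdata->mlocal_vert,PETSC_DECIDE,gdata->vert,&overt);

  /* sequential vector that will hold x,y for every vertex of every local element */
  VecCreateSeq(PETSC_COMM_SELF,2*3*mlocal_ele,lvert);

  /* one block of 2 scalars per element vertex; gdata->ele already holds the new global numbers */
  PetscMalloc(3*mlocal_ele*sizeof(PetscInt),&idx);
  for (i=0; i<3*mlocal_ele; i++) idx[i] = 2*gdata->ele[i];
  ISCreateBlock(PETSC_COMM_WORLD,2,3*mlocal_ele,idx,&isscat);
  PetscFree(idx);

  /* pull the needed coordinates from wherever they are stored onto this processor */
  VecScatterCreate(overt,isscat,*lvert,PETSC_NULL,&vecscat);
  ISDestroy(isscat);
  VecScatterBegin(overt,*lvert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterEnd(overt,*lvert,INSERT_VALUES,SCATTER_FORWARD,vecscat);
  VecScatterDestroy(vecscat);
  VecDestroy(overt);
  return(0);
}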


772: PetscErrorCode DataDestroy(GridData *gdata)
773: {

777:   PetscFree(gdata->ele);
778:   PetscFree(gdata->vert);
779:   return(0);
780: }