Actual source code: mpiov.c
1: /*
2: Routines to compute overlapping regions of a parallel MPI matrix
  3:   and to find submatrices that are shared across processors.
4: */
5: #include <../src/mat/impls/aij/seq/aij.h>
6: #include <../src/mat/impls/aij/mpi/mpiaij.h>
7: #include <petscbt.h>
8: #include <petscsf.h>
10: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat,PetscInt,IS*);
11: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat,PetscInt,char**,PetscInt*,PetscInt**,PetscTable*);
12: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat,PetscInt,PetscInt**,PetscInt**,PetscInt*);
13: extern PetscErrorCode MatGetRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
14: extern PetscErrorCode MatRestoreRow_MPIAIJ(Mat,PetscInt,PetscInt*,PetscInt**,PetscScalar**);
16: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat,PetscInt,IS*);
17: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat,PetscInt,IS*);
18: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat,PetscInt,PetscMPIInt,PetscMPIInt *,PetscInt *, PetscInt *,PetscInt **,PetscInt **);
19: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat,PetscInt,IS*,PetscInt,PetscInt *);
21: PetscErrorCode MatIncreaseOverlap_MPIAIJ(Mat C,PetscInt imax,IS is[],PetscInt ov)
22: {
24: PetscInt i;
27: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
28: for (i=0; i<ov; ++i) {
29: MatIncreaseOverlap_MPIAIJ_Once(C,imax,is);
30: }
31: return(0);
32: }
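/*
   The _Scalable variant below follows the same driver pattern but calls
   MatIncreaseOverlap_MPIAIJ_Once_Scalable(), which sizes its work arrays from the local
   number of nonzeros and builds the communication with PetscCommBuildTwoSided()/PetscSF,
   instead of the per-process tables of global size M used by MatIncreaseOverlap_MPIAIJ_Once()
   further below.
*/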
34: PetscErrorCode MatIncreaseOverlap_MPIAIJ_Scalable(Mat C,PetscInt imax,IS is[],PetscInt ov)
35: {
37: PetscInt i;
40: if (ov < 0) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
41: for (i=0; i<ov; ++i) {
42: MatIncreaseOverlap_MPIAIJ_Once_Scalable(C,imax,is);
43: }
44: return(0);
45: }
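/*
   One overlap sweep, scalable version.  Outline of the steps performed below:
     1) concatenate the indices of all nidx index sets;
     2) for each index, use PetscLayoutFindOwner() to decide whether the row is remote and
        record <owning rank, IS id, row>;
     3) build the two-sided communication pattern with PetscCommBuildTwoSided() and ship the
        <IS id, row> pairs to their owners through a PetscSF;
     4) the owners expand each received row into its column indices
        (MatIncreaseOverlap_MPIAIJ_Send_Scalable) and send the result back;
     5) the local rows are expanded (MatIncreaseOverlap_MPIAIJ_Local_Scalable) while the reply
        messages are in flight;
     6) the replies are merged into the index sets (MatIncreaseOverlap_MPIAIJ_Receive_Scalable).
*/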
47: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once_Scalable(Mat mat,PetscInt nidx,IS is[])
48: {
50: MPI_Comm comm;
51: PetscInt *length,length_i,tlength,*remoterows,nrrows,reducednrrows,*rrow_ranks,*rrow_isids,i,j;
52: PetscInt *tosizes,*tosizes_temp,*toffsets,*fromsizes,*todata,*fromdata;
53: PetscInt nrecvrows,*sbsizes = NULL,*sbdata = NULL;
54: const PetscInt *indices_i,**indices;
55: PetscLayout rmap;
56: PetscMPIInt rank,size,*toranks,*fromranks,nto,nfrom,owner;
57: PetscSF sf;
58: PetscSFNode *remote;
61: PetscObjectGetComm((PetscObject)mat,&comm);
62: MPI_Comm_rank(comm,&rank);
63: MPI_Comm_size(comm,&size);
64: /* get row map to determine where rows should be going */
65: MatGetLayouts(mat,&rmap,NULL);
 66:   /* retrieve the IS data and put it all together so that we
 67:    * can optimize communication
 68:    */
69: PetscMalloc2(nidx,(PetscInt ***)&indices,nidx,&length);
70: for (i=0,tlength=0; i<nidx; i++) {
71: ISGetLocalSize(is[i],&length[i]);
72: tlength += length[i];
73: ISGetIndices(is[i],&indices[i]);
74: }
75: /* find these rows on remote processors */
76: PetscCalloc3(tlength,&remoterows,tlength,&rrow_ranks,tlength,&rrow_isids);
77: PetscCalloc3(size,&toranks,2*size,&tosizes,size,&tosizes_temp);
78: nrrows = 0;
79: for (i=0; i<nidx; i++) {
80: length_i = length[i];
81: indices_i = indices[i];
82: for (j=0; j<length_i; j++) {
83: owner = -1;
84: PetscLayoutFindOwner(rmap,indices_i[j],&owner);
85: /* remote processors */
86: if (owner != rank) {
87: tosizes_temp[owner]++; /* number of rows to owner */
88: rrow_ranks[nrrows] = owner; /* processor */
89: rrow_isids[nrrows] = i; /* is id */
90: remoterows[nrrows++] = indices_i[j]; /* row */
91: }
92: }
93: ISRestoreIndices(is[i],&indices[i]);
94: }
95: PetscFree2(*(PetscInt***)&indices,length);
 96:   /* test whether we need to exchange messages;
 97:    * generally speaking, we do not need to exchange
 98:    * data when the overlap is 1
 99:    */
100: MPIU_Allreduce(&nrrows,&reducednrrows,1,MPIU_INT,MPIU_MAX,comm);
101:   /* we do not have any messages;
102:    * this usually corresponds to overlap 1
103:    */
104: if (!reducednrrows) {
105: PetscFree3(toranks,tosizes,tosizes_temp);
106: PetscFree3(remoterows,rrow_ranks,rrow_isids);
107: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
108: return(0);
109: }
110: nto = 0;
111:   /* send sizes and ranks for building a two-sided communication */
112: for (i=0; i<size; i++) {
113: if (tosizes_temp[i]) {
114: tosizes[nto*2] = tosizes_temp[i]*2; /* size */
115: tosizes_temp[i] = nto; /* a map from processor to index */
116: toranks[nto++] = i; /* processor */
117: }
118: }
119: PetscMalloc1(nto+1,&toffsets);
120: toffsets[0] = 0;
121: for (i=0; i<nto; i++) {
122: toffsets[i+1] = toffsets[i]+tosizes[2*i]; /* offsets */
123: tosizes[2*i+1] = toffsets[i]; /* offsets to send */
124: }
125: /* send information to other processors */
126: PetscCommBuildTwoSided(comm,2,MPIU_INT,nto,toranks,tosizes,&nfrom,&fromranks,&fromsizes);
127: nrecvrows = 0;
128: for (i=0; i<nfrom; i++) nrecvrows += fromsizes[2*i];
129: PetscMalloc1(nrecvrows,&remote);
130: nrecvrows = 0;
131: for (i=0; i<nfrom; i++) {
132: for (j=0; j<fromsizes[2*i]; j++) {
133: remote[nrecvrows].rank = fromranks[i];
134: remote[nrecvrows++].index = fromsizes[2*i+1]+j;
135: }
136: }
137: PetscSFCreate(comm,&sf);
138: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
139:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
140: PetscSFSetType(sf,PETSCSFBASIC);
141: PetscSFSetFromOptions(sf);
142:   /* message pair <IS id, row> */
143: PetscCalloc2(2*nrrows,&todata,nrecvrows,&fromdata);
144: for (i=0; i<nrrows; i++) {
145: owner = rrow_ranks[i]; /* processor */
146: j = tosizes_temp[owner]; /* index */
147: todata[toffsets[j]++] = rrow_isids[i];
148: todata[toffsets[j]++] = remoterows[i];
149: }
150: PetscFree3(toranks,tosizes,tosizes_temp);
151: PetscFree3(remoterows,rrow_ranks,rrow_isids);
152: PetscFree(toffsets);
153: PetscSFBcastBegin(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
154: PetscSFBcastEnd(sf,MPIU_INT,todata,fromdata,MPI_REPLACE);
155: PetscSFDestroy(&sf);
156:   /* send rows that belong to remote processes so that we can get the overlapping data back */
157: MatIncreaseOverlap_MPIAIJ_Send_Scalable(mat,nidx,nfrom,fromranks,fromsizes,fromdata,&sbsizes,&sbdata);
158: PetscFree2(todata,fromdata);
159: PetscFree(fromsizes);
160: PetscCommBuildTwoSided(comm,2,MPIU_INT,nfrom,fromranks,sbsizes,&nto,&toranks,&tosizes);
161: PetscFree(fromranks);
162: nrecvrows = 0;
163: for (i=0; i<nto; i++) nrecvrows += tosizes[2*i];
164: PetscCalloc1(nrecvrows,&todata);
165: PetscMalloc1(nrecvrows,&remote);
166: nrecvrows = 0;
167: for (i=0; i<nto; i++) {
168: for (j=0; j<tosizes[2*i]; j++) {
169: remote[nrecvrows].rank = toranks[i];
170: remote[nrecvrows++].index = tosizes[2*i+1]+j;
171: }
172: }
173: PetscSFCreate(comm,&sf);
174: PetscSFSetGraph(sf,nrecvrows,nrecvrows,NULL,PETSC_OWN_POINTER,remote,PETSC_OWN_POINTER);
175:   /* use two-sided communication by default since Open MPI has some bugs in its one-sided implementation */
176: PetscSFSetType(sf,PETSCSFBASIC);
177: PetscSFSetFromOptions(sf);
178: /* overlap communication and computation */
179: PetscSFBcastBegin(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
180: MatIncreaseOverlap_MPIAIJ_Local_Scalable(mat,nidx,is);
181: PetscSFBcastEnd(sf,MPIU_INT,sbdata,todata,MPI_REPLACE);
182: PetscSFDestroy(&sf);
183: PetscFree2(sbdata,sbsizes);
184: MatIncreaseOverlap_MPIAIJ_Receive_Scalable(mat,nidx,is,nrecvrows,todata);
185: PetscFree(toranks);
186: PetscFree(tosizes);
187: PetscFree(todata);
188: return(0);
189: }
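/*
   recvdata is a stream of triples <IS id, count, count column indices>, as packed by
   MatIncreaseOverlap_MPIAIJ_Send_Scalable() below; each triple is appended to the
   corresponding index set and duplicates are then removed.
*/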
191: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive_Scalable(Mat mat,PetscInt nidx, IS is[], PetscInt nrecvs, PetscInt *recvdata)
192: {
193: PetscInt *isz,isz_i,i,j,is_id, data_size;
194: PetscInt col,lsize,max_lsize,*indices_temp, *indices_i;
195: const PetscInt *indices_i_temp;
196: MPI_Comm *iscomms;
197: PetscErrorCode ierr;
200: max_lsize = 0;
201: PetscMalloc1(nidx,&isz);
202: for (i=0; i<nidx; i++) {
203: ISGetLocalSize(is[i],&lsize);
204: max_lsize = lsize>max_lsize ? lsize:max_lsize;
205: isz[i] = lsize;
206: }
207: PetscMalloc2((max_lsize+nrecvs)*nidx,&indices_temp,nidx,&iscomms);
208: for (i=0; i<nidx; i++) {
209: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
210: ISGetIndices(is[i],&indices_i_temp);
211: PetscArraycpy(indices_temp+i*(max_lsize+nrecvs),indices_i_temp, isz[i]);
212: ISRestoreIndices(is[i],&indices_i_temp);
213: ISDestroy(&is[i]);
214: }
215: /* retrieve information to get row id and its overlap */
216: for (i=0; i<nrecvs;) {
217: is_id = recvdata[i++];
218: data_size = recvdata[i++];
219: indices_i = indices_temp+(max_lsize+nrecvs)*is_id;
220: isz_i = isz[is_id];
221: for (j=0; j< data_size; j++) {
222: col = recvdata[i++];
223: indices_i[isz_i++] = col;
224: }
225: isz[is_id] = isz_i;
226: }
227:   /* remove duplicate entries */
228: for (i=0; i<nidx; i++) {
229: indices_i = indices_temp+(max_lsize+nrecvs)*i;
230: isz_i = isz[i];
231: PetscSortRemoveDupsInt(&isz_i,indices_i);
232: ISCreateGeneral(iscomms[i],isz_i,indices_i,PETSC_COPY_VALUES,&is[i]);
233: PetscCommDestroy(&iscomms[i]);
234: }
235: PetscFree(isz);
236: PetscFree2(indices_temp,iscomms);
237: return(0);
238: }
240: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Send_Scalable(Mat mat,PetscInt nidx, PetscMPIInt nfrom,PetscMPIInt *fromranks,PetscInt *fromsizes, PetscInt *fromrows, PetscInt **sbrowsizes, PetscInt **sbrows)
241: {
242: PetscLayout rmap,cmap;
243: PetscInt i,j,k,l,*rows_i,*rows_data_ptr,**rows_data,max_fszs,rows_pos,*rows_pos_i;
244: PetscInt is_id,tnz,an,bn,rstart,cstart,row,start,end,col,totalrows,*sbdata;
245: PetscInt *indv_counts,indvc_ij,*sbsizes,*indices_tmp,*offsets;
246: const PetscInt *gcols,*ai,*aj,*bi,*bj;
247: Mat amat,bmat;
248: PetscMPIInt rank;
249: PetscBool done;
250: MPI_Comm comm;
251: PetscErrorCode ierr;
254: PetscObjectGetComm((PetscObject)mat,&comm);
255: MPI_Comm_rank(comm,&rank);
256: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
257:   /* Even if the matrix is symmetric, we still treat it as nonsymmetric */
258: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
259:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
260: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
261:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
262: /* total number of nonzero values is used to estimate the memory usage in the next step */
263: tnz = ai[an]+bi[bn];
264: MatGetLayouts(mat,&rmap,&cmap);
265: PetscLayoutGetRange(rmap,&rstart,NULL);
266: PetscLayoutGetRange(cmap,&cstart,NULL);
267: /* to find the longest message */
268: max_fszs = 0;
269: for (i=0; i<nfrom; i++) max_fszs = fromsizes[2*i]>max_fszs ? fromsizes[2*i]:max_fszs;
270:   /* is there a better way to estimate the number of nonzeros in the matrix? */
271: PetscCalloc5(max_fszs*nidx,&rows_data_ptr,nidx,&rows_data,nidx,&rows_pos_i,nfrom*nidx,&indv_counts,tnz,&indices_tmp);
272: for (i=0; i<nidx; i++) rows_data[i] = rows_data_ptr+max_fszs*i;
273: rows_pos = 0;
274: totalrows = 0;
275: for (i=0; i<nfrom; i++) {
276: PetscArrayzero(rows_pos_i,nidx);
277: /* group data together */
278: for (j=0; j<fromsizes[2*i]; j+=2) {
279: is_id = fromrows[rows_pos++];/* no of is */
280: rows_i = rows_data[is_id];
281: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];/* row */
282: }
283: /* estimate a space to avoid multiple allocations */
284: for (j=0; j<nidx; j++) {
285: indvc_ij = 0;
286: rows_i = rows_data[j];
287: for (l=0; l<rows_pos_i[j]; l++) {
288: row = rows_i[l]-rstart;
289: start = ai[row];
290: end = ai[row+1];
291: for (k=start; k<end; k++) { /* Amat */
292: col = aj[k] + cstart;
293: indices_tmp[indvc_ij++] = col;/* do not count the rows from the original rank */
294: }
295: start = bi[row];
296: end = bi[row+1];
297: for (k=start; k<end; k++) { /* Bmat */
298: col = gcols[bj[k]];
299: indices_tmp[indvc_ij++] = col;
300: }
301: }
302: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
303: indv_counts[i*nidx+j] = indvc_ij;
304: totalrows += indvc_ij;
305: }
306: }
307:   /* message triple <IS id, number of rows, rows> */
308: PetscCalloc2(totalrows+nidx*nfrom*2,&sbdata,2*nfrom,&sbsizes);
309: totalrows = 0;
310: rows_pos = 0;
311:   /* repeat the same traversal, this time filling the send buffer */
312: for (i=0;i<nfrom;i++) {
313: PetscArrayzero(rows_pos_i,nidx);
314: for (j=0; j<fromsizes[2*i]; j+=2) {
315: is_id = fromrows[rows_pos++];
316: rows_i = rows_data[is_id];
317: rows_i[rows_pos_i[is_id]++] = fromrows[rows_pos++];
318: }
319: /* add data */
320: for (j=0; j<nidx; j++) {
321: if (!indv_counts[i*nidx+j]) continue;
322: indvc_ij = 0;
323: sbdata[totalrows++] = j;
324: sbdata[totalrows++] = indv_counts[i*nidx+j];
325: sbsizes[2*i] += 2;
326: rows_i = rows_data[j];
327: for (l=0; l<rows_pos_i[j]; l++) {
328: row = rows_i[l]-rstart;
329: start = ai[row];
330: end = ai[row+1];
331: for (k=start; k<end; k++) { /* Amat */
332: col = aj[k] + cstart;
333: indices_tmp[indvc_ij++] = col;
334: }
335: start = bi[row];
336: end = bi[row+1];
337: for (k=start; k<end; k++) { /* Bmat */
338: col = gcols[bj[k]];
339: indices_tmp[indvc_ij++] = col;
340: }
341: }
342: PetscSortRemoveDupsInt(&indvc_ij,indices_tmp);
343: sbsizes[2*i] += indvc_ij;
344: PetscArraycpy(sbdata+totalrows,indices_tmp,indvc_ij);
345: totalrows += indvc_ij;
346: }
347: }
348: PetscMalloc1(nfrom+1,&offsets);
349: offsets[0] = 0;
350: for (i=0; i<nfrom; i++) {
351: offsets[i+1] = offsets[i] + sbsizes[2*i];
352: sbsizes[2*i+1] = offsets[i];
353: }
354: PetscFree(offsets);
355: if (sbrowsizes) *sbrowsizes = sbsizes;
356: if (sbrows) *sbrows = sbdata;
357: PetscFree5(rows_data_ptr,rows_data,rows_pos_i,indv_counts,indices_tmp);
358: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
359: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
360: return(0);
361: }
363: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local_Scalable(Mat mat,PetscInt nidx, IS is[])
364: {
365: const PetscInt *gcols,*ai,*aj,*bi,*bj, *indices;
366: PetscInt tnz,an,bn,i,j,row,start,end,rstart,cstart,col,k,*indices_temp;
367: PetscInt lsize,lsize_tmp;
368: PetscMPIInt rank,owner;
369: Mat amat,bmat;
370: PetscBool done;
371: PetscLayout cmap,rmap;
372: MPI_Comm comm;
373: PetscErrorCode ierr;
376: PetscObjectGetComm((PetscObject)mat,&comm);
377: MPI_Comm_rank(comm,&rank);
378: MatMPIAIJGetSeqAIJ(mat,&amat,&bmat,&gcols);
379: MatGetRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
380:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
381: MatGetRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
382:   if (!done) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"cannot get row IJ");
383:   /* is this a safe way to compute the number of nonzero values? */
384: tnz = ai[an]+bi[bn];
385: MatGetLayouts(mat,&rmap,&cmap);
386: PetscLayoutGetRange(rmap,&rstart,NULL);
387: PetscLayoutGetRange(cmap,&cstart,NULL);
388:   /* this is a better memory estimate than the old implementation,
389:    * which used the global size of the matrix
390:    */
391: PetscMalloc1(tnz,&indices_temp);
392: for (i=0; i<nidx; i++) {
393: MPI_Comm iscomm;
395: ISGetLocalSize(is[i],&lsize);
396: ISGetIndices(is[i],&indices);
397: lsize_tmp = 0;
398: for (j=0; j<lsize; j++) {
399: owner = -1;
400: row = indices[j];
401: PetscLayoutFindOwner(rmap,row,&owner);
402: if (owner != rank) continue;
403: /* local number */
404: row -= rstart;
405: start = ai[row];
406: end = ai[row+1];
407: for (k=start; k<end; k++) { /* Amat */
408: col = aj[k] + cstart;
409: indices_temp[lsize_tmp++] = col;
410: }
411: start = bi[row];
412: end = bi[row+1];
413: for (k=start; k<end; k++) { /* Bmat */
414: col = gcols[bj[k]];
415: indices_temp[lsize_tmp++] = col;
416: }
417: }
418: ISRestoreIndices(is[i],&indices);
419: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomm,NULL);
420: ISDestroy(&is[i]);
421: PetscSortRemoveDupsInt(&lsize_tmp,indices_temp);
422: ISCreateGeneral(iscomm,lsize_tmp,indices_temp,PETSC_COPY_VALUES,&is[i]);
423: PetscCommDestroy(&iscomm);
424: }
425: PetscFree(indices_temp);
426: MatRestoreRowIJ(amat,0,PETSC_FALSE,PETSC_FALSE,&an,&ai,&aj,&done);
427: MatRestoreRowIJ(bmat,0,PETSC_FALSE,PETSC_FALSE,&bn,&bi,&bj,&done);
428: return(0);
429: }
431: /*
432: Sample message format:
433: If a processor A wants processor B to process some elements corresponding
434: to index sets is[1],is[5]
435: mesg [0] = 2 (no of index sets in the mesg)
436: -----------
437: mesg [1] = 1 => is[1]
438: mesg [2] = sizeof(is[1]);
439: -----------
440: mesg [3] = 5 => is[5]
441: mesg [4] = sizeof(is[5]);
442: -----------
443:     mesg [5]
444:     mesg [n]   data(is[1])
445:     -----------
446:     mesg [n+1]
447:     mesg [m]   data(is[5])
448: -----------
450: Notes:
451: nrqs - no of requests sent (or to be sent out)
452: nrqr - no of requests received (which have to be or which have been processed)
453: */
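/*
   Concrete instance of the format above (illustrative numbers): if is[1] holds the two rows
   {10,12} owned by B and is[5] holds the single row {3}, then A sends B the message

       { 2,  1, 2,  5, 1,  10, 12,  3 }

   i.e. the count of index sets, the <id,size> header pairs, and then the row data for each
   index set in order.
*/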
454: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Once(Mat C,PetscInt imax,IS is[])
455: {
456: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
457: PetscMPIInt *w1,*w2,nrqr,*w3,*w4,*onodes1,*olengths1,*onodes2,*olengths2;
458: const PetscInt **idx,*idx_i;
459: PetscInt *n,**data,len;
460: #if defined(PETSC_USE_CTABLE)
461: PetscTable *table_data,table_data_i;
462: PetscInt *tdata,tcount,tcount_max;
463: #else
464: PetscInt *data_i,*d_p;
465: #endif
467: PetscMPIInt size,rank,tag1,tag2,proc = 0;
468: PetscInt M,i,j,k,**rbuf,row,nrqs,msz,**outdat,**ptr;
469: PetscInt *ctr,*pa,*tmp,*isz,*isz1,**xdata,**rbuf2;
470: PetscBT *table;
471: MPI_Comm comm;
472: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2;
473: MPI_Status *recv_status;
474: MPI_Comm *iscomms;
475: char *t_p;
478: PetscObjectGetComm((PetscObject)C,&comm);
479: size = c->size;
480: rank = c->rank;
481: M = C->rmap->N;
483: PetscObjectGetNewTag((PetscObject)C,&tag1);
484: PetscObjectGetNewTag((PetscObject)C,&tag2);
486: PetscMalloc2(imax,(PetscInt***)&idx,imax,&n);
488: for (i=0; i<imax; i++) {
489: ISGetIndices(is[i],&idx[i]);
490: ISGetLocalSize(is[i],&n[i]);
491: }
493:   /* evaluate communication - message to whom, length of message, and buffer space
494:      required. Based on this, buffers are allocated, and data copied into them */
495: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4);
496: for (i=0; i<imax; i++) {
497: PetscArrayzero(w4,size); /* initialise work vector*/
498: idx_i = idx[i];
499: len = n[i];
500: for (j=0; j<len; j++) {
501: row = idx_i[j];
502: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Index set cannot have negative entries");
503: PetscLayoutFindOwner(C->rmap,row,&proc);
504: w4[proc]++;
505: }
506: for (j=0; j<size; j++) {
507: if (w4[j]) { w1[j] += w4[j]; w3[j]++;}
508: }
509: }
511: nrqs = 0; /* no of outgoing messages */
512:   msz      = 0;              /* total mesg length (for all procs) */
513:   w1[rank] = 0;              /* no mesg sent to itself */
514: w3[rank] = 0;
515: for (i=0; i<size; i++) {
516: if (w1[i]) {w2[i] = 1; nrqs++;} /* there exists a message to proc i */
517: }
518: /* pa - is list of processors to communicate with */
519: PetscMalloc1(nrqs,&pa);
520: for (i=0,j=0; i<size; i++) {
521: if (w1[i]) {pa[j] = i; j++;}
522: }
524: /* Each message would have a header = 1 + 2*(no of IS) + data */
525: for (i=0; i<nrqs; i++) {
526: j = pa[i];
527: w1[j] += w2[j] + 2*w3[j];
528: msz += w1[j];
529: }
531:   /* Determine the number of messages to expect, their lengths, and the from-ids */
532: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
533: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
535: /* Now post the Irecvs corresponding to these messages */
536: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf,&r_waits1);
538: /* Allocate Memory for outgoing messages */
539: PetscMalloc4(size,&outdat,size,&ptr,msz,&tmp,size,&ctr);
540: PetscArrayzero(outdat,size);
541: PetscArrayzero(ptr,size);
543: {
544: PetscInt *iptr = tmp,ict = 0;
545: for (i=0; i<nrqs; i++) {
546: j = pa[i];
547: iptr += ict;
548: outdat[j] = iptr;
549: ict = w1[j];
550: }
551: }
553: /* Form the outgoing messages */
554: /* plug in the headers */
555: for (i=0; i<nrqs; i++) {
556: j = pa[i];
557: outdat[j][0] = 0;
558: PetscArrayzero(outdat[j]+1,2*w3[j]);
559: ptr[j] = outdat[j] + 2*w3[j] + 1;
560: }
562: /* Memory for doing local proc's work */
563: {
564: PetscInt M_BPB_imax = 0;
565: #if defined(PETSC_USE_CTABLE)
566: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
567: PetscMalloc1(imax,&table_data);
568: for (i=0; i<imax; i++) {
569: PetscTableCreate(n[i],M,&table_data[i]);
570: }
571: PetscCalloc4(imax,&table, imax,&data, imax,&isz, M_BPB_imax,&t_p);
572: for (i=0; i<imax; i++) {
573: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
574: }
575: #else
576: PetscInt Mimax = 0;
577: PetscIntMultError(M,imax, &Mimax);
578: PetscIntMultError((M/PETSC_BITS_PER_BYTE+1),imax, &M_BPB_imax);
579: PetscCalloc5(imax,&table, imax,&data, imax,&isz, Mimax,&d_p, M_BPB_imax,&t_p);
580: for (i=0; i<imax; i++) {
581: table[i] = t_p + (M/PETSC_BITS_PER_BYTE+1)*i;
582: data[i] = d_p + M*i;
583: }
584: #endif
585: }
587: /* Parse the IS and update local tables and the outgoing buf with the data */
588: {
589: PetscInt n_i,isz_i,*outdat_j,ctr_j;
590: PetscBT table_i;
592: for (i=0; i<imax; i++) {
593: PetscArrayzero(ctr,size);
594: n_i = n[i];
595: table_i = table[i];
596: idx_i = idx[i];
597: #if defined(PETSC_USE_CTABLE)
598: table_data_i = table_data[i];
599: #else
600: data_i = data[i];
601: #endif
602: isz_i = isz[i];
603: for (j=0; j<n_i; j++) { /* parse the indices of each IS */
604: row = idx_i[j];
605: PetscLayoutFindOwner(C->rmap,row,&proc);
606: if (proc != rank) { /* copy to the outgoing buffer */
607: ctr[proc]++;
608: *ptr[proc] = row;
609: ptr[proc]++;
610: } else if (!PetscBTLookupSet(table_i,row)) {
611: #if defined(PETSC_USE_CTABLE)
612: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
613: #else
614: data_i[isz_i] = row; /* Update the local table */
615: #endif
616: isz_i++;
617: }
618: }
619: /* Update the headers for the current IS */
620: for (j=0; j<size; j++) { /* Can Optimise this loop by using pa[] */
621: if ((ctr_j = ctr[j])) {
622: outdat_j = outdat[j];
623: k = ++outdat_j[0];
624: outdat_j[2*k] = ctr_j;
625: outdat_j[2*k-1] = i;
626: }
627: }
628: isz[i] = isz_i;
629: }
630: }
632: /* Now post the sends */
633: PetscMalloc1(nrqs,&s_waits1);
634: for (i=0; i<nrqs; ++i) {
635: j = pa[i];
636: MPI_Isend(outdat[j],w1[j],MPIU_INT,j,tag1,comm,s_waits1+i);
637: }
639: /* No longer need the original indices */
640: PetscMalloc1(imax,&iscomms);
641: for (i=0; i<imax; ++i) {
642: ISRestoreIndices(is[i],idx+i);
643: PetscCommDuplicate(PetscObjectComm((PetscObject)is[i]),&iscomms[i],NULL);
644: }
645: PetscFree2(*(PetscInt***)&idx,n);
647: for (i=0; i<imax; ++i) {
648: ISDestroy(&is[i]);
649: }
651: /* Do Local work */
652: #if defined(PETSC_USE_CTABLE)
653: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,NULL,table_data);
654: #else
655: MatIncreaseOverlap_MPIAIJ_Local(C,imax,table,isz,data,NULL);
656: #endif
658: /* Receive messages */
659: PetscMalloc1(nrqr,&recv_status);
660: MPI_Waitall(nrqr,r_waits1,recv_status);
661: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
663: /* Phase 1 sends are complete - deallocate buffers */
664: PetscFree4(outdat,ptr,tmp,ctr);
665: PetscFree4(w1,w2,w3,w4);
667: PetscMalloc1(nrqr,&xdata);
668: PetscMalloc1(nrqr,&isz1);
669: MatIncreaseOverlap_MPIAIJ_Receive(C,nrqr,rbuf,xdata,isz1);
670: PetscFree(rbuf[0]);
671: PetscFree(rbuf);
673: /* Send the data back */
674:   /* Do a global reduction to know the buffer space required for incoming messages */
675: {
676: PetscMPIInt *rw1;
678: PetscCalloc1(size,&rw1);
680: for (i=0; i<nrqr; ++i) {
681: proc = recv_status[i].MPI_SOURCE;
683: if (proc != onodes1[i]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPI_SOURCE mismatch");
684: rw1[proc] = isz1[i];
685: }
686: PetscFree(onodes1);
687: PetscFree(olengths1);
689:     /* Determine the number of messages to expect, their lengths, and the from-ids */
690: PetscGatherMessageLengths(comm,nrqr,nrqs,rw1,&onodes2,&olengths2);
691: PetscFree(rw1);
692: }
693: /* Now post the Irecvs corresponding to these messages */
694: PetscPostIrecvInt(comm,tag2,nrqs,onodes2,olengths2,&rbuf2,&r_waits2);
696: /* Now post the sends */
697: PetscMalloc1(nrqr,&s_waits2);
698: for (i=0; i<nrqr; ++i) {
699: j = recv_status[i].MPI_SOURCE;
700: MPI_Isend(xdata[i],isz1[i],MPIU_INT,j,tag2,comm,s_waits2+i);
701: }
703: /* receive work done on other processors */
704: {
705: PetscInt is_no,ct1,max,*rbuf2_i,isz_i,jmax;
706: PetscMPIInt idex;
707: PetscBT table_i;
709: for (i=0; i<nrqs; ++i) {
710: MPI_Waitany(nrqs,r_waits2,&idex,MPI_STATUS_IGNORE);
711: /* Process the message */
712: rbuf2_i = rbuf2[idex];
713: ct1 = 2*rbuf2_i[0]+1;
714: jmax = rbuf2[idex][0];
715: for (j=1; j<=jmax; j++) {
716: max = rbuf2_i[2*j];
717: is_no = rbuf2_i[2*j-1];
718: isz_i = isz[is_no];
719: table_i = table[is_no];
720: #if defined(PETSC_USE_CTABLE)
721: table_data_i = table_data[is_no];
722: #else
723: data_i = data[is_no];
724: #endif
725: for (k=0; k<max; k++,ct1++) {
726: row = rbuf2_i[ct1];
727: if (!PetscBTLookupSet(table_i,row)) {
728: #if defined(PETSC_USE_CTABLE)
729: PetscTableAdd(table_data_i,row+1,isz_i+1,INSERT_VALUES);
730: #else
731: data_i[isz_i] = row;
732: #endif
733: isz_i++;
734: }
735: }
736: isz[is_no] = isz_i;
737: }
738: }
740: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
741: }
743: #if defined(PETSC_USE_CTABLE)
744: tcount_max = 0;
745: for (i=0; i<imax; ++i) {
746: table_data_i = table_data[i];
747: PetscTableGetCount(table_data_i,&tcount);
748: if (tcount_max < tcount) tcount_max = tcount;
749: }
750: PetscMalloc1(tcount_max+1,&tdata);
751: #endif
753: for (i=0; i<imax; ++i) {
754: #if defined(PETSC_USE_CTABLE)
755: PetscTablePosition tpos;
756: table_data_i = table_data[i];
758: PetscTableGetHeadPosition(table_data_i,&tpos);
759: while (tpos) {
760: PetscTableGetNext(table_data_i,&tpos,&k,&j);
761: tdata[--j] = --k;
762: }
763: ISCreateGeneral(iscomms[i],isz[i],tdata,PETSC_COPY_VALUES,is+i);
764: #else
765: ISCreateGeneral(iscomms[i],isz[i],data[i],PETSC_COPY_VALUES,is+i);
766: #endif
767: PetscCommDestroy(&iscomms[i]);
768: }
770: PetscFree(iscomms);
771: PetscFree(onodes2);
772: PetscFree(olengths2);
774: PetscFree(pa);
775: PetscFree(rbuf2[0]);
776: PetscFree(rbuf2);
777: PetscFree(s_waits1);
778: PetscFree(r_waits1);
779: PetscFree(s_waits2);
780: PetscFree(r_waits2);
781: PetscFree(recv_status);
782: if (xdata) {
783: PetscFree(xdata[0]);
784: PetscFree(xdata);
785: }
786: PetscFree(isz1);
787: #if defined(PETSC_USE_CTABLE)
788: for (i=0; i<imax; i++) {
789: PetscTableDestroy((PetscTable*)&table_data[i]);
790: }
791: PetscFree(table_data);
792: PetscFree(tdata);
793: PetscFree4(table,data,isz,t_p);
794: #else
795: PetscFree5(table,data,isz,d_p,t_p);
796: #endif
797: return(0);
798: }
800: /*
801:    MatIncreaseOverlap_MPIAIJ_Local - Called by MatIncreaseOverlap_MPIAIJ_Once() to do
802: the work on the local processor.
804: Inputs:
805: C - MAT_MPIAIJ;
806: imax - total no of index sets processed at a time;
807: table - an array of char - size = m bits.
809: Output:
810: isz - array containing the count of the solution elements corresponding
811: to each index set;
812: data or table_data - pointer to the solutions
813: */
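/*
   A minimal sketch of the PetscBT pattern this routine relies on (standalone, illustrative
   only, error checking omitted): PetscBTLookupSet() returns the previous state of the bit,
   so a zero return value means "first time this index is seen" and the index is appended;
   this is how duplicates are filtered without sorting.

      #include <petscbt.h>

      PetscBT  seen;
      PetscInt vals[] = {3,7,3,1,7},out[5],n = 0,i;

      PetscBTCreate(10,&seen);                       // bit table over [0,10)
      for (i=0; i<5; i++) {
        if (!PetscBTLookupSet(seen,vals[i])) out[n++] = vals[i];
      }
      // out = {3,7,1}, n = 3
      PetscBTDestroy(&seen);
*/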
814: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Local(Mat C,PetscInt imax,PetscBT *table,PetscInt *isz,PetscInt **data,PetscTable *table_data)
815: {
816: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
817: Mat A = c->A,B = c->B;
818: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
819: PetscInt start,end,val,max,rstart,cstart,*ai,*aj;
820: PetscInt *bi,*bj,*garray,i,j,k,row,isz_i;
821: PetscBT table_i;
822: #if defined(PETSC_USE_CTABLE)
823: PetscTable table_data_i;
824: PetscErrorCode ierr;
825: PetscTablePosition tpos;
826: PetscInt tcount,*tdata;
827: #else
828: PetscInt *data_i;
829: #endif
832: rstart = C->rmap->rstart;
833: cstart = C->cmap->rstart;
834: ai = a->i;
835: aj = a->j;
836: bi = b->i;
837: bj = b->j;
838: garray = c->garray;
840: for (i=0; i<imax; i++) {
841: #if defined(PETSC_USE_CTABLE)
842: /* copy existing entries of table_data_i into tdata[] */
843: table_data_i = table_data[i];
844: PetscTableGetCount(table_data_i,&tcount);
845: if (tcount != isz[i]) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_PLIB," tcount %d != isz[%d] %d",tcount,i,isz[i]);
847: PetscMalloc1(tcount,&tdata);
848: PetscTableGetHeadPosition(table_data_i,&tpos);
849: while (tpos) {
850: PetscTableGetNext(table_data_i,&tpos,&row,&j);
851: tdata[--j] = --row;
852: if (j > tcount - 1) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB," j %d >= tcount %d",j,tcount);
853: }
854: #else
855: data_i = data[i];
856: #endif
857: table_i = table[i];
858: isz_i = isz[i];
859: max = isz[i];
861: for (j=0; j<max; j++) {
862: #if defined(PETSC_USE_CTABLE)
863: row = tdata[j] - rstart;
864: #else
865: row = data_i[j] - rstart;
866: #endif
867: start = ai[row];
868: end = ai[row+1];
869: for (k=start; k<end; k++) { /* Amat */
870: val = aj[k] + cstart;
871: if (!PetscBTLookupSet(table_i,val)) {
872: #if defined(PETSC_USE_CTABLE)
873: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
874: #else
875: data_i[isz_i] = val;
876: #endif
877: isz_i++;
878: }
879: }
880: start = bi[row];
881: end = bi[row+1];
882: for (k=start; k<end; k++) { /* Bmat */
883: val = garray[bj[k]];
884: if (!PetscBTLookupSet(table_i,val)) {
885: #if defined(PETSC_USE_CTABLE)
886: PetscTableAdd(table_data_i,val+1,isz_i+1,INSERT_VALUES);
887: #else
888: data_i[isz_i] = val;
889: #endif
890: isz_i++;
891: }
892: }
893: }
894: isz[i] = isz_i;
896: #if defined(PETSC_USE_CTABLE)
897: PetscFree(tdata);
898: #endif
899: }
900: return(0);
901: }
903: /*
904: MatIncreaseOverlap_MPIAIJ_Receive - Process the received messages,
905: and return the output
907: Input:
908: C - the matrix
909: nrqr - no of messages being processed.
910: rbuf - an array of pointers to the received requests
912: Output:
913: xdata - array of messages to be sent back
914: isz1 - size of each message
916:  For better efficiency perhaps we should malloc each xdata[i] separately;
917:  then if a realloc is required we need only copy the data for that one row
918:  rather than all previous rows, as it is now where a single large chunk of
919:  memory is used.
921: */
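/*
   Reply message layout produced below (it mirrors the request format):
     xdata[i] = { #IS, <IS id, count> header pairs, then the expanded rows for each IS }
   and isz1[i] is the total length of xdata[i].
*/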
922: static PetscErrorCode MatIncreaseOverlap_MPIAIJ_Receive(Mat C,PetscInt nrqr,PetscInt **rbuf,PetscInt **xdata,PetscInt * isz1)
923: {
924: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
925: Mat A = c->A,B = c->B;
926: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data;
928: PetscInt rstart,cstart,*ai,*aj,*bi,*bj,*garray,i,j,k;
929: PetscInt row,total_sz,ct,ct1,ct2,ct3,mem_estimate,oct2,l,start,end;
930: PetscInt val,max1,max2,m,no_malloc =0,*tmp,new_estimate,ctr;
931: PetscInt *rbuf_i,kmax,rbuf_0;
932: PetscBT xtable;
935: m = C->rmap->N;
936: rstart = C->rmap->rstart;
937: cstart = C->cmap->rstart;
938: ai = a->i;
939: aj = a->j;
940: bi = b->i;
941: bj = b->j;
942: garray = c->garray;
944: for (i=0,ct=0,total_sz=0; i<nrqr; ++i) {
945: rbuf_i = rbuf[i];
946: rbuf_0 = rbuf_i[0];
947: ct += rbuf_0;
948: for (j=1; j<=rbuf_0; j++) total_sz += rbuf_i[2*j];
949: }
951: if (C->rmap->n) max1 = ct*(a->nz + b->nz)/C->rmap->n;
952: else max1 = 1;
953: mem_estimate = 3*((total_sz > max1 ? total_sz : max1)+1);
954: if (nrqr) {
955: PetscMalloc1(mem_estimate,&xdata[0]);
956: ++no_malloc;
957: }
958: PetscBTCreate(m,&xtable);
959: PetscArrayzero(isz1,nrqr);
961: ct3 = 0;
962:   for (i=0; i<nrqr; i++) { /* for each mesg from proc i */
963: rbuf_i = rbuf[i];
964: rbuf_0 = rbuf_i[0];
965: ct1 = 2*rbuf_0+1;
966: ct2 = ct1;
967: ct3 += ct1;
968: for (j=1; j<=rbuf_0; j++) { /* for each IS from proc i*/
969: PetscBTMemzero(m,xtable);
970: oct2 = ct2;
971: kmax = rbuf_i[2*j];
972: for (k=0; k<kmax; k++,ct1++) {
973: row = rbuf_i[ct1];
974: if (!PetscBTLookupSet(xtable,row)) {
975: if (!(ct3 < mem_estimate)) {
976: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
977: PetscMalloc1(new_estimate,&tmp);
978: PetscArraycpy(tmp,xdata[0],mem_estimate);
979: PetscFree(xdata[0]);
980: xdata[0] = tmp;
981: mem_estimate = new_estimate; ++no_malloc;
982: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
983: }
984: xdata[i][ct2++] = row;
985: ct3++;
986: }
987: }
988: for (k=oct2,max2=ct2; k<max2; k++) {
989: row = xdata[i][k] - rstart;
990: start = ai[row];
991: end = ai[row+1];
992: for (l=start; l<end; l++) {
993: val = aj[l] + cstart;
994: if (!PetscBTLookupSet(xtable,val)) {
995: if (!(ct3 < mem_estimate)) {
996: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
997: PetscMalloc1(new_estimate,&tmp);
998: PetscArraycpy(tmp,xdata[0],mem_estimate);
999: PetscFree(xdata[0]);
1000: xdata[0] = tmp;
1001: mem_estimate = new_estimate; ++no_malloc;
1002: for (ctr=1; ctr<=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1003: }
1004: xdata[i][ct2++] = val;
1005: ct3++;
1006: }
1007: }
1008: start = bi[row];
1009: end = bi[row+1];
1010: for (l=start; l<end; l++) {
1011: val = garray[bj[l]];
1012: if (!PetscBTLookupSet(xtable,val)) {
1013: if (!(ct3 < mem_estimate)) {
1014: new_estimate = (PetscInt)(1.5*mem_estimate)+1;
1015: PetscMalloc1(new_estimate,&tmp);
1016: PetscArraycpy(tmp,xdata[0],mem_estimate);
1017: PetscFree(xdata[0]);
1018: xdata[0] = tmp;
1019: mem_estimate = new_estimate; ++no_malloc;
1020: for (ctr =1; ctr <=i; ctr++) xdata[ctr] = xdata[ctr-1] + isz1[ctr-1];
1021: }
1022: xdata[i][ct2++] = val;
1023: ct3++;
1024: }
1025: }
1026: }
1027: /* Update the header*/
1028: xdata[i][2*j] = ct2 - oct2; /* Undo the vector isz1 and use only a var*/
1029: xdata[i][2*j-1] = rbuf_i[2*j-1];
1030: }
1031: xdata[i][0] = rbuf_0;
1032: if (i+1<nrqr) xdata[i+1] = xdata[i] + ct2;
1033: isz1[i] = ct2; /* size of each message */
1034: }
1035: PetscBTDestroy(&xtable);
1036: PetscInfo3(C,"Allocated %D bytes, required %D bytes, no of mallocs = %D\n",mem_estimate,ct3,no_malloc);
1037: return(0);
1038: }
1039: /* -------------------------------------------------------------------------*/
1040: extern PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*);
1041: /*
1042: Every processor gets the entire matrix
1043: */
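/*
   Outline: on MAT_INITIAL_MATRIX every process allgathers the row lengths, preallocates a
   sequential copy of the whole matrix, and allgathers the column indices; when
   flag == MAT_GET_VALUES the numerical values are allgathered as well, otherwise only the
   nonzero pattern is built and the values are left unset.
*/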
1044: PetscErrorCode MatCreateSubMatrix_MPIAIJ_All(Mat A,MatCreateSubMatrixOption flag,MatReuse scall,Mat *Bin[])
1045: {
1046: Mat B;
1047: Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
1048: Mat_SeqAIJ *b,*ad = (Mat_SeqAIJ*)a->A->data,*bd = (Mat_SeqAIJ*)a->B->data;
1050: PetscMPIInt size,rank,*recvcounts = NULL,*displs = NULL;
1051: PetscInt sendcount,i,*rstarts = A->rmap->range,n,cnt,j;
1052: PetscInt m,*b_sendj,*garray = a->garray,*lens,*jsendbuf,*a_jsendbuf,*b_jsendbuf;
1055: MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);
1056: MPI_Comm_rank(PetscObjectComm((PetscObject)A),&rank);
1057: if (scall == MAT_INITIAL_MATRIX) {
1058: /* ----------------------------------------------------------------
1059: Tell every processor the number of nonzeros per row
1060: */
1061: PetscMalloc1(A->rmap->N,&lens);
1062: for (i=A->rmap->rstart; i<A->rmap->rend; i++) {
1063: lens[i] = ad->i[i-A->rmap->rstart+1] - ad->i[i-A->rmap->rstart] + bd->i[i-A->rmap->rstart+1] - bd->i[i-A->rmap->rstart];
1064: }
1065: PetscMalloc2(size,&recvcounts,size,&displs);
1066: for (i=0; i<size; i++) {
1067: recvcounts[i] = A->rmap->range[i+1] - A->rmap->range[i];
1068: displs[i] = A->rmap->range[i];
1069: }
1070: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1071: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1072: #else
1073: sendcount = A->rmap->rend - A->rmap->rstart;
1074: MPI_Allgatherv(lens+A->rmap->rstart,sendcount,MPIU_INT,lens,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1075: #endif
1076: /* ---------------------------------------------------------------
1077: Create the sequential matrix of the same type as the local block diagonal
1078: */
1079: MatCreate(PETSC_COMM_SELF,&B);
1080: MatSetSizes(B,A->rmap->N,A->cmap->N,PETSC_DETERMINE,PETSC_DETERMINE);
1081: MatSetBlockSizesFromMats(B,A,A);
1082: MatSetType(B,((PetscObject)a->A)->type_name);
1083: MatSeqAIJSetPreallocation(B,0,lens);
1084: PetscCalloc1(2,Bin);
1085: **Bin = B;
1086: b = (Mat_SeqAIJ*)B->data;
1088: /*--------------------------------------------------------------------
1089: Copy my part of matrix column indices over
1090: */
1091: sendcount = ad->nz + bd->nz;
1092: jsendbuf = b->j + b->i[rstarts[rank]];
1093: a_jsendbuf = ad->j;
1094: b_jsendbuf = bd->j;
1095: n = A->rmap->rend - A->rmap->rstart;
1096: cnt = 0;
1097: for (i=0; i<n; i++) {
1098: /* put in lower diagonal portion */
1099: m = bd->i[i+1] - bd->i[i];
1100: while (m > 0) {
1101: /* is it above diagonal (in bd (compressed) numbering) */
1102: if (garray[*b_jsendbuf] > A->rmap->rstart + i) break;
1103: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1104: m--;
1105: }
1107: /* put in diagonal portion */
1108: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1109: jsendbuf[cnt++] = A->rmap->rstart + *a_jsendbuf++;
1110: }
1112: /* put in upper diagonal portion */
1113: while (m-- > 0) {
1114: jsendbuf[cnt++] = garray[*b_jsendbuf++];
1115: }
1116: }
1117: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1119: /*--------------------------------------------------------------------
1120: Gather all column indices to all processors
1121: */
1122: for (i=0; i<size; i++) {
1123: recvcounts[i] = 0;
1124: for (j=A->rmap->range[i]; j<A->rmap->range[i+1]; j++) {
1125: recvcounts[i] += lens[j];
1126: }
1127: }
1128: displs[0] = 0;
1129: for (i=1; i<size; i++) {
1130: displs[i] = displs[i-1] + recvcounts[i-1];
1131: }
1132: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1133: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1134: #else
1135: MPI_Allgatherv(jsendbuf,sendcount,MPIU_INT,b->j,recvcounts,displs,MPIU_INT,PetscObjectComm((PetscObject)A));
1136: #endif
1137: /*--------------------------------------------------------------------
1138:        Assemble the matrix into usable form (note numerical values not yet set)
1139: */
1140: /* set the b->ilen (length of each row) values */
1141: PetscArraycpy(b->ilen,lens,A->rmap->N);
1142: /* set the b->i indices */
1143: b->i[0] = 0;
1144: for (i=1; i<=A->rmap->N; i++) {
1145: b->i[i] = b->i[i-1] + lens[i-1];
1146: }
1147: PetscFree(lens);
1148: MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);
1149: MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);
1151: } else {
1152: B = **Bin;
1153: b = (Mat_SeqAIJ*)B->data;
1154: }
1156: /*--------------------------------------------------------------------
1157: Copy my part of matrix numerical values into the values location
1158: */
1159: if (flag == MAT_GET_VALUES) {
1160: const PetscScalar *ada,*bda,*a_sendbuf,*b_sendbuf;
1161: MatScalar *sendbuf,*recvbuf;
1163: MatSeqAIJGetArrayRead(a->A,&ada);
1164: MatSeqAIJGetArrayRead(a->B,&bda);
1165: sendcount = ad->nz + bd->nz;
1166: sendbuf = b->a + b->i[rstarts[rank]];
1167: a_sendbuf = ada;
1168: b_sendbuf = bda;
1169: b_sendj = bd->j;
1170: n = A->rmap->rend - A->rmap->rstart;
1171: cnt = 0;
1172: for (i=0; i<n; i++) {
1173: /* put in lower diagonal portion */
1174: m = bd->i[i+1] - bd->i[i];
1175: while (m > 0) {
1176: /* is it above diagonal (in bd (compressed) numbering) */
1177: if (garray[*b_sendj] > A->rmap->rstart + i) break;
1178: sendbuf[cnt++] = *b_sendbuf++;
1179: m--;
1180: b_sendj++;
1181: }
1183: /* put in diagonal portion */
1184: for (j=ad->i[i]; j<ad->i[i+1]; j++) {
1185: sendbuf[cnt++] = *a_sendbuf++;
1186: }
1188: /* put in upper diagonal portion */
1189: while (m-- > 0) {
1190: sendbuf[cnt++] = *b_sendbuf++;
1191: b_sendj++;
1192: }
1193: }
1194: if (cnt != sendcount) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Corrupted PETSc matrix: nz given %D actual nz %D",sendcount,cnt);
1196: /* -----------------------------------------------------------------
1197: Gather all numerical values to all processors
1198: */
1199: if (!recvcounts) {
1200: PetscMalloc2(size,&recvcounts,size,&displs);
1201: }
1202: for (i=0; i<size; i++) {
1203: recvcounts[i] = b->i[rstarts[i+1]] - b->i[rstarts[i]];
1204: }
1205: displs[0] = 0;
1206: for (i=1; i<size; i++) {
1207: displs[i] = displs[i-1] + recvcounts[i-1];
1208: }
1209: recvbuf = b->a;
1210: #if defined(PETSC_HAVE_MPI_IN_PLACE)
1211: MPI_Allgatherv(MPI_IN_PLACE,0,MPI_DATATYPE_NULL,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1212: #else
1213: MPI_Allgatherv(sendbuf,sendcount,MPIU_SCALAR,recvbuf,recvcounts,displs,MPIU_SCALAR,PetscObjectComm((PetscObject)A));
1214: #endif
1215: MatSeqAIJRestoreArrayRead(a->A,&ada);
1216: MatSeqAIJRestoreArrayRead(a->B,&bda);
1217:   } /* end of (flag == MAT_GET_VALUES) */
1218: PetscFree2(recvcounts,displs);
1220: if (A->symmetric) {
1221: MatSetOption(B,MAT_SYMMETRIC,PETSC_TRUE);
1222: } else if (A->hermitian) {
1223: MatSetOption(B,MAT_HERMITIAN,PETSC_TRUE);
1224: } else if (A->structurally_symmetric) {
1225: MatSetOption(B,MAT_STRUCTURALLY_SYMMETRIC,PETSC_TRUE);
1226: }
1227: return(0);
1228: }
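/*
   MatCreateSubMatrices_MPIAIJ_SingleIS_Local below handles the common case of a single
   row/column index set per process (ismax == 1).  It exchanges the requested row indices,
   then the corresponding column counts, column indices, and values in separate rounds of
   nonblocking messages, and assembles the result into a sequential AIJ matrix whose
   communication metadata (Mat_SubSppt) is kept for MAT_REUSE_MATRIX calls.

   From user code this path is typically reached through MatCreateSubMatrices(); a rough
   sketch (illustrative only, error checking omitted):

      IS  row,col;    // previously created row/column index sets
      Mat *submat;

      MatCreateSubMatrices(C,1,&row,&col,MAT_INITIAL_MATRIX,&submat);
      // ... use submat[0] ...
      MatCreateSubMatrices(C,1,&row,&col,MAT_REUSE_MATRIX,&submat);
      MatDestroySubMatrices(1,&submat);
*/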
1230: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,PetscBool allcolumns,Mat *submats)
1231: {
1232: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
1233: Mat submat,A = c->A,B = c->B;
1234: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)B->data,*subc;
1235: PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,nzA,nzB;
1236: PetscInt cstart = C->cmap->rstart,cend = C->cmap->rend,rstart = C->rmap->rstart,*bmap = c->garray;
1237: const PetscInt *icol,*irow;
1238: PetscInt nrow,ncol,start;
1240: PetscMPIInt rank,size,tag1,tag2,tag3,tag4,*w1,*w2,nrqr;
1241: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,ct3,**rbuf1,row,proc;
1242: PetscInt nrqs=0,msz,**ptr,*req_size,*ctr,*pa,*tmp,tcol,*iptr;
1243: PetscInt **rbuf3,*req_source1,*req_source2,**sbuf_aj,**rbuf2,max1,nnz;
1244: PetscInt *lens,rmax,ncols,*cols,Crow;
1245: #if defined(PETSC_USE_CTABLE)
1246: PetscTable cmap,rmap;
1247: PetscInt *cmap_loc,*rmap_loc;
1248: #else
1249: PetscInt *cmap,*rmap;
1250: #endif
1251: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*sbuf1_i,*rbuf2_i,*rbuf3_i;
1252: PetscInt *cworkB,lwrite,*subcols,*row2proc;
1253: PetscScalar *vworkA,*vworkB,*a_a,*b_a,*subvals=NULL;
1254: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
1255: MPI_Request *r_waits4,*s_waits3 = NULL,*s_waits4;
1256: MPI_Status *r_status1,*r_status2,*s_status1,*s_status3 = NULL,*s_status2;
1257: MPI_Status *r_status3 = NULL,*r_status4,*s_status4;
1258: MPI_Comm comm;
1259: PetscScalar **rbuf4,**sbuf_aa,*vals,*sbuf_aa_i,*rbuf4_i;
1260: PetscMPIInt *onodes1,*olengths1,idex,end;
1261: Mat_SubSppt *smatis1;
1262: PetscBool isrowsorted,iscolsorted;
1267: if (ismax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"This routine only works when all processes have ismax=1");
1268: MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
1269: MatSeqAIJGetArrayRead(B,(const PetscScalar**)&b_a);
1270: PetscObjectGetComm((PetscObject)C,&comm);
1271: size = c->size;
1272: rank = c->rank;
1274: ISSorted(iscol[0],&iscolsorted);
1275: ISSorted(isrow[0],&isrowsorted);
1276: ISGetIndices(isrow[0],&irow);
1277: ISGetLocalSize(isrow[0],&nrow);
1278: if (allcolumns) {
1279: icol = NULL;
1280: ncol = C->cmap->N;
1281: } else {
1282: ISGetIndices(iscol[0],&icol);
1283: ISGetLocalSize(iscol[0],&ncol);
1284: }
1286: if (scall == MAT_INITIAL_MATRIX) {
1287: PetscInt *sbuf2_i,*cworkA,lwrite,ctmp;
1289: /* Get some new tags to keep the communication clean */
1290: tag1 = ((PetscObject)C)->tag;
1291: PetscObjectGetNewTag((PetscObject)C,&tag2);
1292: PetscObjectGetNewTag((PetscObject)C,&tag3);
1294:     /* evaluate communication - message to whom, length of message, and buffer space
1295:        required. Based on this, buffers are allocated, and data copied into them */
1296: PetscCalloc2(size,&w1,size,&w2);
1297: PetscMalloc1(nrow,&row2proc);
1299: /* w1[proc] = num of rows owned by proc -- to be requested */
1300: proc = 0;
1301: nrqs = 0; /* num of outgoing messages */
1302: for (j=0; j<nrow; j++) {
1303: row = irow[j];
1304: if (!isrowsorted) proc = 0;
1305: while (row >= C->rmap->range[proc+1]) proc++;
1306: w1[proc]++;
1307: row2proc[j] = proc; /* map row index to proc */
1309: if (proc != rank && !w2[proc]) {
1310: w2[proc] = 1; nrqs++;
1311: }
1312: }
1313: w1[rank] = 0; /* rows owned by self will not be requested */
1315: PetscMalloc1(nrqs,&pa); /*(proc -array)*/
1316: for (proc=0,j=0; proc<size; proc++) {
1317: if (w1[proc]) { pa[j++] = proc;}
1318: }
1320: /* Each message would have a header = 1 + 2*(num of IS) + data (here,num of IS = 1) */
1321: msz = 0; /* total mesg length (for all procs) */
1322: for (i=0; i<nrqs; i++) {
1323: proc = pa[i];
1324: w1[proc] += 3;
1325: msz += w1[proc];
1326: }
1327: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
1329:     /* Determine nrqr, the number of messages to expect, their lengths, and the from-ids */
1330: /* if w2[proc]=1, a message of length w1[proc] will be sent to proc; */
1331: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
1333: /* Input: nrqs: nsend; nrqr: nrecv; w1: msg length to be sent;
1334: Output: onodes1: recv node-ids; olengths1: corresponding recv message length */
1335: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
1337: /* Now post the Irecvs corresponding to these messages */
1338: PetscPostIrecvInt(comm,tag1,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
1340: PetscFree(onodes1);
1341: PetscFree(olengths1);
1343: /* Allocate Memory for outgoing messages */
1344: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
1345: PetscArrayzero(sbuf1,size);
1346: PetscArrayzero(ptr,size);
1348:     /* sbuf1[pa[0]] = tmp, sbuf1[pa[i]] = sbuf1[pa[i-1]] + w1[pa[i-1]] */
1349: iptr = tmp;
1350: for (i=0; i<nrqs; i++) {
1351: proc = pa[i];
1352: sbuf1[proc] = iptr;
1353: iptr += w1[proc];
1354: }
1356: /* Form the outgoing messages */
1357: /* Initialize the header space */
1358: for (i=0; i<nrqs; i++) {
1359: proc = pa[i];
1360: PetscArrayzero(sbuf1[proc],3);
1361: ptr[proc] = sbuf1[proc] + 3;
1362: }
1364: /* Parse the isrow and copy data into outbuf */
1365: PetscArrayzero(ctr,size);
1366: for (j=0; j<nrow; j++) { /* parse the indices of each IS */
1367: proc = row2proc[j];
1368: if (proc != rank) { /* copy to the outgoing buf*/
1369: *ptr[proc] = irow[j];
1370: ctr[proc]++; ptr[proc]++;
1371: }
1372: }
1374: /* Update the headers for the current IS */
1375: for (j=0; j<size; j++) { /* Can Optimise this loop too */
1376: if ((ctr_j = ctr[j])) {
1377: sbuf1_j = sbuf1[j];
1378: k = ++sbuf1_j[0];
1379: sbuf1_j[2*k] = ctr_j;
1380: sbuf1_j[2*k-1] = 0;
1381: }
1382: }
1384: /* Now post the sends */
1385: PetscMalloc1(nrqs,&s_waits1);
1386: for (i=0; i<nrqs; ++i) {
1387: proc = pa[i];
1388: MPI_Isend(sbuf1[proc],w1[proc],MPIU_INT,proc,tag1,comm,s_waits1+i);
1389: }
1391: /* Post Receives to capture the buffer size */
1392: PetscMalloc4(nrqs,&r_status2,nrqr,&s_waits2,nrqs,&r_waits2,nrqr,&s_status2);
1393: PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
1395: if (nrqs) rbuf2[0] = tmp + msz;
1396: for (i=1; i<nrqs; ++i) rbuf2[i] = rbuf2[i-1] + w1[pa[i-1]];
1398: for (i=0; i<nrqs; ++i) {
1399: proc = pa[i];
1400: MPI_Irecv(rbuf2[i],w1[proc],MPIU_INT,proc,tag2,comm,r_waits2+i);
1401: }
1403: PetscFree2(w1,w2);
1405: /* Send to other procs the buf size they should allocate */
1406: /* Receive messages*/
1407: PetscMalloc1(nrqr,&r_status1);
1408: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
1410: MPI_Waitall(nrqr,r_waits1,r_status1);
1411: for (i=0; i<nrqr; ++i) {
1412: req_size[i] = 0;
1413: rbuf1_i = rbuf1[i];
1414: start = 2*rbuf1_i[0] + 1;
1415: MPI_Get_count(r_status1+i,MPIU_INT,&end);
1416: PetscMalloc1(end,&sbuf2[i]);
1417: sbuf2_i = sbuf2[i];
1418: for (j=start; j<end; j++) {
1419: k = rbuf1_i[j] - rstart;
1420: ncols = ai[k+1] - ai[k] + bi[k+1] - bi[k];
1421: sbuf2_i[j] = ncols;
1422: req_size[i] += ncols;
1423: }
1424: req_source1[i] = r_status1[i].MPI_SOURCE;
1426: /* form the header */
1427: sbuf2_i[0] = req_size[i];
1428: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
1430: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
1431: }
1433: PetscFree(r_status1);
1434: PetscFree(r_waits1);
1436:     /* rbuf2 is received; post receives for the column indices a->j */
1437: MPI_Waitall(nrqs,r_waits2,r_status2);
1439: PetscMalloc4(nrqs,&r_waits3,nrqr,&s_waits3,nrqs,&r_status3,nrqr,&s_status3);
1440: for (i=0; i<nrqs; ++i) {
1441: PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
1442: req_source2[i] = r_status2[i].MPI_SOURCE;
1443: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
1444: }
1446: /* Wait on sends1 and sends2 */
1447: PetscMalloc1(nrqs,&s_status1);
1448: MPI_Waitall(nrqs,s_waits1,s_status1);
1449: PetscFree(s_waits1);
1450: PetscFree(s_status1);
1452: MPI_Waitall(nrqr,s_waits2,s_status2);
1453: PetscFree4(r_status2,s_waits2,r_waits2,s_status2);
1455: /* Now allocate sending buffers for a->j, and send them off */
1456: PetscMalloc1(nrqr,&sbuf_aj);
1457: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1458: if (nrqr) {PetscMalloc1(j,&sbuf_aj[0]);}
1459: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
1461: for (i=0; i<nrqr; i++) { /* for each requested message */
1462: rbuf1_i = rbuf1[i];
1463: sbuf_aj_i = sbuf_aj[i];
1464: ct1 = 2*rbuf1_i[0] + 1;
1465: ct2 = 0;
1466: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 %d != 1",max1); */
1468: kmax = rbuf1[i][2];
1469: for (k=0; k<kmax; k++,ct1++) { /* for each row */
1470: row = rbuf1_i[ct1] - rstart;
1471: nzA = ai[row+1] - ai[row];
1472: nzB = bi[row+1] - bi[row];
1473: ncols = nzA + nzB;
1474: cworkA = aj + ai[row]; cworkB = bj + bi[row];
1476: /* load the column indices for this row into cols*/
1477: cols = sbuf_aj_i + ct2;
1479: lwrite = 0;
1480: for (l=0; l<nzB; l++) {
1481: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
1482: }
1483: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
1484: for (l=0; l<nzB; l++) {
1485: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
1486: }
1488: ct2 += ncols;
1489: }
1490: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
1491: }
1493: /* create column map (cmap): global col of C -> local col of submat */
1494: #if defined(PETSC_USE_CTABLE)
1495: if (!allcolumns) {
1496: PetscTableCreate(ncol,C->cmap->N,&cmap);
1497: PetscCalloc1(C->cmap->n,&cmap_loc);
1498: for (j=0; j<ncol; j++) { /* use array cmap_loc[] for local col indices */
1499: if (icol[j] >= cstart && icol[j] <cend) {
1500: cmap_loc[icol[j] - cstart] = j+1;
1501: } else { /* use PetscTable for non-local col indices */
1502: PetscTableAdd(cmap,icol[j]+1,j+1,INSERT_VALUES);
1503: }
1504: }
1505: } else {
1506: cmap = NULL;
1507: cmap_loc = NULL;
1508: }
1509: PetscCalloc1(C->rmap->n,&rmap_loc);
1510: #else
1511: if (!allcolumns) {
1512: PetscCalloc1(C->cmap->N,&cmap);
1513: for (j=0; j<ncol; j++) cmap[icol[j]] = j+1;
1514: } else {
1515: cmap = NULL;
1516: }
1517: #endif
1519: /* Create lens for MatSeqAIJSetPreallocation() */
1520: PetscCalloc1(nrow,&lens);
1522: /* Compute lens from local part of C */
1523: for (j=0; j<nrow; j++) {
1524: row = irow[j];
1525: proc = row2proc[j];
1526: if (proc == rank) {
1527: /* diagonal part A = c->A */
1528: ncols = ai[row-rstart+1] - ai[row-rstart];
1529: cols = aj + ai[row-rstart];
1530: if (!allcolumns) {
1531: for (k=0; k<ncols; k++) {
1532: #if defined(PETSC_USE_CTABLE)
1533: tcol = cmap_loc[cols[k]];
1534: #else
1535: tcol = cmap[cols[k]+cstart];
1536: #endif
1537: if (tcol) lens[j]++;
1538: }
1539: } else { /* allcolumns */
1540: lens[j] = ncols;
1541: }
1543: /* off-diagonal part B = c->B */
1544: ncols = bi[row-rstart+1] - bi[row-rstart];
1545: cols = bj + bi[row-rstart];
1546: if (!allcolumns) {
1547: for (k=0; k<ncols; k++) {
1548: #if defined(PETSC_USE_CTABLE)
1549: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1550: #else
1551: tcol = cmap[bmap[cols[k]]];
1552: #endif
1553: if (tcol) lens[j]++;
1554: }
1555: } else { /* allcolumns */
1556: lens[j] += ncols;
1557: }
1558: }
1559: }
1561: /* Create row map (rmap): global row of C -> local row of submat */
1562: #if defined(PETSC_USE_CTABLE)
1563: PetscTableCreate(nrow,C->rmap->N,&rmap);
1564: for (j=0; j<nrow; j++) {
1565: row = irow[j];
1566: proc = row2proc[j];
1567: if (proc == rank) { /* a local row */
1568: rmap_loc[row - rstart] = j;
1569: } else {
1570: PetscTableAdd(rmap,irow[j]+1,j+1,INSERT_VALUES);
1571: }
1572: }
1573: #else
1574: PetscCalloc1(C->rmap->N,&rmap);
1575: for (j=0; j<nrow; j++) {
1576: rmap[irow[j]] = j;
1577: }
1578: #endif
1580: /* Update lens from offproc data */
1581: /* recv a->j is done */
1582: MPI_Waitall(nrqs,r_waits3,r_status3);
1583: for (i=0; i<nrqs; i++) {
1584: proc = pa[i];
1585: sbuf1_i = sbuf1[proc];
1586: /* jmax = sbuf1_i[0]; if (jmax != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax !=1"); */
1587: ct1 = 2 + 1;
1588: ct2 = 0;
1589: rbuf2_i = rbuf2[i]; /* received length of C->j */
1590: rbuf3_i = rbuf3[i]; /* received C->j */
1592: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1593: max1 = sbuf1_i[2];
1594: for (k=0; k<max1; k++,ct1++) {
1595: #if defined(PETSC_USE_CTABLE)
1596: PetscTableFind(rmap,sbuf1_i[ct1]+1,&row);
1597: row--;
1598: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
1599: #else
1600: row = rmap[sbuf1_i[ct1]]; /* the row index in submat */
1601: #endif
1602: /* Now, store row index of submat in sbuf1_i[ct1] */
1603: sbuf1_i[ct1] = row;
1605: nnz = rbuf2_i[ct1];
1606: if (!allcolumns) {
1607: for (l=0; l<nnz; l++,ct2++) {
1608: #if defined(PETSC_USE_CTABLE)
1609: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1610: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1611: } else {
1612: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1613: }
1614: #else
1615: tcol = cmap[rbuf3_i[ct2]]; /* column index in submat */
1616: #endif
1617: if (tcol) lens[row]++;
1618: }
1619: } else { /* allcolumns */
1620: lens[row] += nnz;
1621: }
1622: }
1623: }
1624: MPI_Waitall(nrqr,s_waits3,s_status3);
1625: PetscFree4(r_waits3,s_waits3,r_status3,s_status3);
1627: /* Create the submatrices */
1628: MatCreate(PETSC_COMM_SELF,&submat);
1629: MatSetSizes(submat,nrow,ncol,PETSC_DETERMINE,PETSC_DETERMINE);
1631: ISGetBlockSize(isrow[0],&i);
1632: ISGetBlockSize(iscol[0],&j);
1633: MatSetBlockSizes(submat,i,j);
1634: MatSetType(submat,((PetscObject)A)->type_name);
1635: MatSeqAIJSetPreallocation(submat,0,lens);
1637:     /* create struct Mat_SubSppt and attach it to submat */
1638: PetscNew(&smatis1);
1639: subc = (Mat_SeqAIJ*)submat->data;
1640: subc->submatis1 = smatis1;
1642: smatis1->id = 0;
1643: smatis1->nrqs = nrqs;
1644: smatis1->nrqr = nrqr;
1645: smatis1->rbuf1 = rbuf1;
1646: smatis1->rbuf2 = rbuf2;
1647: smatis1->rbuf3 = rbuf3;
1648: smatis1->sbuf2 = sbuf2;
1649: smatis1->req_source2 = req_source2;
1651: smatis1->sbuf1 = sbuf1;
1652: smatis1->ptr = ptr;
1653: smatis1->tmp = tmp;
1654: smatis1->ctr = ctr;
1656: smatis1->pa = pa;
1657: smatis1->req_size = req_size;
1658: smatis1->req_source1 = req_source1;
1660: smatis1->allcolumns = allcolumns;
1661: smatis1->singleis = PETSC_TRUE;
1662: smatis1->row2proc = row2proc;
1663: smatis1->rmap = rmap;
1664: smatis1->cmap = cmap;
1665: #if defined(PETSC_USE_CTABLE)
1666: smatis1->rmap_loc = rmap_loc;
1667: smatis1->cmap_loc = cmap_loc;
1668: #endif
1670: smatis1->destroy = submat->ops->destroy;
1671: submat->ops->destroy = MatDestroySubMatrix_SeqAIJ;
1672: submat->factortype = C->factortype;
1674: /* compute rmax */
1675: rmax = 0;
1676: for (i=0; i<nrow; i++) rmax = PetscMax(rmax,lens[i]);
1678: } else { /* scall == MAT_REUSE_MATRIX */
1679: submat = submats[0];
1680: if (submat->rmap->n != nrow || submat->cmap->n != ncol) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong size");
1682: subc = (Mat_SeqAIJ*)submat->data;
1683: rmax = subc->rmax;
1684: smatis1 = subc->submatis1;
1685: nrqs = smatis1->nrqs;
1686: nrqr = smatis1->nrqr;
1687: rbuf1 = smatis1->rbuf1;
1688: rbuf2 = smatis1->rbuf2;
1689: rbuf3 = smatis1->rbuf3;
1690: req_source2 = smatis1->req_source2;
1692: sbuf1 = smatis1->sbuf1;
1693: sbuf2 = smatis1->sbuf2;
1694: ptr = smatis1->ptr;
1695: tmp = smatis1->tmp;
1696: ctr = smatis1->ctr;
1698: pa = smatis1->pa;
1699: req_size = smatis1->req_size;
1700: req_source1 = smatis1->req_source1;
1702: allcolumns = smatis1->allcolumns;
1703: row2proc = smatis1->row2proc;
1704: rmap = smatis1->rmap;
1705: cmap = smatis1->cmap;
1706: #if defined(PETSC_USE_CTABLE)
1707: rmap_loc = smatis1->rmap_loc;
1708: cmap_loc = smatis1->cmap_loc;
1709: #endif
1710: }
1712: /* Post recv matrix values */
1713: PetscMalloc3(nrqs,&rbuf4, rmax,&subcols, rmax,&subvals);
1714: PetscMalloc4(nrqs,&r_waits4,nrqr,&s_waits4,nrqs,&r_status4,nrqr,&s_status4);
1715: PetscObjectGetNewTag((PetscObject)C,&tag4);
1716: for (i=0; i<nrqs; ++i) {
1717: PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
1718: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
1719: }
1721: /* Allocate sending buffers for a->a, and send them off */
1722: PetscMalloc1(nrqr,&sbuf_aa);
1723: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
1724: if (nrqr) {PetscMalloc1(j,&sbuf_aa[0]);}
1725: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
1727: for (i=0; i<nrqr; i++) {
1728: rbuf1_i = rbuf1[i];
1729: sbuf_aa_i = sbuf_aa[i];
1730: ct1 = 2*rbuf1_i[0]+1;
1731: ct2 = 0;
1732: /* max1=rbuf1_i[0]; if (max1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"max1 !=1"); */
1734: kmax = rbuf1_i[2];
1735: for (k=0; k<kmax; k++,ct1++) {
1736: row = rbuf1_i[ct1] - rstart;
1737: nzA = ai[row+1] - ai[row];
1738: nzB = bi[row+1] - bi[row];
1739: ncols = nzA + nzB;
1740: cworkB = bj + bi[row];
1741: vworkA = a_a + ai[row];
1742: vworkB = b_a + bi[row];
1744: /* load the column values for this row into vals*/
1745: vals = sbuf_aa_i + ct2;
1747: lwrite = 0;
1748: for (l=0; l<nzB; l++) {
1749: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
1750: }
1751: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
1752: for (l=0; l<nzB; l++) {
1753: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
1754: }
1756: ct2 += ncols;
1757: }
1758: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
1759: }
1761: /* Assemble submat */
1762: /* First assemble the local rows */
1763: for (j=0; j<nrow; j++) {
1764: row = irow[j];
1765: proc = row2proc[j];
1766: if (proc == rank) {
1767: Crow = row - rstart; /* local row index of C */
1768: #if defined(PETSC_USE_CTABLE)
1769: row = rmap_loc[Crow]; /* row index of submat */
1770: #else
1771: row = rmap[row];
1772: #endif
1774: if (allcolumns) {
1775: /* diagonal part A = c->A */
1776: ncols = ai[Crow+1] - ai[Crow];
1777: cols = aj + ai[Crow];
1778: vals = a_a + ai[Crow];
1779: i = 0;
1780: for (k=0; k<ncols; k++) {
1781: subcols[i] = cols[k] + cstart;
1782: subvals[i++] = vals[k];
1783: }
1785: /* off-diagonal part B = c->B */
1786: ncols = bi[Crow+1] - bi[Crow];
1787: cols = bj + bi[Crow];
1788: vals = b_a + bi[Crow];
1789: for (k=0; k<ncols; k++) {
1790: subcols[i] = bmap[cols[k]];
1791: subvals[i++] = vals[k];
1792: }
1794: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1796: } else { /* !allcolumns */
1797: #if defined(PETSC_USE_CTABLE)
1798: /* diagonal part A = c->A */
1799: ncols = ai[Crow+1] - ai[Crow];
1800: cols = aj + ai[Crow];
1801: vals = a_a + ai[Crow];
1802: i = 0;
1803: for (k=0; k<ncols; k++) {
1804: tcol = cmap_loc[cols[k]];
1805: if (tcol) {
1806: subcols[i] = --tcol;
1807: subvals[i++] = vals[k];
1808: }
1809: }
1811: /* off-diagonal part B = c->B */
1812: ncols = bi[Crow+1] - bi[Crow];
1813: cols = bj + bi[Crow];
1814: vals = b_a + bi[Crow];
1815: for (k=0; k<ncols; k++) {
1816: PetscTableFind(cmap,bmap[cols[k]]+1,&tcol);
1817: if (tcol) {
1818: subcols[i] = --tcol;
1819: subvals[i++] = vals[k];
1820: }
1821: }
1822: #else
1823: /* diagonal part A = c->A */
1824: ncols = ai[Crow+1] - ai[Crow];
1825: cols = aj + ai[Crow];
1826: vals = a_a + ai[Crow];
1827: i = 0;
1828: for (k=0; k<ncols; k++) {
1829: tcol = cmap[cols[k]+cstart];
1830: if (tcol) {
1831: subcols[i] = --tcol;
1832: subvals[i++] = vals[k];
1833: }
1834: }
1836: /* off-diagonal part B = c->B */
1837: ncols = bi[Crow+1] - bi[Crow];
1838: cols = bj + bi[Crow];
1839: vals = b_a + bi[Crow];
1840: for (k=0; k<ncols; k++) {
1841: tcol = cmap[bmap[cols[k]]];
1842: if (tcol) {
1843: subcols[i] = --tcol;
1844: subvals[i++] = vals[k];
1845: }
1846: }
1847: #endif
1848: MatSetValues_SeqAIJ(submat,1,&row,i,subcols,subvals,INSERT_VALUES);
1849: }
1850: }
1851: }
1853: /* Now assemble the off-proc rows */
1854: for (i=0; i<nrqs; i++) { /* for each requested message */
1855: /* recv values from other processes */
1856: MPI_Waitany(nrqs,r_waits4,&idex,r_status4+i);
1857: proc = pa[idex];
1858: sbuf1_i = sbuf1[proc];
1859: /* jmax = sbuf1_i[0]; if (jmax != 1)SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"jmax %d != 1",jmax); */
1860: ct1 = 2 + 1;
1861: ct2 = 0; /* count of received C->j */
1862: ct3 = 0; /* count of received C->j that will be inserted into submat */
1863: rbuf2_i = rbuf2[idex]; /* received lengths of C->j from the sending process */
1864: rbuf3_i = rbuf3[idex]; /* received C->j (column indices) from the sending process */
1865: rbuf4_i = rbuf4[idex]; /* received C->a (values) from the sending process */
1867: /* is_no = sbuf1_i[2*j-1]; if (is_no != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"is_no !=0"); */
1868: max1 = sbuf1_i[2]; /* num of rows */
1869: for (k=0; k<max1; k++,ct1++) { /* for each recved row */
1870: row = sbuf1_i[ct1]; /* row index of submat */
1871: if (!allcolumns) {
1872: idex = 0;
1873: if (scall == MAT_INITIAL_MATRIX || !iscolsorted) {
1874: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1875: for (l=0; l<nnz; l++,ct2++) { /* for each recved column */
1876: #if defined(PETSC_USE_CTABLE)
1877: if (rbuf3_i[ct2] >= cstart && rbuf3_i[ct2] <cend) {
1878: tcol = cmap_loc[rbuf3_i[ct2] - cstart];
1879: } else {
1880: PetscTableFind(cmap,rbuf3_i[ct2]+1,&tcol);
1881: }
1882: #else
1883: tcol = cmap[rbuf3_i[ct2]];
1884: #endif
1885: if (tcol) {
1886: subcols[idex] = --tcol; /* may not be sorted */
1887: subvals[idex++] = rbuf4_i[ct2];
1889: /* We receive an entire row of C, but only a subset of it needs to be inserted into submat.
1890: For reuse, we replace the received C->j entry with the index that should be inserted into submat */
1891: if (iscolsorted) rbuf3_i[ct3++] = ct2;
1892: }
1893: }
1894: MatSetValues_SeqAIJ(submat,1,&row,idex,subcols,subvals,INSERT_VALUES);
1895: } else { /* scall == MAT_REUSE_MATRIX */
1896: submat = submats[0];
1897: subc = (Mat_SeqAIJ*)submat->data;
1899: nnz = subc->i[row+1] - subc->i[row]; /* num of submat entries in this row */
1900: for (l=0; l<nnz; l++) {
1901: ct2 = rbuf3_i[ct3++]; /* index of rbuf4_i[] which needs to be inserted into submat */
1902: subvals[idex++] = rbuf4_i[ct2];
1903: }
1905: bj = subc->j + subc->i[row]; /* sorted column indices */
1906: MatSetValues_SeqAIJ(submat,1,&row,nnz,bj,subvals,INSERT_VALUES);
1907: }
1908: } else { /* allcolumns */
1909: nnz = rbuf2_i[ct1]; /* num of C entries in this row */
1910: MatSetValues_SeqAIJ(submat,1,&row,nnz,rbuf3_i+ct2,rbuf4_i+ct2,INSERT_VALUES);
1911: ct2 += nnz;
1912: }
1913: }
1914: }
1916: /* sends of a->a are done */
1917: MPI_Waitall(nrqr,s_waits4,s_status4);
1918: PetscFree4(r_waits4,s_waits4,r_status4,s_status4);
1920: MatAssemblyBegin(submat,MAT_FINAL_ASSEMBLY);
1921: MatAssemblyEnd(submat,MAT_FINAL_ASSEMBLY);
1922: submats[0] = submat;
1924: /* Restore the indices */
1925: ISRestoreIndices(isrow[0],&irow);
1926: if (!allcolumns) {
1927: ISRestoreIndices(iscol[0],&icol);
1928: }
1930: /* Destroy allocated memory */
1931: for (i=0; i<nrqs; ++i) {
1932: PetscFree(rbuf4[i]);
1933: }
1934: PetscFree3(rbuf4,subcols,subvals);
1935: if (sbuf_aa) {
1936: PetscFree(sbuf_aa[0]);
1937: PetscFree(sbuf_aa);
1938: }
1940: if (scall == MAT_INITIAL_MATRIX) {
1941: PetscFree(lens);
1942: if (sbuf_aj) {
1943: PetscFree(sbuf_aj[0]);
1944: PetscFree(sbuf_aj);
1945: }
1946: }
1947: MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
1948: MatSeqAIJRestoreArrayRead(B,(const PetscScalar**)&b_a);
1949: return(0);
1950: }
1952: PetscErrorCode MatCreateSubMatrices_MPIAIJ_SingleIS(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1953: {
1955: PetscInt ncol;
1956: PetscBool colflag,allcolumns=PETSC_FALSE;
1959: /* Allocate memory to hold all the submatrices */
1960: if (scall == MAT_INITIAL_MATRIX) {
1961: PetscCalloc1(2,submat);
1962: }
1964: /* Check for special case: each processor gets entire matrix columns */
1965: ISIdentity(iscol[0],&colflag);
1966: ISGetLocalSize(iscol[0],&ncol);
1967: if (colflag && ncol == C->cmap->N) allcolumns = PETSC_TRUE;
1969: MatCreateSubMatrices_MPIAIJ_SingleIS_Local(C,ismax,isrow,iscol,scall,allcolumns,*submat);
1970: return(0);
1971: }
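/*
   Caller-level sketch (illustrative only, not part of the original source): this single-IS fast
   path is reached through the public MatCreateSubMatrices() interface when every rank supplies
   exactly one index set (PCSetUp_ASM() marks that situation via C->submat_singleis). The row and
   column ranges below are hypothetical:

     Mat      *submats;
     IS       isrow,iscol;
     PetscInt rstart,rend,N;

     MatGetSize(C,NULL,&N);
     MatGetOwnershipRange(C,&rstart,&rend);
     ISCreateStride(PETSC_COMM_SELF,rend-rstart,rstart,1,&isrow);   - the locally owned rows
     ISCreateStride(PETSC_COMM_SELF,N,0,1,&iscol);                  - all columns, triggering the allcolumns branch
     MatCreateSubMatrices(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
     ...
     MatDestroySubMatrices(1,&submats);
     ISDestroy(&isrow);
     ISDestroy(&iscol);
*/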
1973: PetscErrorCode MatCreateSubMatrices_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
1974: {
1976: PetscInt nmax,nstages=0,i,pos,max_no,nrow,ncol,in[2],out[2];
1977: PetscBool rowflag,colflag,wantallmatrix=PETSC_FALSE;
1978: Mat_SeqAIJ *subc;
1979: Mat_SubSppt *smat;
1982: /* Check for special case: each processor has a single IS */
1983: if (C->submat_singleis) { /* flag is set in PCSetUp_ASM() to skip MPI_Allreduce() */
1984: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
1985: C->submat_singleis = PETSC_FALSE; /* restore its default value in case C is later used with non-single IS */
1986: return(0);
1987: }
1989: /* Collect global wantallmatrix and nstages */
1990: if (!C->cmap->N) nmax=20*1000000/sizeof(PetscInt);
1991: else nmax = 20*1000000 / (C->cmap->N * sizeof(PetscInt));
1992: if (!nmax) nmax = 1;
1994: if (scall == MAT_INITIAL_MATRIX) {
1995: /* Collect global wantallmatrix and nstages */
1996: if (ismax == 1 && C->rmap->N == C->cmap->N) {
1997: ISIdentity(*isrow,&rowflag);
1998: ISIdentity(*iscol,&colflag);
1999: ISGetLocalSize(*isrow,&nrow);
2000: ISGetLocalSize(*iscol,&ncol);
2001: if (rowflag && colflag && nrow == C->rmap->N && ncol == C->cmap->N) {
2002: wantallmatrix = PETSC_TRUE;
2004: PetscOptionsGetBool(((PetscObject)C)->options,((PetscObject)C)->prefix,"-use_fast_submatrix",&wantallmatrix,NULL);
2005: }
2006: }
2008: /* Determine the number of stages in which the submatrices are extracted.
2009: Each stage extracts at most nmax submatrices.
2010: nmax is determined by the matrix column dimension:
2011: if the original matrix has 20M columns, only one submatrix per stage is allowed, etc.
2012: */
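/* Worked example of the bound above (illustrative, assuming 4-byte PetscInt): for C->cmap->N = 1,000,000
   one gets nmax = 20*1000000/(1000000*4) = 5 submatrices per stage, so ismax = 12 local index sets
   are handled in nstages = 12/5 + 1 = 3 stages; for very wide matrices the division yields 0 and
   nmax is clamped to 1 above. */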
2013: nstages = ismax/nmax + ((ismax % nmax) ? 1 : 0); /* local nstages */
2015: in[0] = -1*(PetscInt)wantallmatrix;
2016: in[1] = nstages;
2017: MPIU_Allreduce(in,out,2,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
2018: wantallmatrix = (PetscBool)(-out[0]);
2019: nstages = out[1]; /* Make sure every processor loops through the global nstages */
2021: } else { /* MAT_REUSE_MATRIX */
2022: if (ismax) {
2023: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2024: smat = subc->submatis1;
2025: } else { /* (*submat)[0] is a dummy matrix */
2026: smat = (Mat_SubSppt*)(*submat)[0]->data;
2027: }
2028: if (!smat) {
2029: /* smat is not generated by MatCreateSubMatrix_MPIAIJ_All(...,MAT_INITIAL_MATRIX,...) */
2030: wantallmatrix = PETSC_TRUE;
2031: } else if (smat->singleis) {
2032: MatCreateSubMatrices_MPIAIJ_SingleIS(C,ismax,isrow,iscol,scall,submat);
2033: return(0);
2034: } else {
2035: nstages = smat->nstages;
2036: }
2037: }
2039: if (wantallmatrix) {
2040: MatCreateSubMatrix_MPIAIJ_All(C,MAT_GET_VALUES,scall,submat);
2041: return(0);
2042: }
2044: /* Allocate memory to hold all the submatrices and dummy submatrices */
2045: if (scall == MAT_INITIAL_MATRIX) {
2046: PetscCalloc1(ismax+nstages,submat);
2047: }
2049: for (i=0,pos=0; i<nstages; i++) {
2050: if (pos+nmax <= ismax) max_no = nmax;
2051: else if (pos >= ismax) max_no = 0;
2052: else max_no = ismax-pos;
2054: MatCreateSubMatrices_MPIAIJ_Local(C,max_no,isrow+pos,iscol+pos,scall,*submat+pos);
2055: if (!max_no) {
2056: if (scall == MAT_INITIAL_MATRIX) { /* submat[pos] is a dummy matrix */
2057: smat = (Mat_SubSppt*)(*submat)[pos]->data;
2058: smat->nstages = nstages;
2059: }
2060: pos++; /* advance to next dummy matrix if any */
2061: } else pos += max_no;
2062: }
2064: if (ismax && scall == MAT_INITIAL_MATRIX) {
2065: /* save nstages for reuse */
2066: subc = (Mat_SeqAIJ*)(*submat)[0]->data;
2067: smat = subc->submatis1;
2068: smat->nstages = nstages;
2069: }
2070: return(0);
2071: }
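/*
   Illustrative call pattern (an assumption about typical usage, not taken from this file): the staged
   extraction above backs the usual create-then-reuse cycle of the public MatCreateSubMatrices(),
   where nis, isrow[] and iscol[] are hypothetical caller-owned index sets:

     Mat *submats;

     MatCreateSubMatrices(C,nis,isrow,iscol,MAT_INITIAL_MATRIX,&submats);
     ...  change the numerical values of C, keeping its nonzero pattern  ...
     MatCreateSubMatrices(C,nis,isrow,iscol,MAT_REUSE_MATRIX,&submats);
     MatDestroySubMatrices(nis,&submats);
*/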
2073: /* -------------------------------------------------------------------------*/
2074: PetscErrorCode MatCreateSubMatrices_MPIAIJ_Local(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submats)
2075: {
2076: Mat_MPIAIJ *c = (Mat_MPIAIJ*)C->data;
2077: Mat A = c->A;
2078: Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)c->B->data,*subc;
2079: const PetscInt **icol,**irow;
2080: PetscInt *nrow,*ncol,start;
2082: PetscMPIInt rank,size,tag0,tag2,tag3,tag4,*w1,*w2,*w3,*w4,nrqr;
2083: PetscInt **sbuf1,**sbuf2,i,j,k,l,ct1,ct2,**rbuf1,row,proc=-1;
2084: PetscInt nrqs=0,msz,**ptr=NULL,*req_size=NULL,*ctr=NULL,*pa,*tmp=NULL,tcol;
2085: PetscInt **rbuf3=NULL,*req_source1=NULL,*req_source2,**sbuf_aj,**rbuf2=NULL,max1,max2;
2086: PetscInt **lens,is_no,ncols,*cols,mat_i,*mat_j,tmp2,jmax;
2087: #if defined(PETSC_USE_CTABLE)
2088: PetscTable *cmap,cmap_i=NULL,*rmap,rmap_i;
2089: #else
2090: PetscInt **cmap,*cmap_i=NULL,**rmap,*rmap_i;
2091: #endif
2092: const PetscInt *irow_i;
2093: PetscInt ctr_j,*sbuf1_j,*sbuf_aj_i,*rbuf1_i,kmax,*lens_i;
2094: MPI_Request *s_waits1,*r_waits1,*s_waits2,*r_waits2,*r_waits3;
2095: MPI_Request *r_waits4,*s_waits3,*s_waits4;
2096: MPI_Comm comm;
2097: PetscScalar **rbuf4,*rbuf4_i,**sbuf_aa,*vals,*mat_a,*imat_a,*sbuf_aa_i;
2098: PetscMPIInt *onodes1,*olengths1,end;
2099: PetscInt **row2proc,*row2proc_i,ilen_row,*imat_ilen,*imat_j,*imat_i,old_row;
2100: Mat_SubSppt *smat_i;
2101: PetscBool *issorted,*allcolumns,colflag,iscsorted=PETSC_TRUE;
2102: PetscInt *sbuf1_i,*rbuf2_i,*rbuf3_i,ilen;
2105: PetscObjectGetComm((PetscObject)C,&comm);
2106: size = c->size;
2107: rank = c->rank;
2109: PetscMalloc4(ismax,&row2proc,ismax,&cmap,ismax,&rmap,ismax+1,&allcolumns);
2110: PetscMalloc5(ismax,(PetscInt***)&irow,ismax,(PetscInt***)&icol,ismax,&nrow,ismax,&ncol,ismax,&issorted);
2112: for (i=0; i<ismax; i++) {
2113: ISSorted(iscol[i],&issorted[i]);
2114: if (!issorted[i]) iscsorted = issorted[i];
2116: ISSorted(isrow[i],&issorted[i]);
2118: ISGetIndices(isrow[i],&irow[i]);
2119: ISGetLocalSize(isrow[i],&nrow[i]);
2121: /* Check for special case: allcolumn */
2122: ISIdentity(iscol[i],&colflag);
2123: ISGetLocalSize(iscol[i],&ncol[i]);
2124: if (colflag && ncol[i] == C->cmap->N) {
2125: allcolumns[i] = PETSC_TRUE;
2126: icol[i] = NULL;
2127: } else {
2128: allcolumns[i] = PETSC_FALSE;
2129: ISGetIndices(iscol[i],&icol[i]);
2130: }
2131: }
2133: if (scall == MAT_REUSE_MATRIX) {
2134: /* Assumes new rows are same length as the old rows */
2135: for (i=0; i<ismax; i++) {
2136: if (!submats[i]) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats[%D] is null, cannot reuse",i);
2137: subc = (Mat_SeqAIJ*)submats[i]->data;
2138: if ((submats[i]->rmap->n != nrow[i]) || (submats[i]->cmap->n != ncol[i])) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix, wrong size");
2140: /* Initialize matrix as if empty */
2141: PetscArrayzero(subc->ilen,submats[i]->rmap->n);
2143: smat_i = subc->submatis1;
2145: nrqs = smat_i->nrqs;
2146: nrqr = smat_i->nrqr;
2147: rbuf1 = smat_i->rbuf1;
2148: rbuf2 = smat_i->rbuf2;
2149: rbuf3 = smat_i->rbuf3;
2150: req_source2 = smat_i->req_source2;
2152: sbuf1 = smat_i->sbuf1;
2153: sbuf2 = smat_i->sbuf2;
2154: ptr = smat_i->ptr;
2155: tmp = smat_i->tmp;
2156: ctr = smat_i->ctr;
2158: pa = smat_i->pa;
2159: req_size = smat_i->req_size;
2160: req_source1 = smat_i->req_source1;
2162: allcolumns[i] = smat_i->allcolumns;
2163: row2proc[i] = smat_i->row2proc;
2164: rmap[i] = smat_i->rmap;
2165: cmap[i] = smat_i->cmap;
2166: }
2168: if (!ismax) { /* Get dummy submatrices and retrieve struct submatis1 */
2169: if (!submats[0]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_NULL,"submats are null, cannot reuse");
2170: smat_i = (Mat_SubSppt*)submats[0]->data;
2172: nrqs = smat_i->nrqs;
2173: nrqr = smat_i->nrqr;
2174: rbuf1 = smat_i->rbuf1;
2175: rbuf2 = smat_i->rbuf2;
2176: rbuf3 = smat_i->rbuf3;
2177: req_source2 = smat_i->req_source2;
2179: sbuf1 = smat_i->sbuf1;
2180: sbuf2 = smat_i->sbuf2;
2181: ptr = smat_i->ptr;
2182: tmp = smat_i->tmp;
2183: ctr = smat_i->ctr;
2185: pa = smat_i->pa;
2186: req_size = smat_i->req_size;
2187: req_source1 = smat_i->req_source1;
2189: allcolumns[0] = PETSC_FALSE;
2190: }
2191: } else { /* scall == MAT_INITIAL_MATRIX */
2192: /* Get some new tags to keep the communication clean */
2193: PetscObjectGetNewTag((PetscObject)C,&tag2);
2194: PetscObjectGetNewTag((PetscObject)C,&tag3);
2196: /* evaluate communication - mesg to whom, length of mesg, and buffer space
2197: required. Based on this, buffers are allocated, and data copied into them */
2198: PetscCalloc4(size,&w1,size,&w2,size,&w3,size,&w4); /* mesg size, initialize work vectors */
2200: for (i=0; i<ismax; i++) {
2201: jmax = nrow[i];
2202: irow_i = irow[i];
2204: PetscMalloc1(jmax,&row2proc_i);
2205: row2proc[i] = row2proc_i;
2207: if (issorted[i]) proc = 0;
2208: for (j=0; j<jmax; j++) {
2209: if (!issorted[i]) proc = 0;
2210: row = irow_i[j];
2211: while (row >= C->rmap->range[proc+1]) proc++;
2212: w4[proc]++;
2213: row2proc_i[j] = proc; /* map row index to proc */
2214: }
2215: for (j=0; j<size; j++) {
2216: if (w4[j]) { w1[j] += w4[j]; w3[j]++; w4[j] = 0;}
2217: }
2218: }
2220: nrqs = 0; /* no of outgoing messages */
2221: msz = 0; /* total mesg length (for all procs) */
2222: w1[rank] = 0; /* no mesg sent to self */
2223: w3[rank] = 0;
2224: for (i=0; i<size; i++) {
2225: if (w1[i]) { w2[i] = 1; nrqs++;} /* there exists a message to proc i */
2226: }
2227: PetscMalloc1(nrqs,&pa); /*(proc -array)*/
2228: for (i=0,j=0; i<size; i++) {
2229: if (w1[i]) { pa[j] = i; j++; }
2230: }
2232: /* Each message would have a header = 1 + 2*(no of IS) + data */
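/* Layout of one outgoing buffer sbuf1[j], as inferred from the packing loops below (shown only as
   an illustration):
     [ nblocks | is_id_1, nrows_1 | is_id_2, nrows_2 | ... | rows for block 1 | rows for block 2 | ... ]
   i.e. a header of 1 + 2*(no of IS touching proc j) integers followed by the requested global row indices. */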
2233: for (i=0; i<nrqs; i++) {
2234: j = pa[i];
2235: w1[j] += w2[j] + 2* w3[j];
2236: msz += w1[j];
2237: }
2238: PetscInfo2(0,"Number of outgoing messages %D Total message length %D\n",nrqs,msz);
2240: /* Determine the number of messages to expect, their lengths, and their from-ids */
2241: PetscGatherNumberOfMessages(comm,w2,w1,&nrqr);
2242: PetscGatherMessageLengths(comm,nrqs,nrqr,w1,&onodes1,&olengths1);
2244: /* Now post the Irecvs corresponding to these messages */
2245: PetscObjectGetNewTag((PetscObject)C,&tag0);
2246: PetscPostIrecvInt(comm,tag0,nrqr,onodes1,olengths1,&rbuf1,&r_waits1);
2248: /* Allocate Memory for outgoing messages */
2249: PetscMalloc4(size,&sbuf1,size,&ptr,2*msz,&tmp,size,&ctr);
2250: PetscArrayzero(sbuf1,size);
2251: PetscArrayzero(ptr,size);
2253: {
2254: PetscInt *iptr = tmp;
2255: k = 0;
2256: for (i=0; i<nrqs; i++) {
2257: j = pa[i];
2258: iptr += k;
2259: sbuf1[j] = iptr;
2260: k = w1[j];
2261: }
2262: }
2264: /* Form the outgoing messages. Initialize the header space */
2265: for (i=0; i<nrqs; i++) {
2266: j = pa[i];
2267: sbuf1[j][0] = 0;
2268: PetscArrayzero(sbuf1[j]+1,2*w3[j]);
2269: ptr[j] = sbuf1[j] + 2*w3[j] + 1;
2270: }
2272: /* Parse the isrow and copy data into outbuf */
2273: for (i=0; i<ismax; i++) {
2274: row2proc_i = row2proc[i];
2275: PetscArrayzero(ctr,size);
2276: irow_i = irow[i];
2277: jmax = nrow[i];
2278: for (j=0; j<jmax; j++) { /* parse the indices of each IS */
2279: proc = row2proc_i[j];
2280: if (proc != rank) { /* copy to the outgoing buf*/
2281: ctr[proc]++;
2282: *ptr[proc] = irow_i[j];
2283: ptr[proc]++;
2284: }
2285: }
2286: /* Update the headers for the current IS */
2287: for (j=0; j<size; j++) { /* Can Optimise this loop too */
2288: if ((ctr_j = ctr[j])) {
2289: sbuf1_j = sbuf1[j];
2290: k = ++sbuf1_j[0];
2291: sbuf1_j[2*k] = ctr_j;
2292: sbuf1_j[2*k-1] = i;
2293: }
2294: }
2295: }
2297: /* Now post the sends */
2298: PetscMalloc1(nrqs,&s_waits1);
2299: for (i=0; i<nrqs; ++i) {
2300: j = pa[i];
2301: MPI_Isend(sbuf1[j],w1[j],MPIU_INT,j,tag0,comm,s_waits1+i);
2302: }
2304: /* Post Receives to capture the buffer size */
2305: PetscMalloc1(nrqs,&r_waits2);
2306: PetscMalloc3(nrqs,&req_source2,nrqs,&rbuf2,nrqs,&rbuf3);
2307: if (nrqs) rbuf2[0] = tmp + msz;
2308: for (i=1; i<nrqs; ++i) {
2309: rbuf2[i] = rbuf2[i-1]+w1[pa[i-1]];
2310: }
2311: for (i=0; i<nrqs; ++i) {
2312: j = pa[i];
2313: MPI_Irecv(rbuf2[i],w1[j],MPIU_INT,j,tag2,comm,r_waits2+i);
2314: }
2316: /* Send to other procs the buf size they should allocate */
2317: /* Receive messages*/
2318: PetscMalloc1(nrqr,&s_waits2);
2319: PetscMalloc3(nrqr,&sbuf2,nrqr,&req_size,nrqr,&req_source1);
2320: {
2321: PetscInt *sAi = a->i,*sBi = b->i,id,rstart = C->rmap->rstart;
2322: PetscInt *sbuf2_i;
2324: MPI_Waitall(nrqr,r_waits1,MPI_STATUSES_IGNORE);
2325: for (i=0; i<nrqr; ++i) {
2326: req_size[i] = 0;
2327: rbuf1_i = rbuf1[i];
2328: start = 2*rbuf1_i[0] + 1;
2329: end = olengths1[i];
2330: PetscMalloc1(end,&sbuf2[i]);
2331: sbuf2_i = sbuf2[i];
2332: for (j=start; j<end; j++) {
2333: id = rbuf1_i[j] - rstart;
2334: ncols = sAi[id+1] - sAi[id] + sBi[id+1] - sBi[id];
2335: sbuf2_i[j] = ncols;
2336: req_size[i] += ncols;
2337: }
2338: req_source1[i] = onodes1[i];
2339: /* form the header */
2340: sbuf2_i[0] = req_size[i];
2341: for (j=1; j<start; j++) sbuf2_i[j] = rbuf1_i[j];
2343: MPI_Isend(sbuf2_i,end,MPIU_INT,req_source1[i],tag2,comm,s_waits2+i);
2344: }
2345: }
2347: PetscFree(onodes1);
2348: PetscFree(olengths1);
2349: PetscFree(r_waits1);
2350: PetscFree4(w1,w2,w3,w4);
2352: /* Receive messages*/
2353: PetscMalloc1(nrqs,&r_waits3);
2354: MPI_Waitall(nrqs,r_waits2,MPI_STATUSES_IGNORE);
2355: for (i=0; i<nrqs; ++i) {
2356: PetscMalloc1(rbuf2[i][0],&rbuf3[i]);
2357: req_source2[i] = pa[i];
2358: MPI_Irecv(rbuf3[i],rbuf2[i][0],MPIU_INT,req_source2[i],tag3,comm,r_waits3+i);
2359: }
2360: PetscFree(r_waits2);
2362: /* Wait on sends1 and sends2 */
2363: MPI_Waitall(nrqs,s_waits1,MPI_STATUSES_IGNORE);
2364: MPI_Waitall(nrqr,s_waits2,MPI_STATUSES_IGNORE);
2365: PetscFree(s_waits1);
2366: PetscFree(s_waits2);
2368: /* Now allocate sending buffers for a->j, and send them off */
2369: PetscMalloc1(nrqr,&sbuf_aj);
2370: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2371: if (nrqr) {PetscMalloc1(j,&sbuf_aj[0]);}
2372: for (i=1; i<nrqr; i++) sbuf_aj[i] = sbuf_aj[i-1] + req_size[i-1];
2374: PetscMalloc1(nrqr,&s_waits3);
2375: {
2376: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i,lwrite;
2377: PetscInt *cworkA,*cworkB,cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2378: PetscInt cend = C->cmap->rend;
2379: PetscInt *a_j = a->j,*b_j = b->j,ctmp;
2381: for (i=0; i<nrqr; i++) {
2382: rbuf1_i = rbuf1[i];
2383: sbuf_aj_i = sbuf_aj[i];
2384: ct1 = 2*rbuf1_i[0] + 1;
2385: ct2 = 0;
2386: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2387: kmax = rbuf1[i][2*j];
2388: for (k=0; k<kmax; k++,ct1++) {
2389: row = rbuf1_i[ct1] - rstart;
2390: nzA = a_i[row+1] - a_i[row];
2391: nzB = b_i[row+1] - b_i[row];
2392: ncols = nzA + nzB;
2393: cworkA = a_j + a_i[row];
2394: cworkB = b_j + b_i[row];
2396: /* load the column indices for this row into cols */
2397: cols = sbuf_aj_i + ct2;
2399: lwrite = 0;
2400: for (l=0; l<nzB; l++) {
2401: if ((ctmp = bmap[cworkB[l]]) < cstart) cols[lwrite++] = ctmp;
2402: }
2403: for (l=0; l<nzA; l++) cols[lwrite++] = cstart + cworkA[l];
2404: for (l=0; l<nzB; l++) {
2405: if ((ctmp = bmap[cworkB[l]]) >= cend) cols[lwrite++] = ctmp;
2406: }
2408: ct2 += ncols;
2409: }
2410: }
2411: MPI_Isend(sbuf_aj_i,req_size[i],MPIU_INT,req_source1[i],tag3,comm,s_waits3+i);
2412: }
2413: }
2415: /* create col map: global col of C -> local col of submatrices */
2416: {
2417: const PetscInt *icol_i;
2418: #if defined(PETSC_USE_CTABLE)
2419: for (i=0; i<ismax; i++) {
2420: if (!allcolumns[i]) {
2421: PetscTableCreate(ncol[i],C->cmap->N,&cmap[i]);
2423: jmax = ncol[i];
2424: icol_i = icol[i];
2425: cmap_i = cmap[i];
2426: for (j=0; j<jmax; j++) {
2427: PetscTableAdd(cmap[i],icol_i[j]+1,j+1,INSERT_VALUES);
2428: }
2429: } else cmap[i] = NULL;
2430: }
2431: #else
2432: for (i=0; i<ismax; i++) {
2433: if (!allcolumns[i]) {
2434: PetscCalloc1(C->cmap->N,&cmap[i]);
2435: jmax = ncol[i];
2436: icol_i = icol[i];
2437: cmap_i = cmap[i];
2438: for (j=0; j<jmax; j++) {
2439: cmap_i[icol_i[j]] = j+1;
2440: }
2441: } else cmap[i] = NULL;
2442: }
2443: #endif
2444: }
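/* Example of the 1-based convention used in cmap above (hypothetical data): if iscol[i] = {4,9},
   then cmap_i[4] = 1, cmap_i[9] = 2, and all other entries remain 0 (or absent from the table);
   later lookups treat a zero result as "column not selected" and subtract 1 to recover the local
   column index of the submatrix. */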
2446: /* Create lens which is required for MatCreate... */
2447: for (i=0,j=0; i<ismax; i++) j += nrow[i];
2448: PetscMalloc1(ismax,&lens);
2450: if (ismax) {
2451: PetscCalloc1(j,&lens[0]);
2452: }
2453: for (i=1; i<ismax; i++) lens[i] = lens[i-1] + nrow[i-1];
2455: /* Update lens from local data */
2456: for (i=0; i<ismax; i++) {
2457: row2proc_i = row2proc[i];
2458: jmax = nrow[i];
2459: if (!allcolumns[i]) cmap_i = cmap[i];
2460: irow_i = irow[i];
2461: lens_i = lens[i];
2462: for (j=0; j<jmax; j++) {
2463: row = irow_i[j];
2464: proc = row2proc_i[j];
2465: if (proc == rank) {
2466: MatGetRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2467: if (!allcolumns[i]) {
2468: for (k=0; k<ncols; k++) {
2469: #if defined(PETSC_USE_CTABLE)
2470: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2471: #else
2472: tcol = cmap_i[cols[k]];
2473: #endif
2474: if (tcol) lens_i[j]++;
2475: }
2476: } else { /* allcolumns */
2477: lens_i[j] = ncols;
2478: }
2479: MatRestoreRow_MPIAIJ(C,row,&ncols,&cols,NULL);
2480: }
2481: }
2482: }
2484: /* Create row map: global row of C -> local row of submatrices */
2485: #if defined(PETSC_USE_CTABLE)
2486: for (i=0; i<ismax; i++) {
2487: PetscTableCreate(nrow[i],C->rmap->N,&rmap[i]);
2488: irow_i = irow[i];
2489: jmax = nrow[i];
2490: for (j=0; j<jmax; j++) {
2491: PetscTableAdd(rmap[i],irow_i[j]+1,j+1,INSERT_VALUES);
2492: }
2493: }
2494: #else
2495: for (i=0; i<ismax; i++) {
2496: PetscCalloc1(C->rmap->N,&rmap[i]);
2497: rmap_i = rmap[i];
2498: irow_i = irow[i];
2499: jmax = nrow[i];
2500: for (j=0; j<jmax; j++) {
2501: rmap_i[irow_i[j]] = j;
2502: }
2503: }
2504: #endif
2506: /* Update lens from offproc data */
2507: {
2508: PetscInt *rbuf2_i,*rbuf3_i,*sbuf1_i;
2510: MPI_Waitall(nrqs,r_waits3,MPI_STATUSES_IGNORE);
2511: for (tmp2=0; tmp2<nrqs; tmp2++) {
2512: sbuf1_i = sbuf1[pa[tmp2]];
2513: jmax = sbuf1_i[0];
2514: ct1 = 2*jmax+1;
2515: ct2 = 0;
2516: rbuf2_i = rbuf2[tmp2];
2517: rbuf3_i = rbuf3[tmp2];
2518: for (j=1; j<=jmax; j++) {
2519: is_no = sbuf1_i[2*j-1];
2520: max1 = sbuf1_i[2*j];
2521: lens_i = lens[is_no];
2522: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2523: rmap_i = rmap[is_no];
2524: for (k=0; k<max1; k++,ct1++) {
2525: #if defined(PETSC_USE_CTABLE)
2526: PetscTableFind(rmap_i,sbuf1_i[ct1]+1,&row);
2527: row--;
2528: if (row < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"row not found in table");
2529: #else
2530: row = rmap_i[sbuf1_i[ct1]]; /* the row index in the submatrix */
2531: #endif
2532: max2 = rbuf2_i[ct1];
2533: for (l=0; l<max2; l++,ct2++) {
2534: if (!allcolumns[is_no]) {
2535: #if defined(PETSC_USE_CTABLE)
2536: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2537: #else
2538: tcol = cmap_i[rbuf3_i[ct2]];
2539: #endif
2540: if (tcol) lens_i[row]++;
2541: } else { /* allcolumns */
2542: lens_i[row]++; /* lens_i[row] += max2 ? */
2543: }
2544: }
2545: }
2546: }
2547: }
2548: }
2549: PetscFree(r_waits3);
2550: MPI_Waitall(nrqr,s_waits3,MPI_STATUSES_IGNORE);
2551: PetscFree(s_waits3);
2553: /* Create the submatrices */
2554: for (i=0; i<ismax; i++) {
2555: PetscInt rbs,cbs;
2557: ISGetBlockSize(isrow[i],&rbs);
2558: ISGetBlockSize(iscol[i],&cbs);
2560: MatCreate(PETSC_COMM_SELF,submats+i);
2561: MatSetSizes(submats[i],nrow[i],ncol[i],PETSC_DETERMINE,PETSC_DETERMINE);
2563: MatSetBlockSizes(submats[i],rbs,cbs);
2564: MatSetType(submats[i],((PetscObject)A)->type_name);
2565: MatSeqAIJSetPreallocation(submats[i],0,lens[i]);
2567: /* create struct Mat_SubSppt and attach it to submat */
2568: PetscNew(&smat_i);
2569: subc = (Mat_SeqAIJ*)submats[i]->data;
2570: subc->submatis1 = smat_i;
2572: smat_i->destroy = submats[i]->ops->destroy;
2573: submats[i]->ops->destroy = MatDestroySubMatrix_SeqAIJ;
2574: submats[i]->factortype = C->factortype;
2576: smat_i->id = i;
2577: smat_i->nrqs = nrqs;
2578: smat_i->nrqr = nrqr;
2579: smat_i->rbuf1 = rbuf1;
2580: smat_i->rbuf2 = rbuf2;
2581: smat_i->rbuf3 = rbuf3;
2582: smat_i->sbuf2 = sbuf2;
2583: smat_i->req_source2 = req_source2;
2585: smat_i->sbuf1 = sbuf1;
2586: smat_i->ptr = ptr;
2587: smat_i->tmp = tmp;
2588: smat_i->ctr = ctr;
2590: smat_i->pa = pa;
2591: smat_i->req_size = req_size;
2592: smat_i->req_source1 = req_source1;
2594: smat_i->allcolumns = allcolumns[i];
2595: smat_i->singleis = PETSC_FALSE;
2596: smat_i->row2proc = row2proc[i];
2597: smat_i->rmap = rmap[i];
2598: smat_i->cmap = cmap[i];
2599: }
2601: if (!ismax) { /* Create a dummy submats[0] to carry the struct Mat_SubSppt for reuse */
2602: MatCreate(PETSC_COMM_SELF,&submats[0]);
2603: MatSetSizes(submats[0],0,0,PETSC_DETERMINE,PETSC_DETERMINE);
2604: MatSetType(submats[0],MATDUMMY);
2606: /* create struct Mat_SubSppt and attach it to submat */
2607: PetscNewLog(submats[0],&smat_i);
2608: submats[0]->data = (void*)smat_i;
2610: smat_i->destroy = submats[0]->ops->destroy;
2611: submats[0]->ops->destroy = MatDestroySubMatrix_Dummy;
2612: submats[0]->factortype = C->factortype;
2614: smat_i->id = 0;
2615: smat_i->nrqs = nrqs;
2616: smat_i->nrqr = nrqr;
2617: smat_i->rbuf1 = rbuf1;
2618: smat_i->rbuf2 = rbuf2;
2619: smat_i->rbuf3 = rbuf3;
2620: smat_i->sbuf2 = sbuf2;
2621: smat_i->req_source2 = req_source2;
2623: smat_i->sbuf1 = sbuf1;
2624: smat_i->ptr = ptr;
2625: smat_i->tmp = tmp;
2626: smat_i->ctr = ctr;
2628: smat_i->pa = pa;
2629: smat_i->req_size = req_size;
2630: smat_i->req_source1 = req_source1;
2632: smat_i->allcolumns = PETSC_FALSE;
2633: smat_i->singleis = PETSC_FALSE;
2634: smat_i->row2proc = NULL;
2635: smat_i->rmap = NULL;
2636: smat_i->cmap = NULL;
2637: }
2639: if (ismax) {PetscFree(lens[0]);}
2640: PetscFree(lens);
2641: if (sbuf_aj) {
2642: PetscFree(sbuf_aj[0]);
2643: PetscFree(sbuf_aj);
2644: }
2646: } /* end of scall == MAT_INITIAL_MATRIX */
2648: /* Post recv matrix values */
2649: PetscObjectGetNewTag((PetscObject)C,&tag4);
2650: PetscMalloc1(nrqs,&rbuf4);
2651: PetscMalloc1(nrqs,&r_waits4);
2652: for (i=0; i<nrqs; ++i) {
2653: PetscMalloc1(rbuf2[i][0],&rbuf4[i]);
2654: MPI_Irecv(rbuf4[i],rbuf2[i][0],MPIU_SCALAR,req_source2[i],tag4,comm,r_waits4+i);
2655: }
2657: /* Allocate sending buffers for a->a, and send them off */
2658: PetscMalloc1(nrqr,&sbuf_aa);
2659: for (i=0,j=0; i<nrqr; i++) j += req_size[i];
2660: if (nrqr) {PetscMalloc1(j,&sbuf_aa[0]);}
2661: for (i=1; i<nrqr; i++) sbuf_aa[i] = sbuf_aa[i-1] + req_size[i-1];
2663: PetscMalloc1(nrqr,&s_waits4);
2664: {
2665: PetscInt nzA,nzB,*a_i = a->i,*b_i = b->i, *cworkB,lwrite;
2666: PetscInt cstart = C->cmap->rstart,rstart = C->rmap->rstart,*bmap = c->garray;
2667: PetscInt cend = C->cmap->rend;
2668: PetscInt *b_j = b->j;
2669: PetscScalar *vworkA,*vworkB,*a_a,*b_a;
2671: MatSeqAIJGetArrayRead(A,(const PetscScalar**)&a_a);
2672: MatSeqAIJGetArrayRead(c->B,(const PetscScalar**)&b_a);
2673: for (i=0; i<nrqr; i++) {
2674: rbuf1_i = rbuf1[i];
2675: sbuf_aa_i = sbuf_aa[i];
2676: ct1 = 2*rbuf1_i[0]+1;
2677: ct2 = 0;
2678: for (j=1,max1=rbuf1_i[0]; j<=max1; j++) {
2679: kmax = rbuf1_i[2*j];
2680: for (k=0; k<kmax; k++,ct1++) {
2681: row = rbuf1_i[ct1] - rstart;
2682: nzA = a_i[row+1] - a_i[row];
2683: nzB = b_i[row+1] - b_i[row];
2684: ncols = nzA + nzB;
2685: cworkB = b_j + b_i[row];
2686: vworkA = a_a + a_i[row];
2687: vworkB = b_a + b_i[row];
2689: /* load the column values for this row into vals*/
2690: vals = sbuf_aa_i+ct2;
2692: lwrite = 0;
2693: for (l=0; l<nzB; l++) {
2694: if ((bmap[cworkB[l]]) < cstart) vals[lwrite++] = vworkB[l];
2695: }
2696: for (l=0; l<nzA; l++) vals[lwrite++] = vworkA[l];
2697: for (l=0; l<nzB; l++) {
2698: if ((bmap[cworkB[l]]) >= cend) vals[lwrite++] = vworkB[l];
2699: }
2701: ct2 += ncols;
2702: }
2703: }
2704: MPI_Isend(sbuf_aa_i,req_size[i],MPIU_SCALAR,req_source1[i],tag4,comm,s_waits4+i);
2705: }
2706: MatSeqAIJRestoreArrayRead(A,(const PetscScalar**)&a_a);
2707: MatSeqAIJRestoreArrayRead(c->B,(const PetscScalar**)&b_a);
2708: }
2710: /* Assemble the matrices */
2711: /* First assemble the local rows */
2712: for (i=0; i<ismax; i++) {
2713: row2proc_i = row2proc[i];
2714: subc = (Mat_SeqAIJ*)submats[i]->data;
2715: imat_ilen = subc->ilen;
2716: imat_j = subc->j;
2717: imat_i = subc->i;
2718: imat_a = subc->a;
2720: if (!allcolumns[i]) cmap_i = cmap[i];
2721: rmap_i = rmap[i];
2722: irow_i = irow[i];
2723: jmax = nrow[i];
2724: for (j=0; j<jmax; j++) {
2725: row = irow_i[j];
2726: proc = row2proc_i[j];
2727: if (proc == rank) {
2728: old_row = row;
2729: #if defined(PETSC_USE_CTABLE)
2730: PetscTableFind(rmap_i,row+1,&row);
2731: row--;
2732: #else
2733: row = rmap_i[row];
2734: #endif
2735: ilen_row = imat_ilen[row];
2736: MatGetRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2737: mat_i = imat_i[row];
2738: mat_a = imat_a + mat_i;
2739: mat_j = imat_j + mat_i;
2740: if (!allcolumns[i]) {
2741: for (k=0; k<ncols; k++) {
2742: #if defined(PETSC_USE_CTABLE)
2743: PetscTableFind(cmap_i,cols[k]+1,&tcol);
2744: #else
2745: tcol = cmap_i[cols[k]];
2746: #endif
2747: if (tcol) {
2748: *mat_j++ = tcol - 1;
2749: *mat_a++ = vals[k];
2750: ilen_row++;
2751: }
2752: }
2753: } else { /* allcolumns */
2754: for (k=0; k<ncols; k++) {
2755: *mat_j++ = cols[k]; /* global col index! */
2756: *mat_a++ = vals[k];
2757: ilen_row++;
2758: }
2759: }
2760: MatRestoreRow_MPIAIJ(C,old_row,&ncols,&cols,&vals);
2762: imat_ilen[row] = ilen_row;
2763: }
2764: }
2765: }
2767: /* Now assemble the off-proc rows */
2768: MPI_Waitall(nrqs,r_waits4,MPI_STATUSES_IGNORE);
2769: for (tmp2=0; tmp2<nrqs; tmp2++) {
2770: sbuf1_i = sbuf1[pa[tmp2]];
2771: jmax = sbuf1_i[0];
2772: ct1 = 2*jmax + 1;
2773: ct2 = 0;
2774: rbuf2_i = rbuf2[tmp2];
2775: rbuf3_i = rbuf3[tmp2];
2776: rbuf4_i = rbuf4[tmp2];
2777: for (j=1; j<=jmax; j++) {
2778: is_no = sbuf1_i[2*j-1];
2779: rmap_i = rmap[is_no];
2780: if (!allcolumns[is_no]) cmap_i = cmap[is_no];
2781: subc = (Mat_SeqAIJ*)submats[is_no]->data;
2782: imat_ilen = subc->ilen;
2783: imat_j = subc->j;
2784: imat_i = subc->i;
2785: imat_a = subc->a;
2786: max1 = sbuf1_i[2*j];
2787: for (k=0; k<max1; k++,ct1++) {
2788: row = sbuf1_i[ct1];
2789: #if defined(PETSC_USE_CTABLE)
2790: PetscTableFind(rmap_i,row+1,&row);
2791: row--;
2792: #else
2793: row = rmap_i[row];
2794: #endif
2795: ilen = imat_ilen[row];
2796: mat_i = imat_i[row];
2797: mat_a = imat_a + mat_i;
2798: mat_j = imat_j + mat_i;
2799: max2 = rbuf2_i[ct1];
2800: if (!allcolumns[is_no]) {
2801: for (l=0; l<max2; l++,ct2++) {
2802: #if defined(PETSC_USE_CTABLE)
2803: PetscTableFind(cmap_i,rbuf3_i[ct2]+1,&tcol);
2804: #else
2805: tcol = cmap_i[rbuf3_i[ct2]];
2806: #endif
2807: if (tcol) {
2808: *mat_j++ = tcol - 1;
2809: *mat_a++ = rbuf4_i[ct2];
2810: ilen++;
2811: }
2812: }
2813: } else { /* allcolumns */
2814: for (l=0; l<max2; l++,ct2++) {
2815: *mat_j++ = rbuf3_i[ct2]; /* same global column index of C */
2816: *mat_a++ = rbuf4_i[ct2];
2817: ilen++;
2818: }
2819: }
2820: imat_ilen[row] = ilen;
2821: }
2822: }
2823: }
2825: if (!iscsorted) { /* sort column indices of the rows */
2826: for (i=0; i<ismax; i++) {
2827: subc = (Mat_SeqAIJ*)submats[i]->data;
2828: imat_j = subc->j;
2829: imat_i = subc->i;
2830: imat_a = subc->a;
2831: imat_ilen = subc->ilen;
2833: if (allcolumns[i]) continue;
2834: jmax = nrow[i];
2835: for (j=0; j<jmax; j++) {
2836: mat_i = imat_i[j];
2837: mat_a = imat_a + mat_i;
2838: mat_j = imat_j + mat_i;
2839: PetscSortIntWithScalarArray(imat_ilen[j],mat_j,mat_a);
2840: }
2841: }
2842: }
2844: PetscFree(r_waits4);
2845: MPI_Waitall(nrqr,s_waits4,MPI_STATUSES_IGNORE);
2846: PetscFree(s_waits4);
2848: /* Restore the indices */
2849: for (i=0; i<ismax; i++) {
2850: ISRestoreIndices(isrow[i],irow+i);
2851: if (!allcolumns[i]) {
2852: ISRestoreIndices(iscol[i],icol+i);
2853: }
2854: }
2856: for (i=0; i<ismax; i++) {
2857: MatAssemblyBegin(submats[i],MAT_FINAL_ASSEMBLY);
2858: MatAssemblyEnd(submats[i],MAT_FINAL_ASSEMBLY);
2859: }
2861: /* Destroy allocated memory */
2862: if (sbuf_aa) {
2863: PetscFree(sbuf_aa[0]);
2864: PetscFree(sbuf_aa);
2865: }
2866: PetscFree5(*(PetscInt***)&irow,*(PetscInt***)&icol,nrow,ncol,issorted);
2868: for (i=0; i<nrqs; ++i) {
2869: PetscFree(rbuf4[i]);
2870: }
2871: PetscFree(rbuf4);
2873: PetscFree4(row2proc,cmap,rmap,allcolumns);
2874: return(0);
2875: }
2877: /*
2878: Permute A & B into C's *local* index space using rowemb,dcolemb for A and rowemb,ocolemb for B.
2879: Embeddings are assumed to be injections, and the above implies that the range of rowemb is a subset
2880: of [0,m), the range of dcolemb is in [0,n), and the range of ocolemb is in [0,N-n).
2881: If pattern == DIFFERENT_NONZERO_PATTERN, C is preallocated according to A&B.
2882: After that B's columns are mapped into C's global column space, so that C is in the "disassembled"
2883: state, and needs to be "assembled" later by compressing B's column space.
2885: This function may be called in lieu of preallocation, so C should not be expected to be preallocated.
2886: Following this call, C->A & C->B have been created, even if empty.
2887: */
2888: PetscErrorCode MatSetSeqMats_MPIAIJ(Mat C,IS rowemb,IS dcolemb,IS ocolemb,MatStructure pattern,Mat A,Mat B)
2889: {
2890: /* If making this function public, change the error returned in this function away from _PLIB. */
2892: Mat_MPIAIJ *aij;
2893: Mat_SeqAIJ *Baij;
2894: PetscBool seqaij,Bdisassembled;
2895: PetscInt m,n,*nz,i,j,ngcol,col,rstart,rend,shift,count;
2896: PetscScalar v;
2897: const PetscInt *rowindices,*colindices;
2900: /* Check to make sure the component matrices (and embeddings) are compatible with C. */
2901: if (A) {
2902: PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&seqaij);
2903: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diagonal matrix is of wrong type");
2904: if (rowemb) {
2905: ISGetLocalSize(rowemb,&m);
2906: if (m != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with diag matrix row size %D",m,A->rmap->n);
2907: } else {
2908: if (C->rmap->n != A->rmap->n) {
2909: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is row-incompatible with the MPIAIJ matrix");
2910: }
2911: }
2912: if (dcolemb) {
2913: ISGetLocalSize(dcolemb,&n);
2914: if (n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag col IS of size %D is incompatible with diag matrix col size %D",n,A->cmap->n);
2915: } else {
2916: if (C->cmap->n != A->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Diag seq matrix is col-incompatible with the MPIAIJ matrix");
2917: }
2918: }
2919: if (B) {
2920: PetscObjectBaseTypeCompare((PetscObject)B,MATSEQAIJ,&seqaij);
2921: if (!seqaij) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diagonal matrix is of wrong type");
2922: if (rowemb) {
2923: ISGetLocalSize(rowemb,&m);
2924: if (m != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Row IS of size %D is incompatible with off-diag matrix row size %D",m,B->rmap->n);
2925: } else {
2926: if (C->rmap->n != B->rmap->n) {
2927: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is row-incompatible with the MPIAIJ matrix");
2928: }
2929: }
2930: if (ocolemb) {
2931: ISGetLocalSize(ocolemb,&n);
2932: if (n != B->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag col IS of size %D is incompatible with off-diag matrix col size %D",n,B->cmap->n);
2933: } else {
2934: if (C->cmap->N - C->cmap->n != B->cmap->n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Off-diag seq matrix is col-incompatible with the MPIAIJ matrix");
2935: }
2936: }
2938: aij = (Mat_MPIAIJ*)C->data;
2939: if (!aij->A) {
2940: /* Mimic parts of MatMPIAIJSetPreallocation() */
2941: MatCreate(PETSC_COMM_SELF,&aij->A);
2942: MatSetSizes(aij->A,C->rmap->n,C->cmap->n,C->rmap->n,C->cmap->n);
2943: MatSetBlockSizesFromMats(aij->A,C,C);
2944: MatSetType(aij->A,MATSEQAIJ);
2945: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->A);
2946: }
2947: if (A) {
2948: MatSetSeqMat_SeqAIJ(aij->A,rowemb,dcolemb,pattern,A);
2949: } else {
2950: MatSetUp(aij->A);
2951: }
2952: if (B) { /* Destroy the old matrix or the column map, depending on the sparsity pattern. */
2953: /*
2954: If pattern == DIFFERENT_NONZERO_PATTERN, we reallocate B and
2955: need to "disassemble" B -- convert it to using C's global indices.
2956: To insert the values we take the safer, albeit more expensive, route of MatSetValues().
2958: If pattern == SUBSET_NONZERO_PATTERN, we do not "disassemble" B and do not reallocate;
2959: we MatZeroEntries(aij->B) first, so there may be a bunch of zeros that, perhaps, could be compacted out.
2961: TODO: Put B's values into aij->B's aij structure in place using the embedding ISs?
2962: At least avoid calling MatSetValues() and the implied searches?
2963: */
2965: if (B && pattern == DIFFERENT_NONZERO_PATTERN) {
2966: #if defined(PETSC_USE_CTABLE)
2967: PetscTableDestroy(&aij->colmap);
2968: #else
2969: PetscFree(aij->colmap);
2970: /* A bit of a HACK: ideally we should deal with the aij->B case all in one code block below. */
2971: if (aij->B) {
2972: PetscLogObjectMemory((PetscObject)C,-aij->B->cmap->n*sizeof(PetscInt));
2973: }
2974: #endif
2975: ngcol = 0;
2976: if (aij->lvec) {
2977: VecGetSize(aij->lvec,&ngcol);
2978: }
2979: if (aij->garray) {
2980: PetscFree(aij->garray);
2981: PetscLogObjectMemory((PetscObject)C,-ngcol*sizeof(PetscInt));
2982: }
2983: VecDestroy(&aij->lvec);
2984: VecScatterDestroy(&aij->Mvctx);
2985: }
2986: if (aij->B && B && pattern == DIFFERENT_NONZERO_PATTERN) {
2987: MatDestroy(&aij->B);
2988: }
2989: if (aij->B && B && pattern == SUBSET_NONZERO_PATTERN) {
2990: MatZeroEntries(aij->B);
2991: }
2992: }
2993: Bdisassembled = PETSC_FALSE;
2994: if (!aij->B) {
2995: MatCreate(PETSC_COMM_SELF,&aij->B);
2996: PetscLogObjectParent((PetscObject)C,(PetscObject)aij->B);
2997: MatSetSizes(aij->B,C->rmap->n,C->cmap->N,C->rmap->n,C->cmap->N);
2998: MatSetBlockSizesFromMats(aij->B,B,B);
2999: MatSetType(aij->B,MATSEQAIJ);
3000: Bdisassembled = PETSC_TRUE;
3001: }
3002: if (B) {
3003: Baij = (Mat_SeqAIJ*)B->data;
3004: if (pattern == DIFFERENT_NONZERO_PATTERN) {
3005: PetscMalloc1(B->rmap->n,&nz);
3006: for (i=0; i<B->rmap->n; i++) {
3007: nz[i] = Baij->i[i+1] - Baij->i[i];
3008: }
3009: MatSeqAIJSetPreallocation(aij->B,0,nz);
3010: PetscFree(nz);
3011: }
3013: PetscLayoutGetRange(C->rmap,&rstart,&rend);
3014: shift = rend-rstart;
3015: count = 0;
3016: rowindices = NULL;
3017: colindices = NULL;
3018: if (rowemb) {
3019: ISGetIndices(rowemb,&rowindices);
3020: }
3021: if (ocolemb) {
3022: ISGetIndices(ocolemb,&colindices);
3023: }
3024: for (i=0; i<B->rmap->n; i++) {
3025: PetscInt row;
3026: row = i;
3027: if (rowindices) row = rowindices[i];
3028: for (j=Baij->i[i]; j<Baij->i[i+1]; j++) {
3029: col = Baij->j[count];
3030: if (colindices) col = colindices[col];
3031: if (Bdisassembled && col>=rstart) col += shift;
3032: v = Baij->a[count];
3033: MatSetValues(aij->B,1,&row,1,&col,&v,INSERT_VALUES);
3034: ++count;
3035: }
3036: }
3037: /* No assembly for aij->B is necessary. */
3038: /* FIXME: set aij->B's nonzerostate correctly. */
3039: } else {
3040: MatSetUp(aij->B);
3041: }
3042: C->preallocated = PETSC_TRUE;
3043: C->was_assembled = PETSC_FALSE;
3044: C->assembled = PETSC_FALSE;
3045: /*
3046: C will need to be assembled so that aij->B can be compressed into local form in MatSetUpMultiply_MPIAIJ().
3047: Furthermore, its nonzerostate will need to be based on that of aij->A's and aij->B's.
3048: */
3049: return(0);
3050: }
3052: /*
3053: B uses local column indices in the range [0, N-n); they must be interpreted using garray.
3054: */
3055: PetscErrorCode MatGetSeqMats_MPIAIJ(Mat C,Mat *A,Mat *B)
3056: {
3057: Mat_MPIAIJ *aij = (Mat_MPIAIJ*)C->data;
3062: /* FIXME: make sure C is assembled */
3063: *A = aij->A;
3064: *B = aij->B;
3065: /* Note that we don't incref *A and *B, so be careful! */
3066: return(0);
3067: }
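/* Small illustration (hypothetical data): with garray = {3,7,12}, local column 1 of the returned B
   holds the entries of global column 7 of C. At the public level the same pair of sequential
   matrices, together with garray, can be obtained with MatMPIAIJGetSeqAIJ(). */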
3069: /*
3070: Extract MPI submatrices encoded by pairs of IS that may live on subcomms of C.
3071: NOT SCALABLE due to the use of ISGetNonlocalIS() (see below).
3072: */
3073: PetscErrorCode MatCreateSubMatricesMPI_MPIXAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[],
3074: PetscErrorCode(*getsubmats_seq)(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat**),
3075: PetscErrorCode(*getlocalmats)(Mat,Mat*,Mat*),
3076: PetscErrorCode(*setseqmat)(Mat,IS,IS,MatStructure,Mat),
3077: PetscErrorCode(*setseqmats)(Mat,IS,IS,IS,MatStructure,Mat,Mat))
3078: {
3080: PetscMPIInt size,flag;
3081: PetscInt i,ii,cismax,ispar;
3082: Mat *A,*B;
3083: IS *isrow_p,*iscol_p,*cisrow,*ciscol,*ciscol_p;
3086: if (!ismax) return(0);
3088: for (i = 0, cismax = 0; i < ismax; ++i) {
3089: MPI_Comm_compare(((PetscObject)isrow[i])->comm,((PetscObject)iscol[i])->comm,&flag);
3090: if (flag != MPI_IDENT) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Row and column index sets must have the same communicator");
3091: MPI_Comm_size(((PetscObject)isrow[i])->comm, &size);
3092: if (size > 1) ++cismax;
3093: }
3095: /*
3096: If cismax is zero on all C's ranks, then and only then can we use purely sequential matrix extraction.
3097: ispar counts the number of parallel ISs across C's comm.
3098: */
3099: MPIU_Allreduce(&cismax,&ispar,1,MPIU_INT,MPI_MAX,PetscObjectComm((PetscObject)C));
3100: if (!ispar) { /* Sequential ISs only across C's comm, so can call the sequential matrix extraction subroutine. */
3101: (*getsubmats_seq)(C,ismax,isrow,iscol,scall,submat);
3102: return(0);
3103: }
3105: /* if (ispar) */
3106: /*
3107: Construct the "complements" -- the off-processor indices -- of the iscol ISs for parallel ISs only.
3108: These are used to extract the off-diag portion of the resulting parallel matrix.
3109: The row IS for the off-diag portion is the same as for the diag portion,
3110: so we merely alias (without increfing) the row IS, while skipping those that are sequential.
3111: */
3112: PetscMalloc2(cismax,&cisrow,cismax,&ciscol);
3113: PetscMalloc1(cismax,&ciscol_p);
3114: for (i = 0, ii = 0; i < ismax; ++i) {
3115: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3116: if (size > 1) {
3117: /*
3118: TODO: This is the part that's ***NOT SCALABLE***.
3119: To fix this we need to extract just the indices of C's nonzero columns
3120: that lie on the intersection of isrow[i] and ciscol[ii] -- the nonlocal
3121: part of iscol[i] -- without actually computing ciscol[ii]. This also has
3122: to be done without serializing on the IS list, so, most likely, it is best
3123: done by rewriting MatCreateSubMatrices_MPIAIJ() directly.
3124: */
3125: ISGetNonlocalIS(iscol[i],&(ciscol[ii]));
3126: /* Now we have to
3127: (a) make sure ciscol[ii] is sorted, since, even if the off-proc indices
3128: were sorted on each rank, concatenated they might no longer be sorted;
3129: (b) Use ISSortPermutation() to construct ciscol_p, the mapping from the
3130: indices in the nondecreasing order to the original index positions.
3131: If ciscol[ii] is strictly increasing, the permutation IS is NULL.
3132: */
3133: ISSortPermutation(ciscol[ii],PETSC_FALSE,ciscol_p+ii);
3134: ISSort(ciscol[ii]);
3135: ++ii;
3136: }
3137: }
3138: PetscMalloc2(ismax,&isrow_p,ismax,&iscol_p);
3139: for (i = 0, ii = 0; i < ismax; ++i) {
3140: PetscInt j,issize;
3141: const PetscInt *indices;
3143: /*
3144: Permute the indices into a nondecreasing order. Reject row and col indices with duplicates.
3145: */
3146: ISSortPermutation(isrow[i],PETSC_FALSE,isrow_p+i);
3147: ISSort(isrow[i]);
3148: ISGetLocalSize(isrow[i],&issize);
3149: ISGetIndices(isrow[i],&indices);
3150: for (j = 1; j < issize; ++j) {
3151: if (indices[j] == indices[j-1]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in row IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3152: }
3153: ISRestoreIndices(isrow[i],&indices);
3154: ISSortPermutation(iscol[i],PETSC_FALSE,iscol_p+i);
3155: ISSort(iscol[i]);
3156: ISGetLocalSize(iscol[i],&issize);
3157: ISGetIndices(iscol[i],&indices);
3158: for (j = 1; j < issize; ++j) {
3159: if (indices[j-1] == indices[j]) SETERRQ4(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Repeated indices in col IS %D: indices at %D and %D are both %D",i,j-1,j,indices[j]);
3160: }
3161: ISRestoreIndices(iscol[i],&indices);
3162: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3163: if (size > 1) {
3164: cisrow[ii] = isrow[i];
3165: ++ii;
3166: }
3167: }
3168: /*
3169: Allocate the necessary arrays to hold the resulting parallel matrices as well as the intermediate
3170: array of sequential matrices underlying the resulting parallel matrices.
3171: Which arrays to allocate is based on the value of MatReuse scall and whether ISs are sorted and/or
3172: contain duplicates.
3174: There are as many diag matrices as there are original index sets. There are only as many parallel
3175: and off-diag matrices as there are parallel (comm size > 1) index sets.
3177: ARRAYS that can hold Seq matrices get allocated in any event -- either here or by getsubmats_seq():
3178: - If the array of MPI matrices already exists and is being reused, we need to allocate the array
3179: and extract the underlying seq matrices into it to serve as placeholders, into which getsubmats_seq
3180: will deposit the extracted diag and off-diag parts. Thus, we allocate the A&B arrays and fill them
3181: with A[i] and B[ii] extracted from the corresponding MPI submat.
3182: - However, if the rows, A's column indices or B's column indices are not sorted, the extracted A[i] & B[ii]
3183: will have a different order from what getsubmats_seq expects. To handle this case -- indicated
3184: by a nonzero isrow_p[i], iscol_p[i], or ciscol_p[ii] -- we duplicate A[i] --> AA[i], B[ii] --> BB[ii]
3185: (retrieve composed AA[i] or BB[ii]) and reuse them here. AA[i] and BB[ii] are then used to permute their
3186: values into A[i] and B[ii] sitting inside the corresponding submat.
3187: - If no reuse is taking place then getsubmats_seq will allocate the A&B arrays and create the corresponding
3188: A[i], B[ii], AA[i] or BB[ii] matrices.
3189: */
3190: /* Parallel matrix array is allocated here only if no reuse is taking place. If reused, it is passed in by the caller. */
3191: if (scall == MAT_INITIAL_MATRIX) {
3192: PetscMalloc1(ismax,submat);
3193: }
3195: /* Now obtain the sequential A and B submatrices separately. */
3196: /* scall=MAT_REUSE_MATRIX is not handled yet, because getsubmats_seq() requires reuse of A and B */
3197: (*getsubmats_seq)(C,ismax,isrow,iscol,MAT_INITIAL_MATRIX,&A);
3198: (*getsubmats_seq)(C,cismax,cisrow,ciscol,MAT_INITIAL_MATRIX,&B);
3200: /*
3201: If scall == MAT_REUSE_MATRIX AND the permutations are NULL, we are done, since the sequential
3202: matrices A & B have been extracted directly into the parallel matrices containing them, or
3203: simply into the sequential matrix identical with the corresponding A (if size == 1).
3204: Note that in that case colmap doesn't need to be rebuilt, since the matrices are expected
3205: to have the same sparsity pattern.
3206: Otherwise, A and/or B have to be properly embedded into C's index spaces and the correct colmap
3207: must be constructed for C. This is done by setseqmat(s).
3208: */
3209: for (i = 0, ii = 0; i < ismax; ++i) {
3210: /*
3211: TODO: cache ciscol, permutation ISs and maybe cisrow? What about isrow & iscol?
3212: That way we can avoid sorting and computing permutations when reusing.
3213: To this end:
3214: - remove the old cache, if it exists, when extracting submatrices with MAT_INITIAL_MATRIX
3215: - if caching arrays to hold the ISs, make and compose a container for them so that it can
3216: be destroyed upon destruction of C (use PetscContainerUserDestroy() to clear out the contents).
3217: */
3218: MatStructure pattern = DIFFERENT_NONZERO_PATTERN;
3220: MPI_Comm_size(((PetscObject)isrow[i])->comm,&size);
3221: /* Construct submat[i] from the Seq pieces A (and B, if necessary). */
3222: if (size > 1) {
3223: if (scall == MAT_INITIAL_MATRIX) {
3224: MatCreate(((PetscObject)isrow[i])->comm,(*submat)+i);
3225: MatSetSizes((*submat)[i],A[i]->rmap->n,A[i]->cmap->n,PETSC_DETERMINE,PETSC_DETERMINE);
3226: MatSetType((*submat)[i],MATMPIAIJ);
3227: PetscLayoutSetUp((*submat)[i]->rmap);
3228: PetscLayoutSetUp((*submat)[i]->cmap);
3229: }
3230: /*
3231: For each parallel isrow[i], insert the extracted sequential matrices into the parallel matrix.
3232: */
3233: {
3234: Mat AA = A[i],BB = B[ii];
3236: if (AA || BB) {
3237: setseqmats((*submat)[i],isrow_p[i],iscol_p[i],ciscol_p[ii],pattern,AA,BB);
3238: MatAssemblyBegin((*submat)[i],MAT_FINAL_ASSEMBLY);
3239: MatAssemblyEnd((*submat)[i],MAT_FINAL_ASSEMBLY);
3240: }
3241: MatDestroy(&AA);
3242: }
3243: ISDestroy(ciscol+ii);
3244: ISDestroy(ciscol_p+ii);
3245: ++ii;
3246: } else { /* if (size == 1) */
3247: if (scall == MAT_REUSE_MATRIX) {
3248: MatDestroy(&(*submat)[i]);
3249: }
3250: if (isrow_p[i] || iscol_p[i]) {
3251: MatDuplicate(A[i],MAT_DO_NOT_COPY_VALUES,(*submat)+i);
3252: setseqmat((*submat)[i],isrow_p[i],iscol_p[i],pattern,A[i]);
3253: /* Otherwise A is extracted straight into (*submat)[i]. */
3254: /* TODO: Compose A[i] on (*submat)[i] for future use, if ((isrow_p[i] || iscol_p[i]) && MAT_INITIAL_MATRIX). */
3255: MatDestroy(A+i);
3256: } else (*submat)[i] = A[i];
3257: }
3258: ISDestroy(&isrow_p[i]);
3259: ISDestroy(&iscol_p[i]);
3260: }
3261: PetscFree2(cisrow,ciscol);
3262: PetscFree2(isrow_p,iscol_p);
3263: PetscFree(ciscol_p);
3264: PetscFree(A);
3265: MatDestroySubMatrices(cismax,&B);
3266: return(0);
3267: }
3269: PetscErrorCode MatCreateSubMatricesMPI_MPIAIJ(Mat C,PetscInt ismax,const IS isrow[],const IS iscol[],MatReuse scall,Mat *submat[])
3270: {
3274: MatCreateSubMatricesMPI_MPIXAIJ(C,ismax,isrow,iscol,scall,submat,MatCreateSubMatrices_MPIAIJ,MatGetSeqMats_MPIAIJ,MatSetSeqMat_SeqAIJ,MatSetSeqMats_MPIAIJ);
3275: return(0);
3276: }
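/*
   Usage sketch for the public entry point served by the routine above (illustrative only; the index
   sets and their subcommunicator are hypothetical):

     IS  isrow,iscol;    created on a subcommunicator of C's communicator, e.g. by PCGASM
     Mat *submats;

     MatCreateSubMatricesMPI(C,1,&isrow,&iscol,MAT_INITIAL_MATRIX,&submats);
     ...  submats[0] is a parallel (or sequential, if the subcommunicator has one rank) matrix
          living on the communicator of isrow/iscol  ...
     MatDestroySubMatrices(1,&submats);
*/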