Actual source code: math2opusutils.cu

  1: #include <petsc/private/matimpl.h>
  2: #include <petsc/private/vecimpl.h>
  3: #include <petscsf.h>
  4: #if defined(PETSC_HAVE_CUDA)
  5: #include <thrust/for_each.h>
  6: #include <thrust/device_vector.h>
  7: #include <thrust/execution_policy.h>
  8: #endif

 10: PETSC_INTERN PetscErrorCode PetscSFGetVectorSF(PetscSF sf, PetscInt nv, PetscInt ldr, PetscInt ldl, PetscSF *vsf)
 11: {
 12:   PetscSF           rankssf;
 13:   const PetscSFNode *iremote;
 14:   PetscSFNode       *viremote,*rremotes;
 15:   const PetscInt    *ilocal;
 16:   PetscInt          *vilocal = NULL,*ldrs;
 17:   const PetscMPIInt *ranks;
 18:   PetscMPIInt       *sranks;
 19:   PetscInt          nranks,nr,nl,vnr,vnl,i,v,j,maxl;
 20:   MPI_Comm          comm;

 25:   if (nv == 1) {
 26:     PetscObjectReference((PetscObject)sf);
 27:     *vsf = sf;
 28:     return 0;
 29:   }
 30:   PetscObjectGetComm((PetscObject)sf,&comm);
 31:   PetscSFGetGraph(sf,&nr,&nl,&ilocal,&iremote);
 32:   PetscSFGetLeafRange(sf,NULL,&maxl);
 33:   maxl += 1;
 34:   if (ldl == PETSC_DECIDE) ldl = maxl;
 35:   if (ldr == PETSC_DECIDE) ldr = nr;
 38:   vnr  = nr*nv;
 39:   vnl  = nl*nv;
 40:   PetscMalloc1(vnl,&viremote);
 41:   if (ilocal) PetscMalloc1(vnl,&vilocal);

 43:   /* TODO: Should this special SF be available, e.g.
 44:      PetscSFGetRanksSF or similar? */
 45:   PetscSFGetRootRanks(sf,&nranks,&ranks,NULL,NULL,NULL);
 46:   PetscMalloc1(nranks,&sranks);
 47:   PetscArraycpy(sranks,ranks,nranks);
 48:   PetscSortMPIInt(nranks,sranks);
 49:   PetscMalloc1(nranks,&rremotes);
 50:   for (i=0;i<nranks;i++) {
 51:     rremotes[i].rank  = sranks[i];
 52:     rremotes[i].index = 0;
 53:   }
 54:   PetscSFDuplicate(sf,PETSCSF_DUPLICATE_CONFONLY,&rankssf);
 55:   PetscSFSetGraph(rankssf,1,nranks,NULL,PETSC_OWN_POINTER,rremotes,PETSC_OWN_POINTER);
 56:   PetscMalloc1(nranks,&ldrs);
 57:   PetscSFBcastBegin(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE);
 58:   PetscSFBcastEnd(rankssf,MPIU_INT,&ldr,ldrs,MPI_REPLACE);
 59:   PetscSFDestroy(&rankssf);

 61:   j = -1;
 62:   for (i=0;i<nl;i++) {
 63:     const PetscInt r  = iremote[i].rank;
 64:     const PetscInt ii = iremote[i].index;

 66:     if (j < 0 || sranks[j] != r) {
 67:       PetscFindMPIInt(r,nranks,sranks,&j);
 68:     }
 70:     for (v=0;v<nv;v++) {
 71:       viremote[v*nl + i].rank  = r;
 72:       viremote[v*nl + i].index = v*ldrs[j] + ii;
 73:       if (ilocal) vilocal[v*nl + i] = v*ldl + ilocal[i];
 74:     }
 75:   }
 76:   PetscFree(sranks);
 77:   PetscFree(ldrs);
 78:   PetscSFCreate(comm,vsf);
 79:   PetscSFSetGraph(*vsf,vnr,vnl,vilocal,PETSC_OWN_POINTER,viremote,PETSC_OWN_POINTER);
 80:   return 0;
 81: }

 83: PETSC_INTERN PetscErrorCode MatDenseGetH2OpusVectorSF(Mat A, PetscSF h2sf, PetscSF *osf)
 84: {
 85:   PetscSF asf;

 90:   PetscObjectQuery((PetscObject)A,"_math2opus_vectorsf",(PetscObject*)&asf);
 91:   if (!asf) {
 92:     PetscInt lda;

 94:     MatDenseGetLDA(A,&lda);
 95:     PetscSFGetVectorSF(h2sf,A->cmap->N,lda,PETSC_DECIDE,&asf);
 96:     PetscObjectCompose((PetscObject)A,"_math2opus_vectorsf",(PetscObject)asf);
 97:     PetscObjectDereference((PetscObject)asf);
 98:   }
 99:   *osf = asf;
100:   return 0;
101: }

103: #if defined(PETSC_HAVE_CUDA)
104: struct SignVector_Functor
105: {
106:     const PetscScalar *v;
107:     PetscScalar *s;
108:     SignVector_Functor(const PetscScalar *_v, PetscScalar *_s) : v(_v), s(_s) {}

110:     __host__ __device__ void operator()(PetscInt i)
111:     {
112:         s[i] = (v[i] < 0 ? -1 : 1);
113:     }
114: };
115: #endif

117: PETSC_INTERN PetscErrorCode VecSign(Vec v, Vec s)
118: {
119:   const PetscScalar *av;
120:   PetscScalar       *as;
121:   PetscInt          i,n;
122: #if defined(PETSC_HAVE_CUDA)
123:   PetscBool         viscuda,siscuda;
124: #endif

128:   VecGetLocalSize(s,&n);
129:   VecGetLocalSize(v,&i);
131: #if defined(PETSC_HAVE_CUDA)
132:   PetscObjectTypeCompareAny((PetscObject)v,&viscuda,VECSEQCUDA,VECMPICUDA,"");
133:   PetscObjectTypeCompareAny((PetscObject)s,&siscuda,VECSEQCUDA,VECMPICUDA,"");
134:   viscuda = (PetscBool)(viscuda && !v->boundtocpu);
135:   siscuda = (PetscBool)(siscuda && !s->boundtocpu);
136:   if (viscuda && siscuda) {
137:     VecCUDAGetArrayRead(v,&av);
138:     VecCUDAGetArrayWrite(s,&as);
139:     SignVector_Functor sign_vector(av, as);
140:     thrust::for_each(thrust::device,thrust::counting_iterator<PetscInt>(0),
141:                      thrust::counting_iterator<PetscInt>(n), sign_vector);
142:     VecCUDARestoreArrayWrite(s,&as);
143:     VecCUDARestoreArrayRead(v,&av);
144:   } else
145: #endif
146:   {
147:     VecGetArrayRead(v,&av);
148:     VecGetArrayWrite(s,&as);
149:     for (i=0;i<n;i++) as[i] = PetscAbsScalar(av[i]) < 0 ? -1. : 1.;
150:     VecRestoreArrayWrite(s,&as);
151:     VecRestoreArrayRead(v,&av);
152:   }
153:   return 0;
154: }

156: #if defined(PETSC_HAVE_CUDA)
157: struct StandardBasis_Functor
158: {
159:     PetscScalar *v;
160:     PetscInt j;

162:     StandardBasis_Functor(PetscScalar *_v, PetscInt _j) : v(_v), j(_j) {}
163:     __host__ __device__ void operator()(PetscInt i)
164:     {
165:         v[i] = (i == j ? 1 : 0);
166:     }
167: };
168: #endif

170: PETSC_INTERN PetscErrorCode VecSetDelta(Vec x, PetscInt i)
171: {
172: #if defined(PETSC_HAVE_CUDA)
173:   PetscBool iscuda;
174: #endif
175:   PetscInt  st,en;

177:   VecGetOwnershipRange(x,&st,&en);
178: #if defined(PETSC_HAVE_CUDA)
179:   PetscObjectTypeCompareAny((PetscObject)x,&iscuda,VECSEQCUDA,VECMPICUDA,"");
180:   iscuda = (PetscBool)(iscuda && !x->boundtocpu);
181:   if (iscuda) {
182:     PetscScalar *ax;
183:     VecCUDAGetArrayWrite(x,&ax);
184:     StandardBasis_Functor delta(ax, i-st);
185:     thrust::for_each(thrust::device,thrust::counting_iterator<PetscInt>(0),
186:                      thrust::counting_iterator<PetscInt>(en-st), delta);
187:     VecCUDARestoreArrayWrite(x,&ax);
188:   } else
189: #endif
190:   {
191:     VecSet(x,0.);
192:     if (st <= i && i < en) {
193:       VecSetValue(x,i,1.0,INSERT_VALUES);
194:     }
195:     VecAssemblyBegin(x);
196:     VecAssemblyEnd(x);
197:   }
198:   return 0;
199: }

201: /* these are approximate norms */
202: /* NORM_2: Estimating the matrix p-norm Nicholas J. Higham
203:    NORM_1/NORM_INFINITY: A block algorithm for matrix 1-norm estimation, with an application to 1-norm pseudospectra Higham, Nicholas J. and Tisseur, Francoise */
204: PETSC_INTERN PetscErrorCode MatApproximateNorm_Private(Mat A, NormType normtype, PetscInt normsamples, PetscReal* n)
205: {
206:   Vec         x,y,w,z;
207:   PetscReal   normz,adot;
208:   PetscScalar dot;
209:   PetscInt    i,j,N,jold = -1;
210:   PetscBool   boundtocpu = PETSC_TRUE;

212: #if defined(PETSC_HAVE_DEVICE)
213:   boundtocpu = A->boundtocpu;
214: #endif
215:   switch (normtype) {
216:   case NORM_INFINITY:
217:   case NORM_1:
218:     if (normsamples < 0) normsamples = 10; /* pure guess */
219:     if (normtype == NORM_INFINITY) {
220:       Mat B;
221:       MatCreateTranspose(A,&B);
222:       A = B;
223:     } else {
224:       PetscObjectReference((PetscObject)A);
225:     }
226:     MatCreateVecs(A,&x,&y);
227:     MatCreateVecs(A,&z,&w);
228:     VecBindToCPU(x,boundtocpu);
229:     VecBindToCPU(y,boundtocpu);
230:     VecBindToCPU(z,boundtocpu);
231:     VecBindToCPU(w,boundtocpu);
232:     VecGetSize(x,&N);
233:     VecSet(x,1./N);
234:     *n   = 0.0;
235:     for (i = 0; i < normsamples; i++) {
236:       MatMult(A,x,y);
237:       VecSign(y,w);
238:       MatMultTranspose(A,w,z);
239:       VecNorm(z,NORM_INFINITY,&normz);
240:       VecDot(x,z,&dot);
241:       adot = PetscAbsScalar(dot);
242:       PetscInfo(A,"%s norm it %" PetscInt_FMT " -> (%g %g)\n",NormTypes[normtype],i,(double)normz,(double)adot);
243:       if (normz <= adot && i > 0) {
244:         VecNorm(y,NORM_1,n);
245:         break;
246:       }
247:       VecMax(z,&j,&normz);
248:       if (j == jold) {
249:         VecNorm(y,NORM_1,n);
250:         PetscInfo(A,"%s norm it %" PetscInt_FMT " -> breakdown (j==jold)\n",NormTypes[normtype],i);
251:         break;
252:       }
253:       jold = j;
254:       VecSetDelta(x,j);
255:     }
256:     MatDestroy(&A);
257:     VecDestroy(&x);
258:     VecDestroy(&w);
259:     VecDestroy(&y);
260:     VecDestroy(&z);
261:     break;
262:   case NORM_2:
263:     if (normsamples < 0) normsamples = 20; /* pure guess */
264:     MatCreateVecs(A,&x,&y);
265:     MatCreateVecs(A,&z,NULL);
266:     VecBindToCPU(x,boundtocpu);
267:     VecBindToCPU(y,boundtocpu);
268:     VecBindToCPU(z,boundtocpu);
269:     VecSetRandom(x,NULL);
270:     VecNormalize(x,NULL);
271:     *n   = 0.0;
272:     for (i = 0; i < normsamples; i++) {
273:       MatMult(A,x,y);
274:       VecNormalize(y,n);
275:       MatMultTranspose(A,y,z);
276:       VecNorm(z,NORM_2,&normz);
277:       VecDot(x,z,&dot);
278:       adot = PetscAbsScalar(dot);
279:       PetscInfo(A,"%s norm it %" PetscInt_FMT " -> %g (%g %g)\n",NormTypes[normtype],i,(double)*n,(double)normz,(double)adot);
280:       if (normz <= adot) break;
281:       if (i < normsamples - 1) {
282:         Vec t;

284:         VecNormalize(z,NULL);
285:         t = x;
286:         x = z;
287:         z = t;
288:       }
289:     }
290:     VecDestroy(&x);
291:     VecDestroy(&y);
292:     VecDestroy(&z);
293:     break;
294:   default:
295:     SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"%s norm not supported",NormTypes[normtype]);
296:   }
297:   PetscInfo(A,"%s norm %g computed in %" PetscInt_FMT " iterations\n",NormTypes[normtype],(double)*n,i);
298:   return 0;
299: }