Actual source code: veccuda.c
1: /*
2: Implementation of the sequential cuda vectors.
4: This file contains the code that can be compiled with a C
5: compiler. The companion file veccuda2.cu contains the code that
6: must be compiled with nvcc or a C++ compiler.
7: */
9: #define PETSC_SKIP_SPINLOCK
11: #include <petscconf.h>
12: #include <petsc/private/vecimpl.h>
13: #include <../src/vec/vec/impls/dvecimpl.h>
14: #include <petsc/private/cudavecimpl.h>
16: PetscErrorCode VecCUDAGetArrays_Private(Vec v,const PetscScalar** x,const PetscScalar** x_d,PetscOffloadMask* flg)
17: {
19: if (x) {
20: Vec_Seq *h = (Vec_Seq*)v->data;
22: *x = h->array;
23: }
24: if (x_d) {
25: Vec_CUDA *d = (Vec_CUDA*)v->spptr;
27: *x_d = d ? d->GPUarray : NULL;
28: }
29: if (flg) *flg = v->offloadmask;
30: return 0;
31: }
33: /*
34: Allocates space for the vector array on the Host if it does not exist.
35: Does NOT change the PetscCUDAFlag for the vector
36: Does NOT zero the CUDA array
37: */
38: PetscErrorCode VecCUDAAllocateCheckHost(Vec v)
39: {
40: PetscScalar *array;
41: Vec_Seq *s = (Vec_Seq*)v->data;
42: PetscInt n = v->map->n;
44: if (!s) {
45: PetscNewLog((PetscObject)v,&s);
46: v->data = s;
47: }
48: if (!s->array) {
49: if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
50: PetscMallocSetCUDAHost();
51: v->pinned_memory = PETSC_TRUE;
52: }
53: PetscMalloc1(n,&array);
54: PetscLogObjectMemory((PetscObject)v,n*sizeof(PetscScalar));
55: s->array = array;
56: s->array_allocated = array;
57: if (n*sizeof(PetscScalar) > v->minimum_bytes_pinned_memory) {
58: PetscMallocResetCUDAHost();
59: }
60: if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
61: v->offloadmask = PETSC_OFFLOAD_CPU;
62: }
63: }
64: return 0;
65: }
67: PetscErrorCode VecCopy_SeqCUDA_Private(Vec xin,Vec yin)
68: {
69: PetscScalar *ya;
70: const PetscScalar *xa;
72: VecCUDAAllocateCheckHost(xin);
73: VecCUDAAllocateCheckHost(yin);
74: if (xin != yin) {
75: VecGetArrayRead(xin,&xa);
76: VecGetArray(yin,&ya);
77: PetscArraycpy(ya,xa,xin->map->n);
78: VecRestoreArrayRead(xin,&xa);
79: VecRestoreArray(yin,&ya);
80: }
81: return 0;
82: }
84: PetscErrorCode VecSetRandom_SeqCUDA(Vec xin,PetscRandom r)
85: {
86: PetscInt n = xin->map->n;
87: PetscBool iscurand;
88: PetscScalar *xx;
90: PetscObjectTypeCompare((PetscObject)r,PETSCCURAND,&iscurand);
91: if (iscurand) {
92: VecCUDAGetArrayWrite(xin,&xx);
93: } else {
94: VecGetArrayWrite(xin,&xx);
95: }
96: PetscRandomGetValues(r,n,xx);
97: if (iscurand) {
98: VecCUDARestoreArrayWrite(xin,&xx);
99: } else {
100: VecRestoreArrayWrite(xin,&xx);
101: }
102: return 0;
103: }
105: PetscErrorCode VecDestroy_SeqCUDA_Private(Vec v)
106: {
107: Vec_Seq *vs = (Vec_Seq*)v->data;
109: PetscObjectSAWsViewOff(v);
110: #if defined(PETSC_USE_LOG)
111: PetscLogObjectState((PetscObject)v,"Length=%" PetscInt_FMT,v->map->n);
112: #endif
113: if (vs) {
114: if (vs->array_allocated) {
115: if (v->pinned_memory) {
116: PetscMallocSetCUDAHost();
117: }
118: PetscFree(vs->array_allocated);
119: if (v->pinned_memory) {
120: PetscMallocResetCUDAHost();
121: v->pinned_memory = PETSC_FALSE;
122: }
123: }
124: PetscFree(vs);
125: }
126: return 0;
127: }
129: PetscErrorCode VecResetArray_SeqCUDA_Private(Vec vin)
130: {
131: Vec_Seq *v = (Vec_Seq*)vin->data;
133: v->array = v->unplacedarray;
134: v->unplacedarray = 0;
135: return 0;
136: }
138: PetscErrorCode VecResetArray_SeqCUDA(Vec vin)
139: {
140: VecCUDACopyFromGPU(vin);
141: VecResetArray_SeqCUDA_Private(vin);
142: vin->offloadmask = PETSC_OFFLOAD_CPU;
143: return 0;
144: }
146: PetscErrorCode VecPlaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
147: {
148: VecCUDACopyFromGPU(vin);
149: VecPlaceArray_Seq(vin,a);
150: vin->offloadmask = PETSC_OFFLOAD_CPU;
151: return 0;
152: }
154: PetscErrorCode VecReplaceArray_SeqCUDA(Vec vin,const PetscScalar *a)
155: {
156: Vec_Seq *vs = (Vec_Seq*)vin->data;
158: if (vs->array != vs->array_allocated) {
159: /* make sure the users array has the latest values */
160: VecCUDACopyFromGPU(vin);
161: }
162: if (vs->array_allocated) {
163: if (vin->pinned_memory) {
164: PetscMallocSetCUDAHost();
165: }
166: PetscFree(vs->array_allocated);
167: if (vin->pinned_memory) {
168: PetscMallocResetCUDAHost();
169: }
170: }
171: vin->pinned_memory = PETSC_FALSE;
172: vs->array_allocated = vs->array = (PetscScalar*)a;
173: vin->offloadmask = PETSC_OFFLOAD_CPU;
174: return 0;
175: }
177: /*@
178: VecCreateSeqCUDA - Creates a standard, sequential array-style vector.
180: Collective
182: Input Parameter:
183: + comm - the communicator, should be PETSC_COMM_SELF
184: - n - the vector length
186: Output Parameter:
187: . v - the vector
189: Notes:
190: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
191: same type as an existing vector.
193: Level: intermediate
195: .seealso: VecCreateMPICUDA(), VecCreateMPI(), VecCreate(), VecDuplicate(), VecDuplicateVecs(), VecCreateGhost()
196: @*/
197: PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec *v)
198: {
199: VecCreate(comm,v);
200: VecSetSizes(*v,n,n);
201: VecSetType(*v,VECSEQCUDA);
202: return 0;
203: }
205: PetscErrorCode VecDuplicate_SeqCUDA(Vec win,Vec *V)
206: {
207: VecCreateSeqCUDA(PetscObjectComm((PetscObject)win),win->map->n,V);
208: PetscLayoutReference(win->map,&(*V)->map);
209: PetscObjectListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
210: PetscFunctionListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
211: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
212: return 0;
213: }
215: PetscErrorCode VecCreate_SeqCUDA(Vec V)
216: {
217: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
218: PetscLayoutSetUp(V->map);
219: VecCUDAAllocateCheck(V);
220: VecCreate_SeqCUDA_Private(V,((Vec_CUDA*)V->spptr)->GPUarray_allocated);
221: VecSet_SeqCUDA(V,0.0);
222: return 0;
223: }
225: /*@C
226: VecCreateSeqCUDAWithArray - Creates a CUDA sequential array-style vector,
227: where the user provides the array space to store the vector values. The array
228: provided must be a GPU array.
230: Collective
232: Input Parameters:
233: + comm - the communicator, should be PETSC_COMM_SELF
234: . bs - the block size
235: . n - the vector length
236: - array - GPU memory where the vector elements are to be stored.
238: Output Parameter:
239: . V - the vector
241: Notes:
242: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
243: same type as an existing vector.
245: If the user-provided array is NULL, then VecCUDAPlaceArray() can be used
246: at a later stage to SET the array for storing the vector values.
248: PETSc does NOT free the array when the vector is destroyed via VecDestroy().
249: The user should not free the array until the vector is destroyed.
251: Level: intermediate
253: .seealso: VecCreateMPICUDAWithArray(), VecCreate(), VecDuplicate(), VecDuplicateVecs(),
254: VecCreateGhost(), VecCreateSeq(), VecCUDAPlaceArray(), VecCreateSeqWithArray(),
255: VecCreateMPIWithArray()
256: @*/
257: PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar array[],Vec *V)
258: {
259: PetscDeviceInitialize(PETSC_DEVICE_CUDA);
260: VecCreate(comm,V);
261: VecSetSizes(*V,n,n);
262: VecSetBlockSize(*V,bs);
263: VecCreate_SeqCUDA_Private(*V,array);
264: return 0;
265: }
267: /*@C
268: VecCreateSeqCUDAWithArrays - Creates a CUDA sequential array-style vector,
269: where the user provides the array space to store the vector values.
271: Collective
273: Input Parameters:
274: + comm - the communicator, should be PETSC_COMM_SELF
275: . bs - the block size
276: . n - the vector length
277: - cpuarray - CPU memory where the vector elements are to be stored.
278: - gpuarray - GPU memory where the vector elements are to be stored.
280: Output Parameter:
281: . V - the vector
283: Notes:
284: If both cpuarray and gpuarray are provided, the caller must ensure that
285: the provided arrays have identical values.
287: PETSc does NOT free the provided arrays when the vector is destroyed via
288: VecDestroy(). The user should not free the array until the vector is
289: destroyed.
291: Level: intermediate
293: .seealso: VecCreateMPICUDAWithArrays(), VecCreate(), VecCreateSeqWithArray(),
294: VecCUDAPlaceArray(), VecCreateSeqCUDAWithArray(),
295: VecCUDAAllocateCheckHost()
296: @*/
297: PetscErrorCode VecCreateSeqCUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec *V)
298: {
299: // set V's gpuarray to be gpuarray, do not allocate memory on host yet.
300: VecCreateSeqCUDAWithArray(comm,bs,n,gpuarray,V);
302: if (cpuarray && gpuarray) {
303: Vec_Seq *s = (Vec_Seq*)((*V)->data);
304: s->array = (PetscScalar*)cpuarray;
305: (*V)->offloadmask = PETSC_OFFLOAD_BOTH;
306: } else if (cpuarray) {
307: Vec_Seq *s = (Vec_Seq*)((*V)->data);
308: s->array = (PetscScalar*)cpuarray;
309: (*V)->offloadmask = PETSC_OFFLOAD_CPU;
310: } else if (gpuarray) {
311: (*V)->offloadmask = PETSC_OFFLOAD_GPU;
312: } else {
313: (*V)->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
314: }
316: return 0;
317: }
319: PetscErrorCode VecGetArray_SeqCUDA(Vec v,PetscScalar **a)
320: {
321: VecCUDACopyFromGPU(v);
322: *a = *((PetscScalar**)v->data);
323: return 0;
324: }
326: PetscErrorCode VecRestoreArray_SeqCUDA(Vec v,PetscScalar **a)
327: {
328: v->offloadmask = PETSC_OFFLOAD_CPU;
329: return 0;
330: }
332: PetscErrorCode VecGetArrayWrite_SeqCUDA(Vec v,PetscScalar **a)
333: {
334: VecCUDAAllocateCheckHost(v);
335: *a = *((PetscScalar**)v->data);
336: return 0;
337: }
339: PetscErrorCode VecGetArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a,PetscMemType *mtype)
340: {
341: VecCUDACopyToGPU(v);
342: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
343: if (mtype) *mtype = ((Vec_CUDA*)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
344: return 0;
345: }
347: PetscErrorCode VecRestoreArrayAndMemType_SeqCUDA(Vec v,PetscScalar** a)
348: {
349: v->offloadmask = PETSC_OFFLOAD_GPU;
350: return 0;
351: }
353: PetscErrorCode VecGetArrayWriteAndMemType_SeqCUDA(Vec v,PetscScalar** a,PetscMemType *mtype)
354: {
355: /* Allocate memory (not zeroed) on device if not yet, but no need to sync data from host to device */
356: VecCUDAAllocateCheck(v);
357: *a = ((Vec_CUDA*)v->spptr)->GPUarray;
358: if (mtype) *mtype = ((Vec_CUDA*)v->spptr)->nvshmem ? PETSC_MEMTYPE_NVSHMEM : PETSC_MEMTYPE_CUDA;
359: return 0;
360: }
362: PetscErrorCode VecBindToCPU_SeqCUDA(Vec V,PetscBool bind)
363: {
364: V->boundtocpu = bind;
365: if (bind) {
366: VecCUDACopyFromGPU(V);
367: V->offloadmask = PETSC_OFFLOAD_CPU; /* since the CPU code will likely change values in the vector */
368: V->ops->dot = VecDot_Seq;
369: V->ops->norm = VecNorm_Seq;
370: V->ops->tdot = VecTDot_Seq;
371: V->ops->scale = VecScale_Seq;
372: V->ops->copy = VecCopy_Seq;
373: V->ops->set = VecSet_Seq;
374: V->ops->swap = VecSwap_Seq;
375: V->ops->axpy = VecAXPY_Seq;
376: V->ops->axpby = VecAXPBY_Seq;
377: V->ops->axpbypcz = VecAXPBYPCZ_Seq;
378: V->ops->pointwisemult = VecPointwiseMult_Seq;
379: V->ops->pointwisedivide = VecPointwiseDivide_Seq;
380: V->ops->setrandom = VecSetRandom_Seq;
381: V->ops->dot_local = VecDot_Seq;
382: V->ops->tdot_local = VecTDot_Seq;
383: V->ops->norm_local = VecNorm_Seq;
384: V->ops->mdot_local = VecMDot_Seq;
385: V->ops->mtdot_local = VecMTDot_Seq;
386: V->ops->maxpy = VecMAXPY_Seq;
387: V->ops->mdot = VecMDot_Seq;
388: V->ops->mtdot = VecMTDot_Seq;
389: V->ops->aypx = VecAYPX_Seq;
390: V->ops->waxpy = VecWAXPY_Seq;
391: V->ops->dotnorm2 = NULL;
392: V->ops->placearray = VecPlaceArray_Seq;
393: V->ops->replacearray = VecReplaceArray_SeqCUDA;
394: V->ops->resetarray = VecResetArray_Seq;
395: V->ops->duplicate = VecDuplicate_Seq;
396: V->ops->conjugate = VecConjugate_Seq;
397: V->ops->getlocalvector = NULL;
398: V->ops->restorelocalvector = NULL;
399: V->ops->getlocalvectorread = NULL;
400: V->ops->restorelocalvectorread = NULL;
401: V->ops->getarraywrite = NULL;
402: V->ops->getarrayandmemtype = NULL;
403: V->ops->getarraywriteandmemtype= NULL;
404: V->ops->restorearrayandmemtype = NULL;
405: V->ops->max = VecMax_Seq;
406: V->ops->min = VecMin_Seq;
407: V->ops->reciprocal = VecReciprocal_Default;
408: V->ops->sum = NULL;
409: V->ops->shift = NULL;
410: /* default random number generator */
411: PetscFree(V->defaultrandtype);
412: PetscStrallocpy(PETSCRANDER48,&V->defaultrandtype);
413: } else {
414: V->ops->dot = VecDot_SeqCUDA;
415: V->ops->norm = VecNorm_SeqCUDA;
416: V->ops->tdot = VecTDot_SeqCUDA;
417: V->ops->scale = VecScale_SeqCUDA;
418: V->ops->copy = VecCopy_SeqCUDA;
419: V->ops->set = VecSet_SeqCUDA;
420: V->ops->swap = VecSwap_SeqCUDA;
421: V->ops->axpy = VecAXPY_SeqCUDA;
422: V->ops->axpby = VecAXPBY_SeqCUDA;
423: V->ops->axpbypcz = VecAXPBYPCZ_SeqCUDA;
424: V->ops->pointwisemult = VecPointwiseMult_SeqCUDA;
425: V->ops->pointwisedivide = VecPointwiseDivide_SeqCUDA;
426: V->ops->setrandom = VecSetRandom_SeqCUDA;
427: V->ops->dot_local = VecDot_SeqCUDA;
428: V->ops->tdot_local = VecTDot_SeqCUDA;
429: V->ops->norm_local = VecNorm_SeqCUDA;
430: V->ops->mdot_local = VecMDot_SeqCUDA;
431: V->ops->maxpy = VecMAXPY_SeqCUDA;
432: V->ops->mdot = VecMDot_SeqCUDA;
433: V->ops->aypx = VecAYPX_SeqCUDA;
434: V->ops->waxpy = VecWAXPY_SeqCUDA;
435: V->ops->dotnorm2 = VecDotNorm2_SeqCUDA;
436: V->ops->placearray = VecPlaceArray_SeqCUDA;
437: V->ops->replacearray = VecReplaceArray_SeqCUDA;
438: V->ops->resetarray = VecResetArray_SeqCUDA;
439: V->ops->destroy = VecDestroy_SeqCUDA;
440: V->ops->duplicate = VecDuplicate_SeqCUDA;
441: V->ops->conjugate = VecConjugate_SeqCUDA;
442: V->ops->getlocalvector = VecGetLocalVector_SeqCUDA;
443: V->ops->restorelocalvector = VecRestoreLocalVector_SeqCUDA;
444: V->ops->getlocalvectorread = VecGetLocalVectorRead_SeqCUDA;
445: V->ops->restorelocalvectorread = VecRestoreLocalVectorRead_SeqCUDA;
446: V->ops->getarraywrite = VecGetArrayWrite_SeqCUDA;
447: V->ops->getarray = VecGetArray_SeqCUDA;
448: V->ops->restorearray = VecRestoreArray_SeqCUDA;
449: V->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqCUDA;
450: V->ops->getarraywriteandmemtype= VecGetArrayWriteAndMemType_SeqCUDA;
451: V->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqCUDA;
452: V->ops->max = VecMax_SeqCUDA;
453: V->ops->min = VecMin_SeqCUDA;
454: V->ops->reciprocal = VecReciprocal_SeqCUDA;
455: V->ops->sum = VecSum_SeqCUDA;
456: V->ops->shift = VecShift_SeqCUDA;
458: /* default random number generator */
459: PetscFree(V->defaultrandtype);
460: PetscStrallocpy(PETSCCURAND,&V->defaultrandtype);
461: }
462: return 0;
463: }
465: PetscErrorCode VecCreate_SeqCUDA_Private(Vec V,const PetscScalar *array)
466: {
468: Vec_CUDA *veccuda;
469: PetscMPIInt size;
470: PetscBool option_set;
472: MPI_Comm_size(PetscObjectComm((PetscObject)V),&size);
474: VecCreate_Seq_Private(V,0);
475: PetscObjectChangeTypeName((PetscObject)V,VECSEQCUDA);
476: VecBindToCPU_SeqCUDA(V,PETSC_FALSE);
477: V->ops->bindtocpu = VecBindToCPU_SeqCUDA;
479: /* Later, functions check for the Vec_CUDA structure existence, so do not create it without array */
480: if (array) {
481: if (!V->spptr) {
482: PetscReal pinned_memory_min;
483: PetscCalloc(sizeof(Vec_CUDA),&V->spptr);
484: veccuda = (Vec_CUDA*)V->spptr;
485: V->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
487: pinned_memory_min = 0;
488: /* Need to parse command line for minimum size to use for pinned memory allocations on host here.
489: Note: This same code duplicated in VecCUDAAllocateCheck() and VecCreate_MPICUDA_Private(). Is there a good way to avoid this? */
490: PetscOptionsBegin(PetscObjectComm((PetscObject)V),((PetscObject)V)->prefix,"VECCUDA Options","Vec");
491: PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
492: if (option_set) V->minimum_bytes_pinned_memory = pinned_memory_min;
493: PetscOptionsEnd();
494: }
495: veccuda = (Vec_CUDA*)V->spptr;
496: veccuda->GPUarray = (PetscScalar*)array;
497: V->offloadmask = PETSC_OFFLOAD_GPU;
498: }
499: return 0;
500: }