Actual source code: vechip2.hip.cpp
/*
   Implements the sequential HIP vectors.
*/

#define PETSC_SKIP_SPINLOCK

#include <petscconf.h>
#include <petsc/private/vecimpl.h>
#include <../src/vec/vec/impls/dvecimpl.h>
#include <petsc/private/hipvecimpl.h>

#include <hip/hip_runtime.h>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include <thrust/fill.h>
#include <thrust/tuple.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>

#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issues: division and multiplication of complex numbers must be unrolled by hand */
struct PetscDivideComplex
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    PetscReal n  = rx*rx + ry*ry;
    return PetscComplex((lx*rx + ly*ry)/n,(rx*ly - lx*ry)/n);
  }
};

struct PetscMultiplyComplex
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& lhs, const PetscScalar& rhs)
  {
    PetscReal lx = PetscRealPart(lhs);
    PetscReal ly = PetscImaginaryPart(lhs);
    PetscReal rx = PetscRealPart(rhs);
    PetscReal ry = PetscImaginaryPart(rhs);
    return PetscComplex(lx*rx - ly*ry, ly*rx + lx*ry);
  }
};
#endif
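
/*
   These functors stand in for thrust::divides<PetscScalar> and
   thrust::multiplies<PetscScalar> in the pointwise operations below
   (VecPointwiseDivide_SeqHIP(), VecPointwiseMult_SeqHIP()). A minimal sketch of
   how they are applied, assuming device pointers a, b, c of length n:

     thrust::transform(a,a+n,b,c,PetscMultiplyComplex());   // c[i] = a[i]*b[i]
     thrust::transform(a,a+n,b,c,PetscDivideComplex());     // c[i] = a[i]/b[i]
*/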
/*
   Allocates space for the vector array on the GPU if it does not exist.
   Does NOT change the vector's offload mask, except to initialize it when it is
   still PETSC_OFFLOAD_UNALLOCATED.
   Does NOT zero the HIP array.
*/
PetscErrorCode VecHIPAllocateCheck(Vec v)
{
  hipError_t err;
  Vec_HIP    *vechip;
  PetscBool  option_set;

  if (!v->spptr) {
    PetscReal pinned_memory_min;
    PetscCalloc(sizeof(Vec_HIP),&v->spptr);
    vechip = (Vec_HIP*)v->spptr;
    err = hipMalloc((void**)&vechip->GPUarray_allocated,sizeof(PetscScalar)*((PetscBLASInt)v->map->n));CHKERRHIP(err);
    vechip->GPUarray = vechip->GPUarray_allocated;
    if (v->offloadmask == PETSC_OFFLOAD_UNALLOCATED) {
      if (v->data && ((Vec_Seq*)v->data)->array) {
        v->offloadmask = PETSC_OFFLOAD_CPU;
      } else {
        v->offloadmask = PETSC_OFFLOAD_GPU;
      }
    }
    pinned_memory_min = 0;

    /* Need to parse the command line here for the minimum size to use for pinned memory allocations on the host.
       Note: this same code is duplicated in VecCreate_SeqHIP_Private() and VecCreate_MPIHIP_Private(). Is there a good way to avoid this? */
    PetscOptionsBegin(PetscObjectComm((PetscObject)v),((PetscObject)v)->prefix,"VECHIP Options","Vec");
    PetscOptionsReal("-vec_pinned_memory_min","Minimum size (in bytes) for an allocation to use pinned memory on host","VecSetPinnedMemoryMin",pinned_memory_min,&pinned_memory_min,&option_set);
    if (option_set) v->minimum_bytes_pinned_memory = pinned_memory_min;
    PetscOptionsEnd();
  }
  return(0);
}
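
/*
   Offload-mask bookkeeping used throughout this file (values of PetscOffloadMask):

     PETSC_OFFLOAD_UNALLOCATED - no array holds valid data yet
     PETSC_OFFLOAD_CPU         - the host copy is current
     PETSC_OFFLOAD_GPU         - the device copy is current
     PETSC_OFFLOAD_BOTH        - both copies are current

   The two copy routines below move data only when the destination is stale, and
   then upgrade the mask to PETSC_OFFLOAD_BOTH.
*/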
/* Copies a vector from the CPU to the GPU unless we already have an up-to-date copy on the GPU */
PetscErrorCode VecHIPCopyToGPU(Vec v)
{
  hipError_t  err;
  Vec_HIP     *vechip;
  PetscScalar *varray;

  VecHIPAllocateCheck(v);
  if (v->offloadmask == PETSC_OFFLOAD_CPU) {
    PetscLogEventBegin(VEC_HIPCopyToGPU,v,0,0,0);
    vechip = (Vec_HIP*)v->spptr;
    varray = vechip->GPUarray;
    err    = hipMemcpy(varray,((Vec_Seq*)v->data)->array,v->map->n*sizeof(PetscScalar),hipMemcpyHostToDevice);CHKERRHIP(err);
    PetscLogCpuToGpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyToGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}

/*
   VecHIPCopyFromGPU - Copies a vector from the GPU to the CPU unless we already have an up-to-date copy on the CPU
*/
PetscErrorCode VecHIPCopyFromGPU(Vec v)
{
  hipError_t  err;
  Vec_HIP     *vechip;
  PetscScalar *varray;

  VecHIPAllocateCheckHost(v);
  if (v->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscLogEventBegin(VEC_HIPCopyFromGPU,v,0,0,0);
    vechip = (Vec_HIP*)v->spptr;
    varray = vechip->GPUarray;
    err    = hipMemcpy(((Vec_Seq*)v->data)->array,varray,v->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);CHKERRHIP(err);
    PetscLogGpuToCpu((v->map->n)*sizeof(PetscScalar));
    PetscLogEventEnd(VEC_HIPCopyFromGPU,v,0,0,0);
    v->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  return(0);
}
/*MC
   VECSEQHIP - VECSEQHIP = "seqhip" - The basic sequential vector, modified to use HIP

   Options Database Keys:
.  -vec_type seqhip - sets the vector type to VECSEQHIP during a call to VecSetFromOptions()

   Level: beginner

.seealso: VecCreate(), VecSetType(), VecSetFromOptions(), VecCreateSeqWithArray(), VECMPI, VecType, VecCreateMPI(), VecCreateSeq()
M*/
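
/*
   A minimal usage sketch (illustrative only; error checking omitted):

     Vec      v;
     PetscInt n = 100;
     VecCreate(PETSC_COMM_SELF,&v);
     VecSetSizes(v,n,n);
     VecSetType(v,VECSEQHIP);   // or use -vec_type seqhip with VecSetFromOptions()
     VecSet(v,1.0);
     VecDestroy(&v);
*/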
PetscErrorCode VecAYPX_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  PetscScalar       sone = 1.0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   hberr;
  hipError_t        err;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArray(yin,&yarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    err = hipMemcpy(yarray,xarray,bn*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
  } else if (alpha == (PetscScalar)1.0) {
    hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
    PetscLogGpuFlops(1.0*yin->map->n);
  } else {
    hberr = hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);CHKERRHIPBLAS(hberr);
    hberr = hipblasXaxpy(hipblasv2handle,bn,&sone,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
    PetscLogGpuFlops(2.0*yin->map->n);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArray(yin,&yarray);
  PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  return(0);
}
PetscErrorCode VecAXPY_SeqHIP(Vec yin,PetscScalar alpha,Vec xin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   hberr;
  PetscBool         xiship;

  if (alpha == (PetscScalar)0.0) return(0);
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscObjectTypeCompareAny((PetscObject)xin,&xiship,VECSEQHIP,VECMPIHIP,"");
  if (xiship) {
    PetscBLASIntCast(yin->map->n,&bn);
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
    PetscLogGpuFlops(2.0*yin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  } else {
    VecAXPY_Seq(yin,alpha,xin);
  }
  return(0);
}
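
/*
   Note the mixed-type fallback above: if x is not a HIP vector (neither seqhip
   nor mpihip), VecAXPY_SeqHIP dispatches to the host kernel VecAXPY_Seq instead
   of forcing x onto the GPU; only the all-HIP path pays for the hipblas call and
   the scalar upload it logs.
*/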
PetscErrorCode VecPointwiseDivide_SeqHIP(Vec win, Vec xin, Vec yin)
{
  PetscInt                               n = xin->map->n;
  const PetscScalar                      *xarray=NULL,*yarray=NULL;
  PetscScalar                            *warray=NULL;
  thrust::device_ptr<const PetscScalar>  xptr,yptr;
  thrust::device_ptr<PetscScalar>        wptr;
  PetscErrorCode                         ierr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseDivide_Seq(win,xin,yin);
    return(0);
  }
  VecHIPGetArrayWrite(win,&warray);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr,xptr+n,yptr,wptr,PetscDivideComplex());
#else
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::divides<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  PetscLogGpuFlops(n);
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayWrite(win,&warray);
  return(0);
}
PetscErrorCode VecWAXPY_SeqHIP(Vec win,PetscScalar alpha,Vec xin, Vec yin)
{
  const PetscScalar *xarray=NULL,*yarray=NULL;
  PetscScalar       *warray=NULL;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   hberr;
  hipError_t        err;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(win->map->n,&bn);
  if (alpha == (PetscScalar)0.0) {
    VecCopy_SeqHIP(yin,win);
  } else {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArrayRead(yin,&yarray);
    VecHIPGetArrayWrite(win,&warray);
    PetscLogGpuTimeBegin();
    err = hipMemcpy(warray,yarray,win->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
    hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,warray,one);CHKERRHIPBLAS(hberr);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(2*win->map->n);
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArrayRead(yin,&yarray);
    VecHIPRestoreArrayWrite(win,&warray);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  return(0);
}
PetscErrorCode VecMAXPY_SeqHIP(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *y)
{
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n,j;
  PetscScalar       *xarray;
  const PetscScalar *yarray;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   cberr;

  PetscLogGpuFlops(nv*2.0*n);
  PetscLogCpuToGpuScalar(nv*sizeof(PetscScalar));
  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n,&bn);
  VecHIPGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  for (j=0; j<nv; j++) {
    VecHIPGetArrayRead(y[j],&yarray);
    cberr = hipblasXaxpy(hipblasv2handle,bn,alpha+j,yarray,one,xarray,one);CHKERRHIPBLAS(cberr);
    VecHIPRestoreArrayRead(y[j],&yarray);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin,&xarray);
  return(0);
}
PetscErrorCode VecDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   cerr;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  /* arguments y, x are reversed because BLAS complex conjugates the first argument, PETSc the second */
  PetscLogGpuTimeBegin();
  cerr = hipblasXdot(hipblasv2handle,bn,yarray,one,xarray,one,z);CHKERRHIPBLAS(cerr);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  return(0);
}
//
// HIP kernels for MDot to follow
//

// set work group size to be a power of 2 (128 is usually a good compromise between portability and speed)
#define MDOT_WORKGROUP_SIZE 128
#define MDOT_WORKGROUP_NUM  128
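
/*
   The VecMDot kernels below compute several dot products <x,y_k> in a single
   pass over x. Each of the MDOT_WORKGROUP_NUM blocks strides over its slice of
   the vectors keeping one running sum per y_k and thread, then tree-reduces
   those sums in shared memory (tmp_buffer holds one slab of MDOT_WORKGROUP_SIZE
   partial sums per y_k). Block b stores its result for y_k in
   group_results[b + k*gridDim.x]; the host side of VecMDot_SeqHIP() finishes
   the reduction over the blocks.
*/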
#if !defined(PETSC_USE_COMPLEX)
// M = 2:
__global__ void VecMDot_SeqHIP_kernel2(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[2*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
  }
  tmp_buffer[threadIdx.x]                       = group_sum0;
  tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] = group_sum1;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                      ] += tmp_buffer[threadIdx.x+stride                      ];
      tmp_buffer[threadIdx.x + MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x]             = tmp_buffer[0];
    group_results[blockIdx.x + gridDim.x] = tmp_buffer[MDOT_WORKGROUP_SIZE];
  }
}

// M = 3:
__global__ void VecMDot_SeqHIP_kernel3(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[3*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 4:
__global__ void VecMDot_SeqHIP_kernel4(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[4*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
  }
}

// M = 8:
__global__ void VecMDot_SeqHIP_kernel8(const PetscScalar *x,const PetscScalar *y0,const PetscScalar *y1,const PetscScalar *y2,const PetscScalar *y3,
                                       const PetscScalar *y4,const PetscScalar *y5,const PetscScalar *y6,const PetscScalar *y7,
                                       PetscInt size, PetscScalar *group_results)
{
  __shared__ PetscScalar tmp_buffer[8*MDOT_WORKGROUP_SIZE];
  PetscInt entries_per_group = (size - 1) / gridDim.x + 1;
  entries_per_group = (entries_per_group == 0) ? 1 : entries_per_group; // for very small vectors, a group should still do some work
  PetscInt vec_start_index = blockIdx.x * entries_per_group;
  PetscInt vec_stop_index  = PetscMin((blockIdx.x + 1) * entries_per_group, size); // don't go beyond vec size

  PetscScalar entry_x    = 0;
  PetscScalar group_sum0 = 0;
  PetscScalar group_sum1 = 0;
  PetscScalar group_sum2 = 0;
  PetscScalar group_sum3 = 0;
  PetscScalar group_sum4 = 0;
  PetscScalar group_sum5 = 0;
  PetscScalar group_sum6 = 0;
  PetscScalar group_sum7 = 0;
  for (PetscInt i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
    entry_x     = x[i]; // load only once from global memory!
    group_sum0 += entry_x * y0[i];
    group_sum1 += entry_x * y1[i];
    group_sum2 += entry_x * y2[i];
    group_sum3 += entry_x * y3[i];
    group_sum4 += entry_x * y4[i];
    group_sum5 += entry_x * y5[i];
    group_sum6 += entry_x * y6[i];
    group_sum7 += entry_x * y7[i];
  }
  tmp_buffer[threadIdx.x]                           = group_sum0;
  tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] = group_sum1;
  tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] = group_sum2;
  tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] = group_sum3;
  tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] = group_sum4;
  tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] = group_sum5;
  tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] = group_sum6;
  tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] = group_sum7;

  // parallel reduction
  for (PetscInt stride = blockDim.x/2; stride > 0; stride /= 2) {
    __syncthreads();
    if (threadIdx.x < stride) {
      tmp_buffer[threadIdx.x                          ] += tmp_buffer[threadIdx.x+stride                          ];
      tmp_buffer[threadIdx.x +     MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride +     MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 2 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 2 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 3 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 3 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 4 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 4 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 5 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 5 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 6 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 6 * MDOT_WORKGROUP_SIZE];
      tmp_buffer[threadIdx.x + 7 * MDOT_WORKGROUP_SIZE] += tmp_buffer[threadIdx.x+stride + 7 * MDOT_WORKGROUP_SIZE];
    }
  }

  // write result of group to group_results
  if (threadIdx.x == 0) {
    group_results[blockIdx.x                ] = tmp_buffer[0];
    group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * MDOT_WORKGROUP_SIZE];
    group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * MDOT_WORKGROUP_SIZE];
  }
}
#endif /* !defined(PETSC_USE_COMPLEX) */
PetscErrorCode VecMDot_SeqHIP(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
{
  PetscErrorCode    ierr;
  PetscInt          i,n = xin->map->n,current_y_index = 0;
  const PetscScalar *xptr,*y0ptr,*y1ptr,*y2ptr,*y3ptr,*y4ptr,*y5ptr,*y6ptr,*y7ptr;
#if !defined(PETSC_USE_COMPLEX)
  PetscInt          nv1 = ((nv % 4) == 1) ? nv-1 : nv,j;
  PetscScalar       *group_results_gpu,*group_results_cpu;
  hipError_t        hip_ierr;
#endif
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   hberr;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  if (nv <= 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Number of vectors provided to VecMDot_SeqHIP not positive.");
  /* Handle the case of local size zero first */
  if (!xin->map->n) {
    for (i=0; i<nv; ++i) z[i] = 0;
    return(0);
  }
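
  /*
     Scratch sizing note: the loop below consumes the y vectors in chunks of
     8, 4, 3, 2 or 1. Only the kernel paths (chunks of 2 or more, real scalars)
     write per-block partial sums into group_results_gpu; a single trailing
     vector (the nv % 4 == 1 case) goes straight through hipblasXdot into z,
     which is why nv1 excludes it above.
  */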
#if !defined(PETSC_USE_COMPLEX)
  // allocate scratchpad memory for the results of individual work groups:
  hip_ierr = hipMalloc((void**)&group_results_gpu, nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);CHKERRHIP(hip_ierr);
  PetscMalloc1(nv1*MDOT_WORKGROUP_NUM,&group_results_cpu);
#endif
  VecHIPGetArrayRead(xin,&xptr);
  PetscLogGpuTimeBegin();

  while (current_y_index < nv)
  {
    switch (nv - current_y_index) {

      case 7:
      case 6:
      case 5:
      case 4:
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
#if defined(PETSC_USE_COMPLEX)
        hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRHIPBLAS(hberr);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel4, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
        current_y_index += 4;
        break;

      case 3:
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);

#if defined(PETSC_USE_COMPLEX)
        hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRHIPBLAS(hberr);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel3, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        current_y_index += 3;
        break;

      case 2:
        VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
#if defined(PETSC_USE_COMPLEX)
        hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel2, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        current_y_index += 2;
        break;

      case 1:
        VecHIPGetArrayRead(yin[current_y_index],&y0ptr);
        hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
        VecHIPRestoreArrayRead(yin[current_y_index],&y0ptr);
        current_y_index += 1;
        break;

      default: // 8 or more vectors left
        VecHIPGetArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPGetArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPGetArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPGetArrayRead(yin[current_y_index+3],&y3ptr);
        VecHIPGetArrayRead(yin[current_y_index+4],&y4ptr);
        VecHIPGetArrayRead(yin[current_y_index+5],&y5ptr);
        VecHIPGetArrayRead(yin[current_y_index+6],&y6ptr);
        VecHIPGetArrayRead(yin[current_y_index+7],&y7ptr);
#if defined(PETSC_USE_COMPLEX)
        hberr = hipblasXdot(hipblasv2handle,bn,y0ptr,one,xptr,one,&z[current_y_index]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y1ptr,one,xptr,one,&z[current_y_index+1]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y2ptr,one,xptr,one,&z[current_y_index+2]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y3ptr,one,xptr,one,&z[current_y_index+3]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y4ptr,one,xptr,one,&z[current_y_index+4]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y5ptr,one,xptr,one,&z[current_y_index+5]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y6ptr,one,xptr,one,&z[current_y_index+6]);CHKERRHIPBLAS(hberr);
        hberr = hipblasXdot(hipblasv2handle,bn,y7ptr,one,xptr,one,&z[current_y_index+7]);CHKERRHIPBLAS(hberr);
#else
        hipLaunchKernelGGL(VecMDot_SeqHIP_kernel8, dim3(MDOT_WORKGROUP_NUM), dim3(MDOT_WORKGROUP_SIZE), 0, 0, xptr,y0ptr,y1ptr,y2ptr,y3ptr,y4ptr,y5ptr,y6ptr,y7ptr,n,group_results_gpu+current_y_index*MDOT_WORKGROUP_NUM);
#endif
        VecHIPRestoreArrayRead(yin[current_y_index  ],&y0ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+1],&y1ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+2],&y2ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+3],&y3ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+4],&y4ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+5],&y5ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+6],&y6ptr);
        VecHIPRestoreArrayRead(yin[current_y_index+7],&y7ptr);
        current_y_index += 8;
        break;
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xptr);

#if defined(PETSC_USE_COMPLEX)
  PetscLogGpuToCpu(nv*sizeof(PetscScalar));
#else
  // copy results to CPU
  hip_ierr = hipMemcpy(group_results_cpu,group_results_gpu,nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM,hipMemcpyDeviceToHost);CHKERRHIP(hip_ierr);

  // sum group results into z
  for (j=0; j<nv1; ++j) {
    z[j] = 0;
    for (i=j*MDOT_WORKGROUP_NUM; i<(j+1)*MDOT_WORKGROUP_NUM; ++i) z[j] += group_results_cpu[i];
  }
  PetscLogFlops(nv1*MDOT_WORKGROUP_NUM);
  hip_ierr = hipFree(group_results_gpu);CHKERRHIP(hip_ierr);
  PetscFree(group_results_cpu);
  PetscLogGpuToCpu(nv1*sizeof(PetscScalar)*MDOT_WORKGROUP_NUM);
#endif
  PetscLogGpuFlops(PetscMax(nv*(2.0*n-1),0.0));
  return(0);
}
#undef MDOT_WORKGROUP_SIZE
#undef MDOT_WORKGROUP_NUM

PetscErrorCode VecSet_SeqHIP(Vec xin,PetscScalar alpha)
{
  PetscInt                        n = xin->map->n;
  PetscScalar                     *xarray = NULL;
  thrust::device_ptr<PetscScalar> xptr;
  PetscErrorCode                  ierr;
  hipError_t                      err;

  VecHIPGetArrayWrite(xin,&xarray);
  PetscLogGpuTimeBegin();
  if (alpha == (PetscScalar)0.0) {
    err = hipMemset(xarray,0,n*sizeof(PetscScalar));CHKERRHIP(err);
  } else {
    try {
      xptr = thrust::device_pointer_cast(xarray);
      thrust::fill(xptr,xptr+n,alpha);
    } catch (char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayWrite(xin,&xarray);
  return(0);
}
struct PetscScalarReciprocal
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& s)
  {
#if defined(PETSC_USE_COMPLEX)
    /* SPOCK compilation issue, need to unroll division */
    PetscReal sx = PetscRealPart(s);
    PetscReal sy = PetscImaginaryPart(s);
    PetscReal n  = sx*sx + sy*sy;
    return n != 0.0 ? PetscComplex(sx/n,-sy/n) : 0.0;
#else
    return (s != (PetscScalar)0.0) ? (PetscScalar)1.0/s : 0.0;
#endif
  }
};

PetscErrorCode VecReciprocal_SeqHIP(Vec v)
{
  PetscInt    n;
  PetscScalar *x;

  VecGetLocalSize(v,&n);
  VecHIPGetArray(v,&x);
  PetscLogGpuTimeBegin();
  try {
    auto xptr = thrust::device_pointer_cast(x);
    thrust::transform(xptr,xptr+n,xptr,PetscScalarReciprocal());
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v,&x);
  return(0);
}
PetscErrorCode VecScale_SeqHIP(Vec xin,PetscScalar alpha)
{
  PetscScalar     *xarray;
  PetscErrorCode  ierr;
  PetscBLASInt    one = 1,bn = 0;
  hipblasHandle_t hipblasv2handle;
  hipblasStatus_t hberr;

  if (alpha == (PetscScalar)0.0) {
    VecSet_SeqHIP(xin,alpha);
  } else if (alpha != (PetscScalar)1.0) {
    PetscHIPBLASGetHandle(&hipblasv2handle);
    PetscBLASIntCast(xin->map->n,&bn);
    VecHIPGetArray(xin,&xarray);
    PetscLogGpuTimeBegin();
    hberr = hipblasXscal(hipblasv2handle,bn,&alpha,xarray,one);CHKERRHIPBLAS(hberr);
    VecHIPRestoreArray(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    PetscLogGpuFlops(xin->map->n);
  }
  return(0);
}
PetscErrorCode VecTDot_SeqHIP(Vec xin,Vec yin,PetscScalar *z)
{
  const PetscScalar *xarray,*yarray;
  PetscErrorCode    ierr;
  PetscBLASInt      one = 1,bn = 0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   cerr;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  PetscLogGpuTimeBegin();
  cerr = hipblasXdotu(hipblasv2handle,bn,xarray,one,yarray,one,z);CHKERRHIPBLAS(cerr);
  PetscLogGpuTimeEnd();
  if (xin->map->n > 0) {
    PetscLogGpuFlops(2.0*xin->map->n-1);
  }
  PetscLogGpuToCpu(sizeof(PetscScalar));
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayRead(xin,&xarray);
  return(0);
}
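
/*
   VecDot_SeqHIP() above calls hipblasXdot, which conjugates its first argument
   (hence the swapped x/y there), while VecTDot_SeqHIP() calls hipblasXdotu, the
   unconjugated ("transpose") dot product; for real scalars the two coincide.
*/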
PetscErrorCode VecCopy_SeqHIP(Vec xin,Vec yin)
{
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscErrorCode    ierr;
  hipError_t        err;

  if (xin != yin) {
    if (xin->offloadmask == PETSC_OFFLOAD_GPU) {
      PetscBool yiship;

      PetscObjectTypeCompareAny((PetscObject)yin,&yiship,VECSEQHIP,VECMPIHIP,"");
      VecHIPGetArrayRead(xin,&xarray);
      if (yiship) {
        VecHIPGetArrayWrite(yin,&yarray);
      } else {
        VecGetArrayWrite(yin,&yarray);
      }
      PetscLogGpuTimeBegin();
      if (yiship) {
        err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
      } else {
        err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToHost);CHKERRHIP(err);
      }
      PetscLogGpuTimeEnd();
      VecHIPRestoreArrayRead(xin,&xarray);
      if (yiship) {
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else {
        VecRestoreArrayWrite(yin,&yarray);
      }
    } else if (xin->offloadmask == PETSC_OFFLOAD_CPU) {
      /* copy in CPU if we are on the CPU */
      VecCopy_SeqHIP_Private(xin,yin);
    } else if (xin->offloadmask == PETSC_OFFLOAD_BOTH) {
      /* if xin is valid in both places, see where yin is and copy there (because it's probably where we'll want to next use it) */
      if (yin->offloadmask == PETSC_OFFLOAD_CPU) {
        /* copy in CPU */
        VecCopy_SeqHIP_Private(xin,yin);
      } else if (yin->offloadmask == PETSC_OFFLOAD_GPU) {
        /* copy in GPU */
        VecHIPGetArrayRead(xin,&xarray);
        VecHIPGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin,&xarray);
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else if (yin->offloadmask == PETSC_OFFLOAD_BOTH) {
        /* xin and yin are both valid in both places (or yin was unallocated before the earlier call to allocatecheck);
           default to copy in GPU (this is an arbitrary choice) */
        VecHIPGetArrayRead(xin,&xarray);
        VecHIPGetArrayWrite(yin,&yarray);
        PetscLogGpuTimeBegin();
        err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
        PetscLogGpuTimeEnd();
        VecHIPRestoreArrayRead(xin,&xarray);
        VecHIPRestoreArrayWrite(yin,&yarray);
      } else {
        VecCopy_SeqHIP_Private(xin,yin);
      }
    }
  }
  return(0);
}
PetscErrorCode VecSwap_SeqHIP(Vec xin,Vec yin)
{
  PetscErrorCode  ierr;
  PetscBLASInt    one = 1,bn;
  PetscScalar     *xarray,*yarray;
  hipblasHandle_t hipblasv2handle;
  hipblasStatus_t hberr;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(xin->map->n,&bn);
  if (xin != yin) {
    VecHIPGetArray(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hberr = hipblasXswap(hipblasv2handle,bn,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArray(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
  }
  return(0);
}
PetscErrorCode VecAXPBY_SeqHIP(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
{
  PetscErrorCode    ierr;
  PetscScalar       a = alpha,b = beta;
  const PetscScalar *xarray;
  PetscScalar       *yarray;
  PetscBLASInt      one = 1, bn = 0;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   hberr;
  hipError_t        err;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(yin->map->n,&bn);
  if (a == (PetscScalar)0.0) {
    VecScale_SeqHIP(yin,beta);
  } else if (b == (PetscScalar)1.0) {
    VecAXPY_SeqHIP(yin,alpha,xin);
  } else if (a == (PetscScalar)1.0) {
    VecAYPX_SeqHIP(yin,beta,xin);
  } else if (b == (PetscScalar)0.0) {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    err = hipMemcpy(yarray,xarray,yin->map->n*sizeof(PetscScalar),hipMemcpyDeviceToDevice);CHKERRHIP(err);
    hberr = hipblasXscal(hipblasv2handle,bn,&alpha,yarray,one);CHKERRHIPBLAS(hberr);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(xin->map->n);
    PetscLogCpuToGpuScalar(sizeof(PetscScalar));
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
  } else {
    VecHIPGetArrayRead(xin,&xarray);
    VecHIPGetArray(yin,&yarray);
    PetscLogGpuTimeBegin();
    hberr = hipblasXscal(hipblasv2handle,bn,&beta,yarray,one);CHKERRHIPBLAS(hberr);
    hberr = hipblasXaxpy(hipblasv2handle,bn,&alpha,xarray,one,yarray,one);CHKERRHIPBLAS(hberr);
    VecHIPRestoreArrayRead(xin,&xarray);
    VecHIPRestoreArray(yin,&yarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(3.0*xin->map->n);
    PetscLogCpuToGpuScalar(2*sizeof(PetscScalar));
  }
  return(0);
}
PetscErrorCode VecAXPBYPCZ_SeqHIP(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)
{
  PetscInt n = zin->map->n;

  if (gamma == (PetscScalar)1.0) {
    /* z = a*x + b*y + z */
    VecAXPY_SeqHIP(zin,alpha,xin);
    VecAXPY_SeqHIP(zin,beta,yin);
    PetscLogGpuFlops(4.0*n);
  } else {
    /* z = a*x + b*y + c*z */
    VecScale_SeqHIP(zin,gamma);
    VecAXPY_SeqHIP(zin,alpha,xin);
    VecAXPY_SeqHIP(zin,beta,yin);
    PetscLogGpuFlops(5.0*n);
  }
  return(0);
}
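
/*
   VecAXPBYPCZ_SeqHIP is composed from VecScale_SeqHIP/VecAXPY_SeqHIP rather than
   a fused kernel, so z is streamed through memory once per underlying BLAS-1
   call; the flop counts logged above (4n or 5n) reflect the two or three
   operations performed.
*/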
PetscErrorCode VecPointwiseMult_SeqHIP(Vec win,Vec xin,Vec yin)
{
  PetscInt                               n = win->map->n;
  const PetscScalar                      *xarray,*yarray;
  PetscScalar                            *warray;
  thrust::device_ptr<const PetscScalar>  xptr,yptr;
  thrust::device_ptr<PetscScalar>        wptr;
  PetscErrorCode                         ierr;

  if (xin->boundtocpu || yin->boundtocpu) {
    VecPointwiseMult_Seq(win,xin,yin);
    return(0);
  }
  VecHIPGetArrayRead(xin,&xarray);
  VecHIPGetArrayRead(yin,&yarray);
  VecHIPGetArrayWrite(win,&warray);
  PetscLogGpuTimeBegin();
  try {
    wptr = thrust::device_pointer_cast(warray);
    xptr = thrust::device_pointer_cast(xarray);
    yptr = thrust::device_pointer_cast(yarray);
#if defined(PETSC_USE_COMPLEX)
    thrust::transform(xptr,xptr+n,yptr,wptr,PetscMultiplyComplex());
#else
    thrust::transform(xptr,xptr+n,yptr,wptr,thrust::multiplies<PetscScalar>());
#endif
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(xin,&xarray);
  VecHIPRestoreArrayRead(yin,&yarray);
  VecHIPRestoreArrayWrite(win,&warray);
  PetscLogGpuFlops(n);
  return(0);
}
/* TODO: do the infinity norm directly in HIP */

PetscErrorCode VecNorm_SeqHIP(Vec xin,NormType type,PetscReal *z)
{
  PetscErrorCode    ierr;
  PetscInt          n = xin->map->n;
  PetscBLASInt      one = 1, bn = 0;
  const PetscScalar *xarray;
  hipblasHandle_t   hipblasv2handle;
  hipblasStatus_t   hberr;
  hipError_t        err;

  PetscHIPBLASGetHandle(&hipblasv2handle);
  PetscBLASIntCast(n,&bn);
  if (type == NORM_2 || type == NORM_FROBENIUS) {
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hberr = hipblasXnrm2(hipblasv2handle,bn,xarray,one,z);CHKERRHIPBLAS(hberr);
    PetscLogGpuTimeEnd();
    VecHIPRestoreArrayRead(xin,&xarray);
    PetscLogGpuFlops(PetscMax(2.0*n-1,0.0));
  } else if (type == NORM_INFINITY) {
    int i;
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hberr = hipblasIXamax(hipblasv2handle,bn,xarray,one,&i);CHKERRHIPBLAS(hberr);
    PetscLogGpuTimeEnd();
    if (bn) {
      PetscScalar zs;
      /* hipblasIXamax returns a 1-based index, following the BLAS convention */
      err = hipMemcpy(&zs,xarray+i-1,sizeof(PetscScalar),hipMemcpyDeviceToHost);CHKERRHIP(err);
      *z = PetscAbsScalar(zs);
    } else *z = 0.0;
    VecHIPRestoreArrayRead(xin,&xarray);
  } else if (type == NORM_1) {
    VecHIPGetArrayRead(xin,&xarray);
    PetscLogGpuTimeBegin();
    hberr = hipblasXasum(hipblasv2handle,bn,xarray,one,z);CHKERRHIPBLAS(hberr);
    VecHIPRestoreArrayRead(xin,&xarray);
    PetscLogGpuTimeEnd();
    PetscLogGpuFlops(PetscMax(n-1.0,0.0));
  } else if (type == NORM_1_AND_2) {
    VecNorm_SeqHIP(xin,NORM_1,z);
    VecNorm_SeqHIP(xin,NORM_2,z+1);
  }
  PetscLogGpuToCpu(sizeof(PetscReal));
  return(0);
}
PetscErrorCode VecDotNorm2_SeqHIP(Vec s, Vec t, PetscScalar *dp, PetscScalar *nm)
{
  VecDot_SeqHIP(s,t,dp);
  VecDot_SeqHIP(t,t,nm);
  return(0);
}

PetscErrorCode VecDestroy_SeqHIP(Vec v)
{
  hipError_t err;

  if (v->spptr) {
    if (((Vec_HIP*)v->spptr)->GPUarray_allocated) {
      err = hipFree(((Vec_HIP*)v->spptr)->GPUarray_allocated);CHKERRHIP(err);
      ((Vec_HIP*)v->spptr)->GPUarray_allocated = NULL;
    }
    if (((Vec_HIP*)v->spptr)->stream) {
      err = hipStreamDestroy(((Vec_HIP*)v->spptr)->stream);CHKERRHIP(err);
    }
  }
  VecDestroy_SeqHIP_Private(v);
  PetscFree(v->spptr);
  return(0);
}
#if defined(PETSC_USE_COMPLEX)
/* SPOCK compilation issue, need to do conjugation ourselves */
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(const PetscScalar& x)
  {
    return PetscScalar(PetscRealPart(x),-PetscImaginaryPart(x));
  }
};
#endif

PetscErrorCode VecConjugate_SeqHIP(Vec xin)
{
#if defined(PETSC_USE_COMPLEX)
  PetscScalar                     *xarray;
  PetscErrorCode                  ierr;
  PetscInt                        n = xin->map->n;
  thrust::device_ptr<PetscScalar> xptr;

  VecHIPGetArray(xin,&xarray);
  PetscLogGpuTimeBegin();
  try {
    xptr = thrust::device_pointer_cast(xarray);
    thrust::transform(xptr,xptr+n,xptr,conjugate());
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(xin,&xarray);
#else
  /* nothing to do for real scalars */
#endif
  return(0);
}
PETSC_STATIC_INLINE PetscErrorCode VecGetLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
{
  hipError_t err;
  PetscBool  wisseqhip;

  PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
  if (w->data && wisseqhip) {
    if (((Vec_Seq*)w->data)->array_allocated) {
      if (w->pinned_memory) {
        PetscMallocSetHIPHost();
      }
      PetscFree(((Vec_Seq*)w->data)->array_allocated);
      if (w->pinned_memory) {
        PetscMallocResetHIPHost();
        w->pinned_memory = PETSC_FALSE;
      }
    }
    ((Vec_Seq*)w->data)->array         = NULL;
    ((Vec_Seq*)w->data)->unplacedarray = NULL;
  }
  if (w->spptr && wisseqhip) {
    if (((Vec_HIP*)w->spptr)->GPUarray) {
      err = hipFree(((Vec_HIP*)w->spptr)->GPUarray);CHKERRHIP(err);
      ((Vec_HIP*)w->spptr)->GPUarray = NULL;
    }
    if (((Vec_HIP*)w->spptr)->stream) {
      err = hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);CHKERRHIP(err);
    }
    PetscFree(w->spptr);
  }

  if (v->petscnative && wisseqhip) {
    PetscFree(w->data);
    w->data          = v->data;
    w->offloadmask   = v->offloadmask;
    w->pinned_memory = v->pinned_memory;
    w->spptr         = v->spptr;
    PetscObjectStateIncrease((PetscObject)w);
  } else {
    if (read) {
      VecGetArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecGetArray(v,&((Vec_Seq*)w->data)->array);
    }
    w->offloadmask = PETSC_OFFLOAD_CPU;
    if (wisseqhip) {
      VecHIPAllocateCheck(w);
    }
  }
  return(0);
}
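
/*
   Fast path above: when v is petscnative and w is seqhip, VecGetLocalVectorK_SeqHIP()
   aliases w to v by taking over v's host buffer, device buffer, offload mask and
   pinned-memory flag instead of copying; VecRestoreLocalVectorK_SeqHIP() below hands
   them back. The fallback path exposes v's host array through w and marks w as
   CPU-current.
*/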
PETSC_STATIC_INLINE PetscErrorCode VecRestoreLocalVectorK_SeqHIP(Vec v,Vec w,PetscBool read)
{
  hipError_t err;
  PetscBool  wisseqhip;

  PetscObjectTypeCompare((PetscObject)w,VECSEQHIP,&wisseqhip);
  if (v->petscnative && wisseqhip) {
    v->data          = w->data;
    v->offloadmask   = w->offloadmask;
    v->pinned_memory = w->pinned_memory;
    v->spptr         = w->spptr;
    w->data          = 0;
    w->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
    w->spptr         = 0;
  } else {
    if (read) {
      VecRestoreArrayRead(v,(const PetscScalar**)&((Vec_Seq*)w->data)->array);
    } else {
      VecRestoreArray(v,&((Vec_Seq*)w->data)->array);
    }
    if (w->spptr && wisseqhip) {
      err = hipFree(((Vec_HIP*)w->spptr)->GPUarray);CHKERRHIP(err);
      ((Vec_HIP*)w->spptr)->GPUarray = NULL;
      if (((Vec_HIP*)w->spptr)->stream) {
        err = hipStreamDestroy(((Vec_HIP*)w->spptr)->stream);CHKERRHIP(err);
      }
      PetscFree(w->spptr);
    }
  }
  return(0);
}
PetscErrorCode VecGetLocalVector_SeqHIP(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
  return(0);
}

PetscErrorCode VecGetLocalVectorRead_SeqHIP(Vec v,Vec w)
{
  VecGetLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
  return(0);
}

PetscErrorCode VecRestoreLocalVector_SeqHIP(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_FALSE);
  return(0);
}

PetscErrorCode VecRestoreLocalVectorRead_SeqHIP(Vec v,Vec w)
{
  VecRestoreLocalVectorK_SeqHIP(v,w,PETSC_TRUE);
  return(0);
}
struct petscrealpart : public thrust::unary_function<PetscScalar,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscScalar& x) {
    return PetscRealPart(x);
  }
};

struct petscrealparti : public thrust::unary_function<thrust::tuple<PetscScalar, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscScalar, PetscInt>& x) {
    return thrust::make_tuple(PetscRealPart(x.get<0>()), x.get<1>());
  }
};

struct petscmax : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? y : x;
  }
};

struct petscmaxi : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() < y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};

struct petscmin : public thrust::binary_function<PetscReal,PetscReal,PetscReal>
{
  __host__ __device__
  PetscReal operator()(const PetscReal& x, const PetscReal& y) {
    return x < y ? x : y;
  }
};

struct petscmini : public thrust::binary_function<thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>,thrust::tuple<PetscReal, PetscInt>>
{
  __host__ __device__
  thrust::tuple<PetscReal, PetscInt> operator()(const thrust::tuple<PetscReal, PetscInt>& x, const thrust::tuple<PetscReal, PetscInt>& y) {
    return x.get<0>() > y.get<0>() ? thrust::make_tuple(y.get<0>(), y.get<1>()) :
           (x.get<0>() != y.get<0>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) :
           (x.get<1>() < y.get<1>() ? thrust::make_tuple(x.get<0>(), x.get<1>()) : thrust::make_tuple(y.get<0>(), y.get<1>())));
  }
};
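
/*
   The indexed reductions petscmaxi/petscmini above break ties in the value by
   keeping the smaller index, so VecMax/VecMin below report the first location
   of the extremum; the *i variants carry a (value,index) pair through the
   reduction as a thrust::tuple, and the *parti functors strip complex values
   down to their real parts first.
*/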
PetscErrorCode VecMax_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscErrorCode                         ierr;
  PetscInt                               n = v->map->n;
  const PetscScalar                      *av;
  thrust::device_ptr<const PetscScalar>  avpt;

  if (!n) {
    *m = PETSC_MIN_REAL;
    if (p) *p = -1;
    return(0);
  }
  VecHIPGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MIN_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmaxi());
#else
      res = thrust::reduce(zibit,zibit+n,res,petscmaxi());
#endif
    } catch (char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MIN_REAL,petscmax());
#else
      *m = thrust::reduce(avpt,avpt+n,PETSC_MIN_REAL,petscmax());
#endif
    } catch (char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&av);
  return(0);
}

PetscErrorCode VecMin_SeqHIP(Vec v, PetscInt *p, PetscReal *m)
{
  PetscErrorCode                         ierr;
  PetscInt                               n = v->map->n;
  const PetscScalar                      *av;
  thrust::device_ptr<const PetscScalar>  avpt;

  if (!n) {
    *m = PETSC_MAX_REAL;
    if (p) *p = -1;
    return(0);
  }
  VecHIPGetArrayRead(v,&av);
  avpt = thrust::device_pointer_cast(av);
  PetscLogGpuTimeBegin();
  if (p) {
    thrust::tuple<PetscReal,PetscInt> res(PETSC_MAX_REAL,-1);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(avpt,thrust::counting_iterator<PetscInt>(0)));
    try {
#if defined(PETSC_USE_COMPLEX)
      res = thrust::transform_reduce(zibit,zibit+n,petscrealparti(),res,petscmini());
#else
      res = thrust::reduce(zibit,zibit+n,res,petscmini());
#endif
    } catch (char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
    *m = res.get<0>();
    *p = res.get<1>();
  } else {
    try {
#if defined(PETSC_USE_COMPLEX)
      *m = thrust::transform_reduce(avpt,avpt+n,petscrealpart(),PETSC_MAX_REAL,petscmin());
#else
      *m = thrust::reduce(avpt,avpt+n,PETSC_MAX_REAL,petscmin());
#endif
    } catch (char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
    }
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&av);
  return(0);
}
PetscErrorCode VecSum_SeqHIP(Vec v,PetscScalar *sum)
{
  PetscErrorCode                         ierr;
  PetscInt                               n = v->map->n;
  const PetscScalar                      *a;
  thrust::device_ptr<const PetscScalar>  dptr;

  VecHIPGetArrayRead(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    *sum = thrust::reduce(dptr,dptr+n,PetscScalar(0.0));
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArrayRead(v,&a);
  return(0);
}

struct petscshift : public thrust::unary_function<PetscScalar,PetscScalar>
{
  const PetscScalar shift_;
  petscshift(PetscScalar shift) : shift_(shift){}
  __host__ __device__
  PetscScalar operator()(PetscScalar x) {return x + shift_;}
};

PetscErrorCode VecShift_SeqHIP(Vec v,PetscScalar shift)
{
  PetscErrorCode                  ierr;
  PetscInt                        n = v->map->n;
  PetscScalar                     *a;
  thrust::device_ptr<PetscScalar> dptr;

  VecHIPGetArray(v,&a);
  dptr = thrust::device_pointer_cast(a);
  PetscLogGpuTimeBegin();
  try {
    thrust::transform(dptr,dptr+n,dptr,petscshift(shift)); /* in-place transform */
  } catch (char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscLogGpuTimeEnd();
  VecHIPRestoreArray(v,&a);
  return(0);
}