Actual source code: landaucu.cu
1: /*
2: Implements the Landau kernel
3: */
4: #include <petscconf.h>
5: #include <petsc/private/dmpleximpl.h>
6: #include <petsclandau.h>
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <petscmat.h>
10: #include <petscdevice.h>
12: #include "../land_tensors.h"
13: #include <petscaijdevice.h>
15: #define CHECK_LAUNCH_ERROR() \
16: do { \
17: /* Check synchronous errors, i.e. pre-launch */ \
18: cudaError_t err = cudaGetLastError(); \
19: if (cudaSuccess != err) { \
20: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
21: } \
22: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
23: err = cudaDeviceSynchronize(); \
24: if (cudaSuccess != err) { \
25: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
26: } \
27: } while (0)
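// LandauCUDACreateMatMaps: copy the GPU-assembly maps for one grid to the device.
// The constraint (face) interpolation table c_maps and the element-to-global index
// table gIdx are deep-copied, then the populated struct itself is copied and its
// device address cached in maps[grid].d_self.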
29: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE],
30: PetscInt Nf[], PetscInt Nq, PetscInt grid)
31: {
32: P4estVertexMaps h_maps;
33: h_maps.num_elements = maps[grid].num_elements;
34: h_maps.num_face = maps[grid].num_face;
35: h_maps.num_reduced = maps[grid].num_reduced;
36: h_maps.deviceType = maps[grid].deviceType;
37: h_maps.Nf = Nf[grid];
38: h_maps.numgrids = maps[grid].numgrids;
39: cudaMalloc((void **)&h_maps.c_maps, maps[grid].num_reduced * sizeof *pointMaps);
40: cudaMemcpy( h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced * sizeof *pointMaps, cudaMemcpyHostToDevice);
41: cudaMalloc((void **)&h_maps.gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx);
42: cudaMemcpy( h_maps.gIdx, maps[grid].gIdx,maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice);
43: cudaMalloc((void **)&maps[grid].d_self, sizeof(P4estVertexMaps));
44: cudaMemcpy( maps[grid].d_self, &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);
45: return 0;
46: }
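// LandauCUDADestroyMatMaps: free the device-side maps created in LandauCUDACreateMatMaps.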
48: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
49: {
50: for (PetscInt grid=0;grid<num_grids;grid++) {
51: P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
52: cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);
53: cudaFree(h_maps.c_maps);
54: cudaFree(h_maps.gIdx);
55: cudaFree(d_maps);
56: }
57: return 0;
58: }
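// LandauCUDAStaticDataSet: allocate and fill the static (time-independent) device data
// used by the kernels: the basis tabulation B/D, per-species constants (nu_alpha, nu_beta,
// invMass), quadrature-point geometry (invJ, x, y[, z], w), the per-grid offset tables,
// and scratch space for the distribution function f and its derivatives for a whole batch.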
60: PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids,
61: PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[],
62: PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal a_invJ[],
63: PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
64: {
65: PetscTabulation *Tf;
66: PetscReal *BB,*DD;
67: PetscInt dim,Nb=Nq,szf=sizeof(PetscReal),szs=sizeof(PetscScalar),szi=sizeof(PetscInt);
68: PetscInt h_ip_offset[LANDAU_MAX_GRIDS+1],h_ipf_offset[LANDAU_MAX_GRIDS+1],h_elem_offset[LANDAU_MAX_GRIDS+1],nip,IPfdf_sz,Nf;
69: PetscDS prob;
71: DMGetDimension(plex, &dim);
72: DMGetDS(plex, &prob);
74: PetscDSGetTabulation(prob, &Tf);
75: BB = Tf[0]->T[0]; DD = Tf[0]->T[1];
76: Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
77: nip = 0;
78: IPfdf_sz = 0;
79: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
80: PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
81: h_elem_offset[grid+1] = h_elem_offset[grid] + a_numCells[grid];
82: nip += a_numCells[grid]*Nq;
83: h_ip_offset[grid+1] = nip;
84: IPfdf_sz += Nq*nfloc*a_numCells[grid];
85: h_ipf_offset[grid+1] = IPfdf_sz;
86: }
87: Nf = a_species_offset[num_grids];
88: {
89: cudaMalloc((void **)&SData_d->B, Nq*Nb*szf); // kernel input
90: cudaMemcpy( SData_d->B, BB, Nq*Nb*szf, cudaMemcpyHostToDevice);
91: cudaMalloc((void **)&SData_d->D, Nq*Nb*dim*szf); // kernel input
92: cudaMemcpy( SData_d->D, DD, Nq*Nb*dim*szf, cudaMemcpyHostToDevice);
94: cudaMalloc((void **)&SData_d->alpha, Nf*szf); // kernel input
95: cudaMalloc((void **)&SData_d->beta, Nf*szf); // kernel input
96: cudaMalloc((void **)&SData_d->invMass, Nf*szf); // kernel input
98: cudaMemcpy(SData_d->alpha, nu_alpha, Nf*szf, cudaMemcpyHostToDevice);
99: cudaMemcpy(SData_d->beta, nu_beta, Nf*szf, cudaMemcpyHostToDevice);
100: cudaMemcpy(SData_d->invMass,a_invMass,Nf*szf, cudaMemcpyHostToDevice);
102: // collect geometry
103: cudaMalloc((void **)&SData_d->invJ, nip*dim*dim*szf); // kernel input
104: cudaMemcpy(SData_d->invJ, a_invJ, nip*dim*dim*szf, cudaMemcpyHostToDevice);
105: cudaMalloc((void **)&SData_d->x, nip*szf); // kernel input
106: cudaMemcpy( SData_d->x, a_x, nip*szf, cudaMemcpyHostToDevice);
107: cudaMalloc((void **)&SData_d->y, nip*szf); // kernel input
108: cudaMemcpy( SData_d->y, a_y, nip*szf, cudaMemcpyHostToDevice);
109: #if LANDAU_DIM==3
110: cudaMalloc((void **)&SData_d->z, nip*szf); // kernel input
111: cudaMemcpy( SData_d->z, a_z, nip*szf, cudaMemcpyHostToDevice);
112: #endif
113: cudaMalloc((void **)&SData_d->w, nip*szf); // kernel input
114: cudaMemcpy( SData_d->w, a_w, nip*szf, cudaMemcpyHostToDevice);
116: cudaMalloc((void **)&SData_d->NCells, num_grids*szi);
117: cudaMemcpy( SData_d->NCells, a_numCells, num_grids*szi, cudaMemcpyHostToDevice);
118: cudaMalloc((void **)&SData_d->species_offset, (num_grids+1)*szi);
119: cudaMemcpy( SData_d->species_offset, a_species_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);
120: cudaMalloc((void **)&SData_d->mat_offset, (num_grids+1)*szi);
121: cudaMemcpy( SData_d->mat_offset, a_mat_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);
122: cudaMalloc((void **)&SData_d->ip_offset, (num_grids+1)*szi);
123: cudaMemcpy( SData_d->ip_offset, h_ip_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);
124: cudaMalloc((void **)&SData_d->ipf_offset, (num_grids+1)*szi);
125: cudaMemcpy( SData_d->ipf_offset, h_ipf_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);
126: cudaMalloc((void **)&SData_d->elem_offset, (num_grids+1)*szi);
127: cudaMemcpy( SData_d->elem_offset, h_elem_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);
128: cudaMalloc((void**)&SData_d->maps, num_grids*sizeof(P4estVertexMaps*));
129: // allocate space for dynamic data once
130: cudaMalloc((void **)&SData_d->Eq_m, Nf*szf); // this could be for each vertex (todo?)
131: cudaMalloc((void **)&SData_d->f, nip*Nf*szs*batch_sz); // for each vertex in batch
132: cudaMalloc((void **)&SData_d->dfdx, nip*Nf*szs*batch_sz);
133: cudaMalloc((void **)&SData_d->dfdy, nip*Nf*szs*batch_sz);
134: #if LANDAU_DIM==3
135: cudaMalloc((void **)&SData_d->dfdz, nip*Nf*szs*batch_sz);
136: #endif
137: }
138: return 0;
139: }
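// LandauCUDAStaticDataClear: free everything allocated in LandauCUDAStaticDataSet;
// SData_d->alpha doubles as the "static data has been set" flag.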
141: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
142: {
143: if (SData_d->alpha) {
144: cudaFree(SData_d->alpha);
145: SData_d->alpha = NULL;
146: cudaFree(SData_d->beta);
147: cudaFree(SData_d->invMass);
148: cudaFree(SData_d->B);
149: cudaFree(SData_d->D);
150: cudaFree(SData_d->invJ);
151: #if LANDAU_DIM==3
152: cudaFree(SData_d->z);
153: #endif
154: cudaFree(SData_d->x);
155: cudaFree(SData_d->y);
156: cudaFree(SData_d->w);
157: // dynamic data
158: cudaFree(SData_d->Eq_m);
159: cudaFree(SData_d->f);
160: cudaFree(SData_d->dfdx);
161: cudaFree(SData_d->dfdy);
162: #if LANDAU_DIM==3
163: cudaFree(SData_d->dfdz);
164: #endif
165: cudaFree(SData_d->NCells);
166: cudaFree(SData_d->species_offset);
167: cudaFree(SData_d->mat_offset);
168: cudaFree(SData_d->ip_offset);
169: cudaFree(SData_d->ipf_offset);
170: cudaFree(SData_d->elem_offset);
171: cudaFree(SData_d->maps);
172: }
173: return 0;
174: }
175: //
176: // The GPU Landau kernel
177: //
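// landau_form_fdf: one thread block per (batch vertex, element), with blockDim = (vector
// length, Nq). Each quadrature point (threadIdx.y) evaluates f, df/dx, df/dy (and df/dz in
// 3D) from the element closure coefficients, gathering those coefficients through the
// assembly maps when GPU assembly is active, or directly from d_vertex_f otherwise.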
178: __global__
179: void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_invJ[],
180: const PetscReal * const BB, const PetscReal * const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[],
181: PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
182: #if LANDAU_DIM==3
183: PetscReal d_dfdz[],
184: #endif
185: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
186: {
187: const PetscInt Nq = blockDim.y, myQi = threadIdx.y;
188: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x, IPf_sz_glb = d_ipf_offset[num_grids];
189: const PetscReal *Bq = &BB[myQi*Nb], *Dq = &DD[myQi*Nb*dim];
190: PetscInt grid = 0, f,d,b,e,q;
191: while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
192: {
193: const PetscInt loc_nip = d_numCells[grid]*Nq, loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
194: const PetscInt moffset = LAND_MOFFSET(b_id,grid,gridDim.x,num_grids,d_mat_offset);
195: const PetscScalar *coef;
196: PetscReal u_x[LANDAU_DIM];
197: const PetscReal *invJ = &d_invJ[(d_ip_offset[grid] + loc_elem*Nq + myQi)*dim*dim];
198: PetscScalar coef_buff[LANDAU_MAX_SPECIES*LANDAU_MAX_NQ];
199: if (!d_maps) {
200: coef = &d_vertex_f[b_id*IPf_sz_glb + d_ipf_offset[grid] + loc_elem*Nb*loc_Nf]; // closure and IP indexing are the same
201: } else {
202: coef = coef_buff;
203: for (f = 0; f < loc_Nf ; ++f) {
204: LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][f][0];
205: for (b = 0; b < Nb; ++b) {
206: PetscInt idx = Idxs[b];
207: if (idx >= 0) {
208: coef_buff[f*Nb+b] = d_vertex_f[idx+moffset];
209: } else {
210: idx = -idx - 1;
211: coef_buff[f*Nb+b] = 0;
212: for (q = 0; q < d_maps[grid]->num_face; q++) {
213: PetscInt id = d_maps[grid]->c_maps[idx][q].gid;
214: PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
215: if (id >= 0) coef_buff[f*Nb+b] += scale*d_vertex_f[id+moffset];
216: }
217: }
218: }
219: }
220: }
222: /* get f and df */
223: for (f = threadIdx.x; f < loc_Nf; f += blockDim.x) {
224: PetscReal refSpaceDer[LANDAU_DIM];
225: const PetscInt idx = b_id*IPf_sz_glb + d_ipf_offset[grid] + f*loc_nip + loc_elem*Nq + myQi;
226: d_f[idx] = 0.0;
227: for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
228: for (b = 0; b < Nb; ++b) {
229: const PetscInt cidx = b;
230: d_f[idx] += Bq[cidx]*PetscRealPart(coef[f*Nb+cidx]);
231: for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx*dim+d]*PetscRealPart(coef[f*Nb+cidx]);
232: }
233: for (d = 0; d < dim; ++d) {
234: for (e = 0, u_x[d] = 0.0; e < dim; ++e) {
235: u_x[d] += invJ[e*dim+d]*refSpaceDer[e];
236: }
237: //printf("%d.%d.%d.%d %d) u_x[%d]=%g\n",b_id,grid,loc_elem,myQi,idx,d,u_x[d]);
238: }
239: d_dfdx[idx] = u_x[0];
240: d_dfdy[idx] = u_x[1];
241: #if LANDAU_DIM==3
242: d_dfdz[idx] = u_x[2];
243: #endif
244: }
245: }
246: }
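// landau_jac_kernel: device worker for one element. It accumulates the inner Landau
// integral (the gg2/gg3 tensors) over the integration points of all grids using the
// shared-memory copies of f and its derivatives, reduces the partial sums across the
// vector (x) threads with warp shuffles, applies the inverse-Jacobian transform and
// quadrature weight, and finally either accumulates an element matrix (elemMat) or
// scatters directly into the device matrix with MatSetValuesDevice.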
248: __device__ void
249: landau_jac_kernel(const PetscInt num_grids, const PetscInt jpidx, PetscInt nip_global, const PetscInt grid,
250: const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[],
251: const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
252: const PetscReal * const BB, const PetscReal * const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
253: PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
254: PetscReal s_scale[][LANDAU_MAX_Q_FACE],
255: PetscInt s_idx[][LANDAU_MAX_Q_FACE],
256: PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
257: PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
258: PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
259: PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
260: PetscReal s_nu_alpha[],
261: PetscReal s_nu_beta[],
262: PetscReal s_invMass[],
263: PetscReal s_f[],
264: PetscReal s_dfx[],
265: PetscReal s_dfy[],
266: PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
267: #if LANDAU_DIM==3
268: const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
269: #endif
270: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
271: {
272: const PetscInt Nq = blockDim.y, myQi = threadIdx.y;
273: const PetscInt b_elem_idx=blockIdx.y, b_id=blockIdx.x, IPf_sz_glb=d_ipf_offset[num_grids];
274: const PetscInt loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
275: const PetscInt moffset = LAND_MOFFSET(b_id,grid,gridDim.x,num_grids,d_mat_offset);
276: int delta,d,f,g,d2,dp,d3,fieldA,ipidx_b;
277: PetscReal gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
278: #if LANDAU_DIM==2
279: const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
280: constexpr int dim = 2;
281: #else
282: const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
283: constexpr int dim = 3;
284: #endif
285: const PetscInt f_off = d_species_offset[grid], Nb=Nq;
286: // create g2 & g3
287: for (f=threadIdx.x; f<loc_Nf; f+=blockDim.x) {
288: for (d=0;d<dim;d++) { // clear accumulation data D & K
289: s_gg2[d][myQi][f] = 0;
290: for (d2=0;d2<dim;d2++) s_gg3[d][d2][myQi][f] = 0;
291: }
292: }
293: #pragma unroll
294: for (d2 = 0; d2 < dim; d2++) {
295: gg2_temp[d2] = 0;
296: #pragma unroll
297: for (d3 = 0; d3 < dim; d3++) {
298: gg3_temp[d2][d3] = 0;
299: }
300: }
301: if (threadIdx.y == 0) {
302: // copy species into shared memory
303: for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
304: s_nu_alpha[fieldA] = nu_alpha[fieldA];
305: s_nu_beta[fieldA] = nu_beta[fieldA];
306: s_invMass[fieldA] = invMass[fieldA];
307: }
308: }
309: __syncthreads();
310: // inner integral, collect gg2/3
311: for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
312: const PetscInt ipidx = ipidx_b + threadIdx.x;
313: PetscInt f_off_r,grid_r,loc_Nf_r,nip_loc_r,ipidx_g,fieldB,IPf_idx_r;
314: __syncthreads();
315: if (ipidx < nip_global) {
316: grid_r = 0;
317: while (ipidx >= d_ip_offset[grid_r+1]) grid_r++;
318: f_off_r = d_species_offset[grid_r];
319: ipidx_g = ipidx - d_ip_offset[grid_r];
320: nip_loc_r = d_numCells[grid_r]*Nq;
321: loc_Nf_r = d_species_offset[grid_r+1] - d_species_offset[grid_r];
322: IPf_idx_r = b_id*IPf_sz_glb + d_ipf_offset[grid_r] + ipidx_g;
323: for (fieldB = threadIdx.y; fieldB < loc_Nf_r ; fieldB += blockDim.y) {
324: const PetscInt idx = IPf_idx_r + fieldB*nip_loc_r;
325: s_f [fieldB*blockDim.x + threadIdx.x] = d_f[idx]; // all vector threads get copy of data
326: //printf("\t\td_f %d.%d.%d.%d.%d) %d) d_f[%d]=%e == s_f[%d]\n",b_id,grid,loc_elem,myQi,fieldB,ipidx,idx,d_f[idx],fieldB*blockDim.x + threadIdx.x);
327: s_dfx[fieldB*blockDim.x + threadIdx.x] = d_dfdx[idx];
328: s_dfy[fieldB*blockDim.x + threadIdx.x] = d_dfdy[idx];
329: #if LANDAU_DIM==3
330: s_dfz[fieldB*blockDim.x + threadIdx.x] = d_dfdz[idx];
331: #endif
332: }
333: }
334: __syncthreads();
335: if (ipidx < nip_global) {
336: const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
337: PetscReal temp1[3] = {0, 0, 0}, temp2 = 0;
338: #if LANDAU_DIM==2
339: PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
340: LandauTensor2D(vj, x, y, Ud, Uk, mask);
341: #else
342: PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2]-z) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
343: LandauTensor3D(vj, x, y, z, U, mask);
344: #endif
345: for (int fieldB = 0; fieldB < loc_Nf_r; fieldB++) {
346: temp1[0] += s_dfx[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
347: temp1[1] += s_dfy[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
348: #if LANDAU_DIM==3
349: temp1[2] += s_dfz[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
350: #endif
351: temp2 += s_f [fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r];
352: //printf("\t\t\t temp2 %d.%d.%d * %d * %d: ipidx %d temp2=%e\n",b_id,grid,loc_elem,myQi,fieldB, ipidx, temp2);
353: }
354: temp1[0] *= wi;
355: temp1[1] *= wi;
356: #if LANDAU_DIM==3
357: temp1[2] *= wi;
358: #endif
359: temp2 *= wi;
360: #if LANDAU_DIM==2
361: #pragma unroll
362: for (d2 = 0; d2 < 2; d2++) {
363: #pragma unroll
364: for (d3 = 0; d3 < 2; ++d3) {
365: /* K = U * grad(f): g2=e: i,A */
366: gg2_temp[d2] += Uk[d2][d3]*temp1[d3];
367: /* D = -U * (I \kron (fx)): g3=f: i,j,A */
368: gg3_temp[d2][d3] += Ud[d2][d3]*temp2;
369: }
370: }
371: #else
372: #pragma unroll
373: for (d2 = 0; d2 < 3; ++d2) {
374: #pragma unroll
375: for (d3 = 0; d3 < 3; ++d3) {
376: /* K = U * grad(f): g2 = e: i,A */
377: gg2_temp[d2] += U[d2][d3]*temp1[d3];
378: /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
379: gg3_temp[d2][d3] += U[d2][d3]*temp2;
380: }
381: }
382: #endif
383: }
384: } /* IPs */
386: /* reduce gg temp sums across threads */
387: for (delta = blockDim.x/2; delta > 0; delta /= 2) {
388: #pragma unroll
389: for (d2 = 0; d2 < dim; d2++) {
390: gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
391: #pragma unroll
392: for (d3 = 0; d3 < dim; d3++) {
393: gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
394: }
395: }
396: }
397: // add alpha and put in gg2/3
398: for (fieldA = threadIdx.x; fieldA < loc_Nf; fieldA += blockDim.x) {
399: #pragma unroll
400: for (d2 = 0; d2 < dim; d2++) {
401: s_gg2[d2][myQi][fieldA] += gg2_temp[d2]*s_nu_alpha[fieldA+f_off];
402: #pragma unroll
403: for (d3 = 0; d3 < dim; d3++) {
404: s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3]*s_nu_alpha[fieldA+f_off]*s_invMass[fieldA+f_off];
405: }
406: }
407: }
408: __syncthreads();
409: /* add electric field term once per IP */
410: for (fieldA = threadIdx.x ; fieldA < loc_Nf ; fieldA += blockDim.x) {
411: s_gg2[dim-1][myQi][fieldA] += Eq_m[fieldA+f_off];
412: }
413: __syncthreads();
414: /* Jacobian transform - g2 */
415: for (fieldA = threadIdx.x ; fieldA < loc_Nf ; fieldA += blockDim.x) {
416: PetscReal wj = ww[jpidx];
417: for (d = 0; d < dim; ++d) {
418: s_g2[d][myQi][fieldA] = 0.0;
419: for (d2 = 0; d2 < dim; ++d2) {
420: s_g2[d][myQi][fieldA] += invJj[d*dim+d2]*s_gg2[d2][myQi][fieldA];
421: s_g3[d][d2][myQi][fieldA] = 0.0;
422: for (d3 = 0; d3 < dim; ++d3) {
423: for (dp = 0; dp < dim; ++dp) {
424: s_g3[d][d2][myQi][fieldA] += invJj[d*dim + d3]*s_gg3[d3][dp][myQi][fieldA]*invJj[d2*dim + dp];
425: }
426: }
427: s_g3[d][d2][myQi][fieldA] *= wj;
428: }
429: s_g2[d][myQi][fieldA] *= wj;
430: }
431: }
432: __syncthreads(); // Synchronize (ensure all the data is available) and sum IP matrices
433: /* FE matrix construction */
434: {
435: int fieldA,d,qj,d2,q,idx,totDim=Nb*loc_Nf;
436: /* assemble */
437: for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
438: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
439: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
440: PetscScalar t = 0;
441: for (qj = 0 ; qj < Nq ; qj++) {
442: const PetscReal *BJq = &BB[qj*Nb], *DIq = &DD[qj*Nb*dim];
443: for (d = 0; d < dim; ++d) {
444: t += DIq[f*dim+d]*s_g2[d][qj][fieldA]*BJq[g];
445: for (d2 = 0; d2 < dim; ++d2) {
446: t += DIq[f*dim + d]*s_g3[d][d2][qj][fieldA]*DIq[g*dim + d2];
447: }
448: }
449: }
450: if (elemMat) {
451: const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
452: elemMat[fOff] += t; // ????
453: } else s_fieldMats[f][g] = t;
454: }
455: }
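// GPU assembly path: s_fieldMats holds the element block for this field; translate the
// local indices through the point maps (a constrained point expands into num_face
// row/column entries with scales) and add the scaled values directly into the device matrix.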
456: if (s_fieldMats) {
457: PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
458: PetscInt nr,nc;
459: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
460: __syncthreads();
461: if (threadIdx.y == 0) {
462: for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
463: idx = Idxs[f];
464: if (idx >= 0) {
465: s_idx[f][0] = idx + moffset;
466: s_scale[f][0] = 1.;
467: } else {
468: idx = -idx - 1;
469: for (q = 0; q < d_maps[grid]->num_face; q++) {
470: if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
471: else s_idx[f][q] = -1;
472: s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
473: }
474: }
475: }
476: }
477: __syncthreads();
478: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
479: idx = Idxs[f];
480: if (idx >= 0) {
481: nr = 1;
482: } else {
483: nr = d_maps[grid]->num_face;
484: }
485: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
486: idx = Idxs[g];
487: if (idx >= 0) {
488: nc = 1;
489: } else {
490: nc = d_maps[grid]->num_face;
491: }
492: for (q = 0; q < nr; q++) {
493: for (d = 0; d < nc; d++) {
494: vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
495: }
496: }
497: MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
498: }
499: }
500: __syncthreads();
501: }
502: }
503: }
504: }
506: //
507: // The CUDA Landau kernel
508: //
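// landau_jacobian: carve the dynamic shared memory into the work arrays, locate and zero
// this block's element matrix inside d_elem_mats when CPU assembly is being used, then
// hand off to landau_jac_kernel.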
509: __global__
510: void __launch_bounds__(256,2)
511: landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal invJj[],
512: const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
513: const PetscReal * const BB, const PetscReal * const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[],
514: PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
515: #if LANDAU_DIM==3
516: const PetscReal zz[], PetscReal d_dfdz[],
517: #endif
518: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
519: {
520: extern __shared__ PetscReal smem[];
521: int size = 0;
522: PetscReal (*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
523: (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
524: size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
525: PetscReal (*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
526: (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
527: size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
528: PetscReal (*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
529: (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
530: size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
531: PetscReal (*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
532: (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
533: size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
534: PetscReal *s_nu_alpha = &smem[size];
535: size += LANDAU_MAX_SPECIES;
536: PetscReal *s_nu_beta = &smem[size];
537: size += LANDAU_MAX_SPECIES;
538: PetscReal *s_invMass = &smem[size];
539: size += LANDAU_MAX_SPECIES;
540: PetscReal *s_f = &smem[size];
541: size += blockDim.x*LANDAU_MAX_SPECIES;
542: PetscReal *s_dfx = &smem[size];
543: size += blockDim.x*LANDAU_MAX_SPECIES;
544: PetscReal *s_dfy = &smem[size];
545: size += blockDim.x*LANDAU_MAX_SPECIES;
546: #if LANDAU_DIM==3
547: PetscReal *s_dfz = &smem[size];
548: size += blockDim.x*LANDAU_MAX_SPECIES;
549: #endif
550: PetscScalar (*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
551: PetscReal (*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
552: PetscInt (*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
553: const PetscInt b_elem_idx = blockIdx.y, b_id = blockIdx.x;
554: PetscInt Nq = blockDim.y, grid = 0; // Nq == Nb
555: PetscScalar *elemMat = NULL; /* my output */
557: while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
558: {
559: const PetscInt loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
560: const PetscInt myQi = threadIdx.y;
561: const PetscInt jpidx = d_ip_offset[grid] + myQi + loc_elem * Nq;
562: const PetscReal *invJ = &invJj[jpidx*dim*dim];
563: if (d_elem_mats) {
564: PetscInt totDim = loc_Nf*Nb;
566: elemMat = d_elem_mats; // start at the beginning and get to my element matrix
566: for (PetscInt b_id2=0 ; b_id2<b_id ; b_id2++) {
567: for (PetscInt grid2=0 ; grid2<num_grids ; grid2++) {
568: PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
569: elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
570: }
571: }
572: for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
573: PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
574: elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
575: }
576: elemMat += loc_elem*totDim*totDim; // index into local matrix & zero out
577: for (int i = threadIdx.x + threadIdx.y*blockDim.x; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
578: //if (threadIdx.x) printf("\t %d.%d.%d landau_jacobian elem mat idx = %d\n",b_id,grid,loc_elem,(int)(elemMat-d_elem_mats));
579: }
580: __syncthreads();
581: if (d_maps) {
582: // reuse the space for fieldMats
583: s_fieldMats = (PetscScalar (*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) &smem[size];
584: size += LANDAU_MAX_NQ*LANDAU_MAX_NQ;
585: s_scale = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
586: size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
587: s_idx = (PetscInt (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
588: size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
589: } else {
590: s_fieldMats = NULL;
591: }
592: __syncthreads();
593: landau_jac_kernel(num_grids, jpidx, nip_global, grid, xx, yy, ww,
594: invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD,
595: elemMat, d_maps, d_mat,
596: *s_fieldMats, *s_scale, *s_idx,
597: *s_g2, *s_g3, *s_gg2, *s_gg3,
598: s_nu_alpha, s_nu_beta, s_invMass,
599: s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
600: #if LANDAU_DIM==3
601: zz, s_dfz, d_dfdz,
602: #endif
603: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
604: }
605: }
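// landau_mass: assemble the (per-species block diagonal) mass matrix, either into element
// matrices or directly into the device matrix. Each entry is sum_qj B_f(qj)*w(qj)*shift*B_g(qj);
// in 2D the product is additionally scaled by 2*pi, presumably the azimuthal factor of the
// cylindrical velocity coordinates.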
607: __global__
608: void __launch_bounds__(256,4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscInt num_grids, const PetscReal d_w[], const PetscReal * const BB, const PetscReal * const DD,
609: PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift,
610: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_elem_offset[])
611: {
612: const PetscInt Nq = blockDim.y, b_elem_idx = blockIdx.y, b_id = blockIdx.x;
613: __shared__ PetscScalar s_fieldMats[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
614: __shared__ PetscInt s_idx[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
615: __shared__ PetscReal s_scale[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
616: PetscScalar *elemMat = NULL; /* my output */
617: PetscInt fieldA,d,qj,q,idx,f,g, grid = 0;
619: while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
620: {
621: const PetscInt loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
622: const PetscInt moffset = LAND_MOFFSET(b_id,grid,gridDim.x,num_grids,d_mat_offset), totDim = loc_Nf*Nq;
623: if (d_elem_mats) {
624: elemMat = d_elem_mats; // start at the beginning
625: for (PetscInt b_id2=0 ; b_id2<b_id ; b_id2++) {
626: for (PetscInt grid2=0 ; grid2<num_grids ; grid2++) {
627: PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
628: elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
629: }
630: }
631: for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
632: PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
633: elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
634: }
635: elemMat += loc_elem*totDim*totDim;
636: for (int i = threadIdx.x + threadIdx.y*blockDim.x ; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
637: }
638: __syncthreads();
639: /* FE mass matrix construction */
640: for (fieldA = 0; fieldA < loc_Nf; fieldA++) {
641: PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
642: PetscInt nr,nc;
643: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
644: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
645: PetscScalar t = 0;
646: for (qj = 0 ; qj < Nq ; qj++) {
647: const PetscReal *BJq = &BB[qj*Nb];
648: const PetscInt jpidx = d_ip_offset[grid] + qj + loc_elem * Nq;
649: if (dim==2) {
650: t += BJq[f] * d_w[jpidx]*shift * BJq[g] * 2. * PETSC_PI;
651: } else {
652: t += BJq[f] * d_w[jpidx]*shift * BJq[g];
653: }
654: }
655: if (elemMat) {
656: const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
657: elemMat[fOff] += t; // ????
658: } else s_fieldMats[f][g] = t;
659: }
660: }
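// GPU assembly: the same index/scale translation and MatSetValuesDevice scatter as in the
// Jacobian kernel above.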
661: if (!elemMat) {
662: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[loc_elem][fieldA][0];
663: __syncthreads();
664: if (threadIdx.y == 0) {
665: for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
666: idx = Idxs[f];
667: if (idx >= 0) {
668: s_idx[f][0] = idx + moffset;
669: s_scale[f][0] = 1.;
670: } else {
671: idx = -idx - 1;
672: for (q = 0; q < d_maps[grid]->num_face; q++) {
673: if (d_maps[grid]->c_maps[idx][q].gid >= 0) s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
674: else s_idx[f][q] = -1;
675: s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
676: }
677: }
678: }
679: }
680: __syncthreads();
681: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
682: idx = Idxs[f];
683: if (idx >= 0) {
684: nr = 1;
685: } else {
686: nr = d_maps[grid]->num_face;
687: }
688: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
689: idx = Idxs[g];
690: if (idx >= 0) {
691: nc = 1;
692: } else {
693: nc = d_maps[grid]->num_face;
694: }
695: for (q = 0; q < nr; q++) {
696: for (d = 0; d < nc; d++) {
697: vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
698: }
699: }
700: MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
701: }
702: }
703: }
704: __syncthreads();
705: }
706: }
707: }
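// LandauCUDAJacobian: host driver. It stages the dynamic inputs, forms f and its gradients,
// then launches either the Jacobian kernel or the mass kernel. When GPU assembly is not
// active (e.g. the first call, before the assembly maps are attached to JacP) the element
// matrices are copied back and assembled on the CPU with DMPlexMatSetClosure.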
709: PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[],
710: const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscReal shift,
711: const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
712: {
713: cudaError_t cerr;
714: PetscInt Nb=Nq,dim,nip_global,num_cells_batch,elem_mat_size_tot;
715: PetscInt *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
716: PetscInt szf=sizeof(PetscReal),szs=sizeof(PetscScalar),Nftot=a_species_offset[num_grids];
717: PetscReal *d_BB=NULL,*d_DD=NULL,*d_invJj=NULL,*d_nu_alpha=NULL,*d_nu_beta=NULL,*d_invMass=NULL,*d_Eq_m=NULL,*d_x=NULL,*d_y=NULL,*d_w=NULL;
718: PetscScalar *d_elem_mats=NULL,*d_vertex_f=NULL;
719: PetscReal *d_f=NULL,*d_dfdx=NULL,*d_dfdy=NULL;
720: #if LANDAU_DIM==3
721: PetscReal *d_dfdz=NULL, *d_z = NULL;
722: #endif
723: LandauCtx *ctx;
724: PetscSplitCSRDataStructure d_mat=NULL;
725: P4estVertexMaps **d_maps,*maps[LANDAU_MAX_GRIDS];
726: int nnn = 256/Nq; // machine dependent
727: PetscContainer container;
729: PetscLogEventBegin(events[3],0,0,0,0);
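// round the vector (x) block size down to a power of two (blockDim.x is passed as the
// shuffle width in landau_jac_kernel, and CUDA requires that width to be a power of two),
// then cap it at 16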
730: while (nnn & nnn - 1) nnn = nnn & nnn - 1;
731: if (nnn>16) nnn = 16; // printf("DEBUG\n");
732: DMGetApplicationContext(plex[0], &ctx);
734: DMGetDimension(plex[0], &dim);
736: if (ctx->gpu_assembly) {
737: PetscObjectQuery((PetscObject) JacP, "assembly_maps", (PetscObject *) &container);
738: if (container) { // not taken on the first call
739: static int init = 0; // hack. just do every time, or put in setup (but that is in base class code), or add init_maps flag
740: if (!init++) {
741: P4estVertexMaps *h_maps=NULL;
742: PetscContainerGetPointer(container, (void **) &h_maps);
743: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
744: if (h_maps[grid].d_self) {
745: maps[grid] = h_maps[grid].d_self;
746: } else {
747: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
748: }
749: }
750: cudaMemcpy(SData_d->maps, maps, num_grids*sizeof(P4estVertexMaps*), cudaMemcpyHostToDevice);
751: }
752: d_maps = (P4estVertexMaps**)SData_d->maps;
753: // this does the setup the first time called
754: MatCUSPARSEGetDeviceMatWrite(JacP,&d_mat);
755: } else {
756: d_maps = NULL;
757: }
758: } else {
759: container = NULL;
760: d_maps = NULL;
761: }
762: PetscLogEventEnd(events[3],0,0,0,0);
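// count integration points and cells in the batch, and the element-matrix storage needed
// per batch vertex if assembling on the CPU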
763: {
764: PetscInt elem_mat_size = 0;
765: nip_global = num_cells_batch = 0;
766: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
767: PetscInt Nfloc = a_species_offset[grid+1] - a_species_offset[grid], totDim = Nfloc*Nb;
768: nip_global += a_numCells[grid]*Nq;
769: num_cells_batch += a_numCells[grid]; // is in d_elem_offset, but not on host
770: elem_mat_size += a_numCells[grid]*totDim*totDim; // could save in an offset here -- batch major ordering
771: }
772: elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
773: }
774: dim3 dimGrid(batch_sz,num_cells_batch);
775: if (elem_mat_size_tot) {
776: cudaMalloc((void **)&d_elem_mats, batch_sz*elem_mat_size_tot*szs); // kernel output - first call is on CPU
777: } else d_elem_mats = NULL;
778: // create data
779: d_BB = (PetscReal*)SData_d->B;
780: d_DD = (PetscReal*)SData_d->D;
781: if (a_elem_closure || a_xarray) { // form f and df
782: PetscLogEventBegin(events[1],0,0,0,0);
783: cudaMemcpy(SData_d->Eq_m, a_Eq_m, Nftot*szf, cudaMemcpyHostToDevice);
784: d_invJj = (PetscReal*)SData_d->invJ;
785: d_nu_alpha = (PetscReal*)SData_d->alpha;
786: d_nu_beta = (PetscReal*)SData_d->beta;
787: d_invMass = (PetscReal*)SData_d->invMass;
788: d_x = (PetscReal*)SData_d->x;
789: d_y = (PetscReal*)SData_d->y;
790: d_w = (PetscReal*)SData_d->w;
791: d_Eq_m = (PetscReal*)SData_d->Eq_m;
792: d_dfdx = (PetscReal*)SData_d->dfdx;
793: d_dfdy = (PetscReal*)SData_d->dfdy;
794: #if LANDAU_DIM==3
795: d_dfdz = (PetscReal*)SData_d->dfdz;
796: d_z = (PetscReal*)SData_d->z;
797: #endif
798: d_f = (PetscReal*)SData_d->f;
799: // get a d_vertex_f
800: if (a_elem_closure) {
801: PetscInt closure_sz = 0; // argh, don't have this on the host!!!
802: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
803: PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
804: closure_sz += Nq*nfloc*a_numCells[grid];
805: }
806: closure_sz *= batch_sz;
807: cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure));
808: cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz*sizeof(*a_elem_closure), cudaMemcpyHostToDevice);
809: } else {
810: d_vertex_f = (PetscScalar*)a_xarray;
811: }
812: PetscLogEventEnd(events[1],0,0,0,0);
813: } else {
814: d_w = (PetscReal*)SData_d->w; // mass just needs the weights
815: }
816: //
817: d_numCells = (PetscInt*)SData_d->NCells; // redundant -- remove
818: d_species_offset = (PetscInt*)SData_d->species_offset;
819: d_mat_offset = (PetscInt*)SData_d->mat_offset;
820: d_ip_offset = (PetscInt*)SData_d->ip_offset;
821: d_ipf_offset = (PetscInt*)SData_d->ipf_offset;
822: d_elem_offset = (PetscInt*)SData_d->elem_offset;
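// launch configuration: gridDim = (batch size, total cells over all grids); blockDim.y = Nq
// and blockDim.x is the vector length chosen above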
823: if (a_elem_closure || a_xarray) { // form f and df
824: dim3 dimBlockFDF(nnn>Nftot ? Nftot : nnn, Nq), dimBlock((nip_global>nnn) ? nnn : nip_global, Nq);
825: PetscLogEventBegin(events[8],0,0,0,0);
826: PetscLogGpuTimeBegin();
827: PetscInfo(plex[0], "Form F and dF/dx vectors: nip_global=%D num_grids=%D\n",nip_global,num_grids);
828: landau_form_fdf<<<dimGrid,dimBlockFDF>>>(dim, Nb, num_grids, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
829: #if LANDAU_DIM==3
830: d_dfdz,
831: #endif
832: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
833: CHECK_LAUNCH_ERROR();
834: PetscLogGpuFlops(batch_sz*nip_global*(PetscLogDouble)(2*Nb*(1+dim)));
835: if (a_elem_closure) {
836: cudaFree(d_vertex_f);
837: d_vertex_f = NULL;
838: }
839: PetscLogGpuTimeEnd();
840: PetscLogEventEnd(events[8],0,0,0,0);
841: // Jacobian
842: PetscLogEventBegin(events[4],0,0,0,0);
843: PetscLogGpuTimeBegin();
844: PetscLogGpuFlops(batch_sz*nip_global*(PetscLogDouble)(a_elem_closure ? (nip_global*(11*Nftot+ 4*dim*dim) + 6*Nftot*dim*dim*dim + 10*Nftot*dim*dim + 4*Nftot*dim + Nb*Nftot*Nb*Nq*dim*dim*5) : Nb*Nftot*Nb*Nq*4));
845: PetscInt ii = 2*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM) + 3*LANDAU_MAX_SPECIES + (1+LANDAU_DIM)*dimBlock.x*LANDAU_MAX_SPECIES + LANDAU_MAX_NQ*LANDAU_MAX_NQ + 2*LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
846: if (ii*szf >= 49152) {
847: cerr = cudaFuncSetAttribute(landau_jacobian,
848: cudaFuncAttributeMaxDynamicSharedMemorySize,
849: 98304);CHKERRCUDA(cerr);
850: }
851: PetscInfo(plex[0], "Jacobian shared memory size: %D bytes, d_elem_mats=%p d_maps=%p\n",ii,d_elem_mats,d_maps);
852: landau_jacobian<<<dimGrid,dimBlock,ii*szf>>>(nip_global, dim, Nb, num_grids, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m,
853: d_BB, d_DD, d_x, d_y, d_w,
854: d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
855: #if LANDAU_DIM==3
856: d_z, d_dfdz,
857: #endif
858: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
859: CHECK_LAUNCH_ERROR(); // has sync
860: PetscLogGpuTimeEnd();
861: PetscLogEventEnd(events[4],0,0,0,0);
862: } else { // mass
863: dim3 dimBlock(nnn,Nq);
864: PetscInfo(plex[0], "Mass d_maps = %p. Nq=%d, vector size %d num_cells_batch=%d\n",d_maps,Nq,nnn,num_cells_batch);
865: PetscLogEventBegin(events[16],0,0,0,0);
866: PetscLogGpuTimeBegin();
867: landau_mass<<<dimGrid,dimBlock>>>(dim, Nb, num_grids, d_w, d_BB, d_DD, d_elem_mats,
868: d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_elem_offset);
869: CHECK_LAUNCH_ERROR(); // has sync
870: PetscLogGpuTimeEnd();
871: PetscLogEventEnd(events[16],0,0,0,0);
872: }
873: // First time assembly with or without GPU assembly
874: if (d_elem_mats) {
875: PetscInt elem_mats_idx = 0;
876: for (PetscInt b_id = 0 ; b_id < batch_sz ; b_id++) { // OpenMP (once)
877: for (PetscInt grid=0 ; grid<num_grids ; grid++) { // elem_mats_idx += totDim*totDim*a_numCells[grid];
878: const PetscInt Nfloc = a_species_offset[grid+1]-a_species_offset[grid], totDim = Nfloc*Nq;
879: PetscScalar *elemMats = NULL, *elMat;
880: PetscSection section, globalSection;
881: PetscInt cStart,cEnd,ej;
882: PetscInt moffset = LAND_MOFFSET(b_id,grid,batch_sz,num_grids,a_mat_offset), nloc, nzl, colbuf[1024], row;
883: const PetscInt *cols;
884: const PetscScalar *vals;
885: Mat B = subJ[ LAND_PACK_IDX(b_id,grid) ];
886: PetscLogEventBegin(events[5],0,0,0,0);
887: DMPlexGetHeightStratum(plex[grid],0,&cStart,&cEnd);
888: DMGetLocalSection(plex[grid], &section);
889: DMGetGlobalSection(plex[grid], &globalSection);
890: PetscMalloc1(totDim*totDim*a_numCells[grid],&elemMats);
891: cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim*totDim*a_numCells[grid]*sizeof(*elemMats), cudaMemcpyDeviceToHost);
892: PetscLogEventEnd(events[5],0,0,0,0);
893: PetscLogEventBegin(events[6],0,0,0,0);
894: for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
895: DMPlexMatSetClosure(plex[grid], section, globalSection, B, ej, elMat, ADD_VALUES);
896: if (ej==-1) {
897: int d,f;
898: PetscPrintf(PETSC_COMM_SELF,"GPU Element matrix\n");
899: for (d = 0; d < totDim; ++d) {
900: for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF," %12.5e", PetscRealPart(elMat[d*totDim + f]));
901: PetscPrintf(PETSC_COMM_SELF,"\n");
902: }
903: //exit(14);
904: }
905: }
906: PetscFree(elemMats);
907: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
908: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
909: // move nest matrix to global JacP
910: MatGetSize(B, &nloc, NULL);
911: for (int i=0 ; i<nloc ; i++) {
912: MatGetRow(B,i,&nzl,&cols,&vals);
914: for (int j=0; j<nzl; j++) colbuf[j] = cols[j] + moffset;
915: row = i + moffset;
916: MatSetValues(JacP,1,&row,nzl,colbuf,vals,ADD_VALUES);
917: MatRestoreRow(B,i,&nzl,&cols,&vals);
918: }
919: MatDestroy(&B);
920: PetscLogEventEnd(events[6],0,0,0,0);
921: elem_mats_idx += totDim*totDim*a_numCells[grid]; // this can be a stored offset?
922: } // grids
923: }
925: cudaFree(d_elem_mats);
926: }
928: return 0;
929: }