Actual source code: landaucu.cu
1: /*
2: Implements the Landau kernel
3: */
4: #include <petscconf.h>
5: #include <petsc/private/dmpleximpl.h>
6: #include <petsclandau.h>
7: #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
8: #include <../src/mat/impls/aij/seq/aij.h>
9: #include <petscmat.h>
10: #include <petscdevice.h>
12: #include "../land_tensors.h"
13: #include <petscaijdevice.h>
15: #define CHECK_LAUNCH_ERROR() \
16: do { \
17: /* Check synchronous errors, i.e. pre-launch */ \
18: cudaError_t err = cudaGetLastError(); \
19: if (cudaSuccess != err) { \
20: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
21: } \
22: /* Check asynchronous errors, i.e. kernel failed (ULF) */ \
23: err = cudaDeviceSynchronize(); \
24: if (cudaSuccess != err) { \
25: SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(err)); \
26: } \
27: } while (0)
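// Note: CHECK_LAUNCH_ERROR() follows every kernel launch below; the cudaDeviceSynchronize() in it
// makes the launches blocking so asynchronous kernel failures surface at the launch site.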
29: PETSC_EXTERN PetscErrorCode LandauCUDACreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE],
30: PetscInt Nf[], PetscInt Nq, PetscInt grid)
31: {
32: P4estVertexMaps h_maps;
33: cudaError_t cerr;
35: h_maps.num_elements = maps[grid].num_elements;
36: h_maps.num_face = maps[grid].num_face;
37: h_maps.num_reduced = maps[grid].num_reduced;
38: h_maps.deviceType = maps[grid].deviceType;
39: h_maps.Nf = Nf[grid];
40: h_maps.numgrids = maps[grid].numgrids;
41: h_maps.Nq = Nq;
42: cerr = cudaMalloc((void **)&h_maps.c_maps, maps[grid].num_reduced * sizeof *pointMaps);CHKERRCUDA(cerr);
43: cerr = cudaMemcpy( h_maps.c_maps, maps[grid].c_maps, maps[grid].num_reduced * sizeof *pointMaps, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
44: cerr = cudaMalloc((void **)&h_maps.gIdx, maps[grid].num_elements * sizeof *maps[grid].gIdx);CHKERRCUDA(cerr);
45: cerr = cudaMemcpy( h_maps.gIdx, maps[grid].gIdx,maps[grid].num_elements * sizeof *maps[grid].gIdx, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
46: cerr = cudaMalloc((void **)&maps[grid].d_self, sizeof(P4estVertexMaps));CHKERRCUDA(cerr);
47: cerr = cudaMemcpy( maps[grid].d_self, &h_maps, sizeof(P4estVertexMaps), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
48: return(0);
49: }
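// The create/destroy pair mirrors one grid's assembly maps on the device: the c_maps and gIdx
// arrays are deep-copied, and a device copy of the whole P4estVertexMaps struct is kept in
// maps[grid].d_self so each kernel can be handed a single pointer per grid.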
51: PETSC_EXTERN PetscErrorCode LandauCUDADestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
52: {
53: cudaError_t cerr;
55: for (PetscInt grid=0;grid<num_grids;grid++) {
56: P4estVertexMaps *d_maps = maps[grid].d_self, h_maps;
57: cerr = cudaMemcpy(&h_maps, d_maps, sizeof(P4estVertexMaps), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
58: cerr = cudaFree(h_maps.c_maps);CHKERRCUDA(cerr);
59: cerr = cudaFree(h_maps.gIdx);CHKERRCUDA(cerr);
60: cerr = cudaFree(d_maps);CHKERRCUDA(cerr);
61: }
62: return(0);
63: }
65: PetscErrorCode LandauCUDAStaticDataSet(DM plex, const PetscInt Nq, const PetscInt num_grids, PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[],
66: PetscReal nu_alpha[], PetscReal nu_beta[], PetscReal a_invMass[], PetscReal a_invJ[],
67: PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
68: {
69: PetscErrorCode ierr;
70: PetscTabulation *Tf;
71: PetscReal *BB,*DD;
72: PetscInt dim,Nb=Nq,szf=sizeof(PetscReal),szs=sizeof(PetscScalar),szi=sizeof(PetscInt);
73: PetscInt h_ip_offset[LANDAU_MAX_GRIDS+1],h_ipf_offset[LANDAU_MAX_GRIDS+1],h_elem_offset[LANDAU_MAX_GRIDS+1],nip,IPfdf_sz,Nf;
74: PetscDS prob;
75: cudaError_t cerr;
78: DMGetDimension(plex, &dim);
79: DMGetDS(plex, &prob);
80: if (LANDAU_DIM != dim) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_PLIB, "dim %D != LANDAU_DIM %d",dim,LANDAU_DIM);
81: PetscDSGetTabulation(prob, &Tf);
82: BB = Tf[0]->T[0]; DD = Tf[0]->T[1];
83: Nf = h_ip_offset[0] = h_ipf_offset[0] = h_elem_offset[0] = 0;
84: nip = 0;
85: IPfdf_sz = 0;
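  // Per-grid prefix offsets used for indexing in the kernels: h_elem_offset[g] = first global cell
  // of grid g, h_ip_offset[g] = first integration point, h_ipf_offset[g] = first (field x IP)
  // entry of grid g in the packed f/df arrays.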
86: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
87: PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
88: h_elem_offset[grid+1] = h_elem_offset[grid] + a_numCells[grid];
89: nip += a_numCells[grid]*Nq;
90: h_ip_offset[grid+1] = nip;
91: IPfdf_sz += Nq*nfloc*a_numCells[grid];
92: h_ipf_offset[grid+1] = IPfdf_sz;
93: }
94: Nf = a_species_offset[num_grids];
95: {
96: cerr = cudaMalloc((void **)&SData_d->B, Nq*Nb*szf);CHKERRCUDA(cerr); // kernel input
97: cerr = cudaMemcpy( SData_d->B, BB, Nq*Nb*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
98: cerr = cudaMalloc((void **)&SData_d->D, Nq*Nb*dim*szf);CHKERRCUDA(cerr); // kernel input
99: cerr = cudaMemcpy( SData_d->D, DD, Nq*Nb*dim*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
101: cerr = cudaMalloc((void **)&SData_d->alpha, Nf*szf);CHKERRCUDA(cerr); // kernel input
102: cerr = cudaMalloc((void **)&SData_d->beta, Nf*szf);CHKERRCUDA(cerr); // kernel input
103: cerr = cudaMalloc((void **)&SData_d->invMass, Nf*szf);CHKERRCUDA(cerr); // kernel input
105: cerr = cudaMemcpy(SData_d->alpha, nu_alpha, Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
106: cerr = cudaMemcpy(SData_d->beta, nu_beta, Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
107: cerr = cudaMemcpy(SData_d->invMass,a_invMass,Nf*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
109: // collect geometry
110: cerr = cudaMalloc((void **)&SData_d->invJ, nip*dim*dim*szf);CHKERRCUDA(cerr); // kernel input
111: cerr = cudaMemcpy(SData_d->invJ, a_invJ, nip*dim*dim*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
112: cerr = cudaMalloc((void **)&SData_d->x, nip*szf);CHKERRCUDA(cerr); // kernel input
113: cerr = cudaMemcpy( SData_d->x, a_x, nip*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
114: cerr = cudaMalloc((void **)&SData_d->y, nip*szf);CHKERRCUDA(cerr); // kernel input
115: cerr = cudaMemcpy( SData_d->y, a_y, nip*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
116: #if LANDAU_DIM==3
117: cerr = cudaMalloc((void **)&SData_d->z, nip*szf);CHKERRCUDA(cerr); // kernel input
118: cerr = cudaMemcpy( SData_d->z, a_z, nip*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
119: #endif
120: cerr = cudaMalloc((void **)&SData_d->w, nip*szf);CHKERRCUDA(cerr); // kernel input
121: cerr = cudaMemcpy( SData_d->w, a_w, nip*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
123: cerr = cudaMalloc((void **)&SData_d->NCells, num_grids*szi);CHKERRCUDA(cerr);
124: cerr = cudaMemcpy( SData_d->NCells, a_numCells, num_grids*szi, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
125: cerr = cudaMalloc((void **)&SData_d->species_offset, (num_grids+1)*szi);CHKERRCUDA(cerr);
126: cerr = cudaMemcpy( SData_d->species_offset, a_species_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
127: cerr = cudaMalloc((void **)&SData_d->mat_offset, (num_grids+1)*szi);CHKERRCUDA(cerr);
128: cerr = cudaMemcpy( SData_d->mat_offset, a_mat_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
129: cerr = cudaMalloc((void **)&SData_d->ip_offset, (num_grids+1)*szi);CHKERRCUDA(cerr);
130: cerr = cudaMemcpy( SData_d->ip_offset, h_ip_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
131: cerr = cudaMalloc((void **)&SData_d->ipf_offset, (num_grids+1)*szi);CHKERRCUDA(cerr);
132: cerr = cudaMemcpy( SData_d->ipf_offset, h_ipf_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
133: cerr = cudaMalloc((void **)&SData_d->elem_offset, (num_grids+1)*szi);CHKERRCUDA(cerr);
134: cerr = cudaMemcpy( SData_d->elem_offset, h_elem_offset, (num_grids+1)*szi, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
135: // allocate space for dynamic data once
136: cerr = cudaMalloc((void **)&SData_d->Eq_m, Nf*szf);CHKERRCUDA(cerr);
137: cerr = cudaMalloc((void **)&SData_d->f, nip*Nf*szs);CHKERRCUDA(cerr);
138: cerr = cudaMalloc((void **)&SData_d->dfdx, nip*Nf*szs);CHKERRCUDA(cerr);
139: cerr = cudaMalloc((void **)&SData_d->dfdy, nip*Nf*szs);CHKERRCUDA(cerr);
140: #if LANDAU_DIM==3
141: cerr = cudaMalloc((void **)&SData_d->dfdz, nip*Nf*szs);CHKERRCUDA(cerr); // kernel input
142: #endif
143: cerr = cudaMalloc((void**)&SData_d->maps, num_grids*sizeof(P4estVertexMaps*));CHKERRCUDA(cerr);
144: }
145: return(0);
146: }
148: PetscErrorCode LandauCUDAStaticDataClear(LandauStaticData *SData_d)
149: {
150: cudaError_t cerr;
153: if (SData_d->alpha) {
154: cerr = cudaFree(SData_d->alpha);CHKERRCUDA(cerr);
155: SData_d->alpha = NULL;
156: cerr = cudaFree(SData_d->beta);CHKERRCUDA(cerr);
157: cerr = cudaFree(SData_d->invMass);CHKERRCUDA(cerr);
158: cerr = cudaFree(SData_d->B);CHKERRCUDA(cerr);
159: cerr = cudaFree(SData_d->D);CHKERRCUDA(cerr);
160: cerr = cudaFree(SData_d->invJ);CHKERRCUDA(cerr);
161: #if LANDAU_DIM==3
162: cerr = cudaFree(SData_d->z);CHKERRCUDA(cerr);
163: #endif
164: cerr = cudaFree(SData_d->x);CHKERRCUDA(cerr);
165: cerr = cudaFree(SData_d->y);CHKERRCUDA(cerr);
166: cerr = cudaFree(SData_d->w);CHKERRCUDA(cerr);
167: // dynamic data
168: cerr = cudaFree(SData_d->Eq_m);CHKERRCUDA(cerr);
169: cerr = cudaFree(SData_d->f);CHKERRCUDA(cerr);
170: cerr = cudaFree(SData_d->dfdx);CHKERRCUDA(cerr);
171: cerr = cudaFree(SData_d->dfdy);CHKERRCUDA(cerr);
172: #if LANDAU_DIM==3
173: cerr = cudaFree(SData_d->dfdz);CHKERRCUDA(cerr);
174: #endif
175: cerr = cudaFree(SData_d->NCells);CHKERRCUDA(cerr);
176: cerr = cudaFree(SData_d->species_offset);CHKERRCUDA(cerr);
177: cerr = cudaFree(SData_d->mat_offset);CHKERRCUDA(cerr);
178: cerr = cudaFree(SData_d->ip_offset);CHKERRCUDA(cerr);
179: cerr = cudaFree(SData_d->ipf_offset);CHKERRCUDA(cerr);
180: cerr = cudaFree(SData_d->elem_offset);CHKERRCUDA(cerr);
181: cerr = cudaFree(SData_d->maps);CHKERRCUDA(cerr);
182: }
183: return(0);
184: }
186: // The GPU Landau kernel
187: //
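// Launch layout (see the <<<num_cells_tot, dimBlockFDF>>> launch in LandauCUDAJacobian): one block
// per cell over all grids, threadIdx.y picks the quadrature point (blockDim.y == Nq), threadIdx.x
// strides over the species of that cell's grid. f and its reference-space gradient are evaluated
// at every IP from the element closure coefficients (read from d_vertex_f directly, or gathered
// through the p4est maps), and the gradient is pushed to real space with the inverse Jacobian.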
188: __global__
189: void landau_form_fdf(const PetscInt dim, const PetscInt Nb, const PetscReal d_invJ[],
190: const PetscReal * const BB, const PetscReal * const DD, PetscScalar *d_vertex_f, P4estVertexMaps *d_maps[],
191: PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
192: #if LANDAU_DIM==3
193: PetscReal d_dfdz[],
194: #endif
195: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[]) // output
196: {
197: const PetscInt Nq = blockDim.y, g_cell = blockIdx.x;
198: const PetscInt myQi = threadIdx.y;
199: const PetscReal *Bq = &BB[myQi*Nb], *Dq = &DD[myQi*Nb*dim];
200: PetscInt grid = 0, f,d,b,e,q;
201: while (g_cell >= d_elem_offset[grid+1]) grid++; // yuck search for grid
202: {
203: const PetscInt moffset = d_mat_offset[grid], nip_loc = d_numCells[grid]*Nq, Nfloc = d_species_offset[grid+1] - d_species_offset[grid], elem = g_cell - d_elem_offset[grid];
204: const PetscInt IP_idx = d_ip_offset[grid], IPf_vertex_idx = d_ipf_offset[grid];
205: const PetscInt ipidx_g = myQi + elem*Nq, ipidx = IP_idx + ipidx_g;
206: const PetscReal *const invJj = &d_invJ[ipidx*dim*dim];
207: PetscReal u_x[LANDAU_MAX_SPECIES][LANDAU_DIM];
208: const PetscScalar *coef;
209: PetscScalar coef_buff[LANDAU_MAX_SPECIES*LANDAU_MAX_NQ];
210: if (!d_maps) {
211: coef = &d_vertex_f[elem*Nb*Nfloc + IPf_vertex_idx]; // closure and IP indexing are the same
212: } else {
213: coef = coef_buff;
214: for (f = 0; f < Nfloc ; ++f) {
215: LandauIdx *const Idxs = &d_maps[grid]->gIdx[elem][f][0];
216: for (b = 0; b < Nb; ++b) {
217: PetscInt idx = Idxs[b];
218: if (idx >= 0) {
219: coef_buff[f*Nb+b] = d_vertex_f[idx+moffset];
220: } else {
221: idx = -idx - 1;
222: coef_buff[f*Nb+b] = 0;
223: for (q = 0; q < d_maps[grid]->num_face; q++) {
224: PetscInt id = d_maps[grid]->c_maps[idx][q].gid;
225: PetscReal scale = d_maps[grid]->c_maps[idx][q].scale;
226: coef_buff[f*Nb+b] += scale*d_vertex_f[id+moffset];
227: }
228: }
229: }
230: }
231: }
233: /* get f and df */
234: for (f = threadIdx.x; f < Nfloc; f += blockDim.x) {
235: PetscReal refSpaceDer[LANDAU_DIM];
236: const PetscInt idx = IPf_vertex_idx + f*nip_loc + ipidx_g;
237: d_f[idx] = 0.0;
238: for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
239: for (b = 0; b < Nb; ++b) {
240: const PetscInt cidx = b;
241: d_f[idx] += Bq[cidx]*PetscRealPart(coef[f*Nb+cidx]);
242: for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[cidx*dim+d]*PetscRealPart(coef[f*Nb+cidx]);
243: }
244: for (d = 0; d < dim; ++d) {
245: for (e = 0, u_x[f][d] = 0.0; e < dim; ++e) {
246: u_x[f][d] += invJj[e*dim+d]*refSpaceDer[e];
247: }
248: }
249: }
250: for (f = threadIdx.x; f < Nfloc ; f += blockDim.x) {
251: const PetscInt idx = IPf_vertex_idx + f*nip_loc + ipidx_g;
252: d_dfdx[idx] = u_x[f][0];
253: d_dfdy[idx] = u_x[f][1];
254: #if LANDAU_DIM==3
255: d_dfdz[idx] = u_x[f][2];
256: #endif
257: }
258: }
259: }
261: __device__ void
262: jac_kernel(const PetscInt myQi, const PetscInt jpidx, PetscInt nip_global, const PetscInt Nq, const PetscInt grid,
263: const PetscInt dim, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[], const PetscReal invJj[],
264: const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
265: const PetscReal * const BB, const PetscReal * const DD, PetscScalar *elemMat, P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, // output
266: PetscScalar s_fieldMats[][LANDAU_MAX_NQ], // all these arrays are in shared memory
267: PetscReal s_scale[][LANDAU_MAX_Q_FACE],
268: PetscInt s_idx[][LANDAU_MAX_Q_FACE],
269: PetscReal s_g2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
270: PetscReal s_g3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
271: PetscReal s_gg2[][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
272: PetscReal s_gg3[][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES],
273: PetscReal s_nu_alpha[],
274: PetscReal s_nu_beta[],
275: PetscReal s_invMass[],
276: PetscReal s_f[],
277: PetscReal s_dfx[],
278: PetscReal s_dfy[],
279: PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[], // global memory
280: #if LANDAU_DIM==3
281: const PetscReal zz[], PetscReal s_dfz[], PetscReal d_dfdz[],
282: #endif
283: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
284: {
285: int delta,d,f,g,d2,dp,d3,fieldA,ipidx_b;
286: PetscReal gg2_temp[LANDAU_DIM], gg3_temp[LANDAU_DIM][LANDAU_DIM];
287: #if LANDAU_DIM==2
288: const PetscReal vj[3] = {xx[jpidx], yy[jpidx]};
289: #else
290: const PetscReal vj[3] = {xx[jpidx], yy[jpidx], zz[jpidx]};
291: #endif
292: const PetscInt moffset = d_mat_offset[grid], Nfloc = d_species_offset[grid+1]-d_species_offset[grid], g_cell = blockIdx.x, elem = g_cell - d_elem_offset[grid];
293: const PetscInt f_off = d_species_offset[grid], Nb=Nq;
295: // create g2 & g3
296: for (f=threadIdx.x; f<Nfloc; f+=blockDim.x) {
297: for (d=0;d<dim;d++) { // clear accumulation data D & K
298: s_gg2[d][myQi][f] = 0;
299: for (d2=0;d2<dim;d2++) s_gg3[d][d2][myQi][f] = 0;
300: }
301: }
302: for (d2 = 0; d2 < dim; d2++) {
303: gg2_temp[d2] = 0;
304: for (d3 = 0; d3 < dim; d3++) {
305: gg3_temp[d2][d3] = 0;
306: }
307: }
308: if (threadIdx.y == 0) {
309: // copy species into shared memory
310: for (fieldA = threadIdx.x; fieldA < Nftot; fieldA += blockDim.x) {
311: s_nu_alpha[fieldA] = nu_alpha[fieldA];
312: s_nu_beta[fieldA] = nu_beta[fieldA];
313: s_invMass[fieldA] = invMass[fieldA];
314: }
315: }
316: __syncthreads();
317: // inner integral, collect gg2/3
318: for (ipidx_b = 0; ipidx_b < nip_global; ipidx_b += blockDim.x) {
319: const PetscInt ipidx = ipidx_b + threadIdx.x;
320: PetscInt f_off_r,grid_r,Nfloc_r,nip_loc_r,ipidx_g,fieldB,IPf_idx_r;
321: __syncthreads();
322: if (ipidx < nip_global) {
323: grid_r = 0;
324: while (ipidx >= d_ip_offset[grid_r+1]) grid_r++; // yuck search for grid
325: f_off_r = d_species_offset[grid_r];
326: ipidx_g = ipidx - d_ip_offset[grid_r];
327: nip_loc_r = d_numCells[grid_r]*Nq;
328: Nfloc_r = d_species_offset[grid_r+1] - d_species_offset[grid_r];
329: IPf_idx_r = d_ipf_offset[grid_r];
330: for (fieldB = threadIdx.y; fieldB < Nfloc_r ; fieldB += blockDim.y) {
331: const PetscInt idx = IPf_idx_r + fieldB*nip_loc_r + ipidx_g;
332: s_f [fieldB*blockDim.x + threadIdx.x] = d_f[idx]; // all vector threads get copy of data (Peng: why?)
333: s_dfx[fieldB*blockDim.x + threadIdx.x] = d_dfdx[idx];
334: s_dfy[fieldB*blockDim.x + threadIdx.x] = d_dfdy[idx];
335: #if LANDAU_DIM==3
336: s_dfz[fieldB*blockDim.x + threadIdx.x] = d_dfdz[idx];
337: #endif
338: }
339: }
340: __syncthreads();
341: if (ipidx < nip_global) {
342: const PetscReal wi = ww[ipidx], x = xx[ipidx], y = yy[ipidx];
343: PetscReal temp1[3] = {0, 0, 0}, temp2 = 0;
344: #if LANDAU_DIM==2
345: PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
346: LandauTensor2D(vj, x, y, Ud, Uk, mask);
347: #else
348: PetscReal U[3][3], z = zz[ipidx], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2]-z) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
349: LandauTensor3D(vj, x, y, z, U, mask);
350: #endif
351: for (fieldB = 0; fieldB < Nfloc_r; fieldB++) {
352: temp1[0] += s_dfx[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
353: temp1[1] += s_dfy[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
354: #if LANDAU_DIM==3
355: temp1[2] += s_dfz[fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r]*s_invMass[fieldB + f_off_r];
356: #endif
357: temp2 += s_f [fieldB*blockDim.x + threadIdx.x]*s_nu_beta[fieldB + f_off_r];
358: }
359: temp1[0] *= wi;
360: temp1[1] *= wi;
361: #if LANDAU_DIM==3
362: temp1[2] *= wi;
363: #endif
364: temp2 *= wi;
365: #if LANDAU_DIM==2
366: for (d2 = 0; d2 < 2; d2++) {
367: for (d3 = 0; d3 < 2; ++d3) {
368: /* K = U * grad(f): g2=e: i,A */
369: gg2_temp[d2] += Uk[d2][d3]*temp1[d3];
370: /* D = -U * (I \kron (fx)): g3=f: i,j,A */
371: gg3_temp[d2][d3] += Ud[d2][d3]*temp2;
372: }
373: }
374: #else
375: for (d2 = 0; d2 < 3; ++d2) {
376: for (d3 = 0; d3 < 3; ++d3) {
377: /* K = U * grad(f): g2 = e: i,A */
378: gg2_temp[d2] += U[d2][d3]*temp1[d3];
379: /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
380: gg3_temp[d2][d3] += U[d2][d3]*temp2;
381: }
382: }
383: #endif
384: }
385: } /* IPs */
387: /* reduce gg temp sums across threads */
388: for (delta = blockDim.x/2; delta > 0; delta /= 2) {
389: for (d2 = 0; d2 < dim; d2++) {
390: gg2_temp[d2] += __shfl_xor_sync(0xffffffff, gg2_temp[d2], delta, blockDim.x);
391: for (d3 = 0; d3 < dim; d3++) {
392: gg3_temp[d2][d3] += __shfl_xor_sync(0xffffffff, gg3_temp[d2][d3], delta, blockDim.x);
393: }
394: }
395: }
396: // add alpha and put in gg2/3
397: for (fieldA = threadIdx.x; fieldA < Nfloc; fieldA += blockDim.x) {
398: for (d2 = 0; d2 < dim; d2++) {
399: s_gg2[d2][myQi][fieldA] += gg2_temp[d2]*s_nu_alpha[fieldA+f_off];
400: for (d3 = 0; d3 < dim; d3++) {
401: s_gg3[d2][d3][myQi][fieldA] -= gg3_temp[d2][d3]*s_nu_alpha[fieldA+f_off]*s_invMass[fieldA+f_off];
402: }
403: }
404: }
405: __syncthreads();
406: /* add electric field term once per IP */
407: for (fieldA = threadIdx.x ; fieldA < Nfloc ; fieldA += blockDim.x) {
408: s_gg2[dim-1][myQi][fieldA] += Eq_m[fieldA+f_off];
409: }
410: __syncthreads();
411: /* Jacobian transform - g2 */
412: for (fieldA = threadIdx.x ; fieldA < Nfloc ; fieldA += blockDim.x) {
413: PetscReal wj = ww[jpidx];
414: for (d = 0; d < dim; ++d) {
415: s_g2[d][myQi][fieldA] = 0.0;
416: for (d2 = 0; d2 < dim; ++d2) {
417: s_g2[d][myQi][fieldA] += invJj[d*dim+d2]*s_gg2[d2][myQi][fieldA];
418: s_g3[d][d2][myQi][fieldA] = 0.0;
419: for (d3 = 0; d3 < dim; ++d3) {
420: for (dp = 0; dp < dim; ++dp) {
421: s_g3[d][d2][myQi][fieldA] += invJj[d*dim + d3]*s_gg3[d3][dp][myQi][fieldA]*invJj[d2*dim + dp];
422: }
423: }
424: s_g3[d][d2][myQi][fieldA] *= wj;
425: }
426: s_g2[d][myQi][fieldA] *= wj;
427: }
428: }
429: __syncthreads(); // Synchronize (ensure all the data is available) and sum IP matrices
430: /* FE matrix construction */
431: {
432: int fieldA,d,qj,d2,q,idx,totDim=Nb*Nfloc;
433: /* assemble */
434: for (fieldA = 0; fieldA < Nfloc; fieldA++) {
435: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
436: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
437: PetscScalar t = 0;
438: for (qj = 0 ; qj < Nq ; qj++) {
439: const PetscReal *BJq = &BB[qj*Nb], *DIq = &DD[qj*Nb*dim];
440: for (d = 0; d < dim; ++d) {
441: t += DIq[f*dim+d]*s_g2[d][qj][fieldA]*BJq[g];
442: for (d2 = 0; d2 < dim; ++d2) {
443: t += DIq[f*dim + d]*s_g3[d][d2][qj][fieldA]*DIq[g*dim + d2];
444: }
445: }
446: }
447: if (elemMat) {
448: const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
449: elemMat[fOff] += t; // ????
450: } else s_fieldMats[f][g] = t;
451: }
452: }
453: if (s_fieldMats) {
454: PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
455: PetscInt nr,nc;
456: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[elem][fieldA][0];
457: __syncthreads();
458: if (threadIdx.y == 0) {
459: for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
460: idx = Idxs[f];
461: if (idx >= 0) {
462: s_idx[f][0] = idx + moffset;
463: s_scale[f][0] = 1.;
464: } else {
465: idx = -idx - 1;
466: for (q = 0; q < d_maps[grid]->num_face; q++) {
467: s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
468: s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
469: }
470: }
471: }
472: }
473: __syncthreads();
474: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
475: idx = Idxs[f];
476: if (idx >= 0) {
477: nr = 1;
478: } else {
479: nr = d_maps[grid]->num_face;
480: }
481: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
482: idx = Idxs[g];
483: if (idx >= 0) {
484: nc = 1;
485: } else {
486: nc = d_maps[grid]->num_face;
487: }
488: for (q = 0; q < nr; q++) {
489: for (d = 0; d < nc; d++) {
490: vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
491: }
492: }
493: MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
494: }
495: }
496: __syncthreads();
497: }
498: }
499: }
500: }
502: //
503: // The CUDA Landau kernel
504: //
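// Same per-cell block layout as landau_form_fdf, with blockDim.x vectorizing the inner integral
// over integration points. Dynamic shared memory is carved out of smem[] for the g2/g3
// accumulators, the species constants, and the per-thread f/df staging buffers; the host computes
// the size (ii*szf in LandauCUDAJacobian) and raises cudaFuncAttributeMaxDynamicSharedMemorySize
// when more than the default 48K is needed. With gpu_assembly maps, extra space is also carved for
// the per-field element matrix and its scatter indices/scales.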
505: __global__
506: void __launch_bounds__(256,4)
507: landau_jacobian(const PetscInt nip_global, const PetscInt dim, const PetscInt Nb, const PetscReal invJj[],
508: const PetscInt Nftot, const PetscReal nu_alpha[], const PetscReal nu_beta[], const PetscReal invMass[], const PetscReal Eq_m[],
509: const PetscReal * const BB, const PetscReal * const DD, const PetscReal xx[], const PetscReal yy[], const PetscReal ww[],
510: PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal d_f[], PetscReal d_dfdx[], PetscReal d_dfdy[],
511: #if LANDAU_DIM==3
512: const PetscReal zz[], PetscReal d_dfdz[],
513: #endif
514: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
515: {
516: extern __shared__ PetscReal smem[];
517: int size = 0;
518: PetscReal (*s_g2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
519: (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
520: size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
521: PetscReal (*s_g3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
522: (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
523: size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
524: PetscReal (*s_gg2)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
525: (PetscReal (*)[LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
526: size += LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM;
527: PetscReal (*s_gg3)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES] =
528: (PetscReal (*)[LANDAU_DIM][LANDAU_DIM][LANDAU_MAX_NQ][LANDAU_MAX_SPECIES]) &smem[size];
529: size += LANDAU_DIM*LANDAU_DIM*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES;
530: PetscReal *s_nu_alpha = &smem[size];
531: size += LANDAU_MAX_SPECIES;
532: PetscReal *s_nu_beta = &smem[size];
533: size += LANDAU_MAX_SPECIES;
534: PetscReal *s_invMass = &smem[size];
535: size += LANDAU_MAX_SPECIES;
536: PetscReal *s_f = &smem[size];
537: size += blockDim.x*LANDAU_MAX_SPECIES;
538: PetscReal *s_dfx = &smem[size];
539: size += blockDim.x*LANDAU_MAX_SPECIES;
540: PetscReal *s_dfy = &smem[size];
541: size += blockDim.x*LANDAU_MAX_SPECIES;
542: #if LANDAU_DIM==3
543: PetscReal *s_dfz = &smem[size];
544: size += blockDim.x*LANDAU_MAX_SPECIES;
545: #endif
546: PetscScalar (*s_fieldMats)[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
547: PetscReal (*s_scale)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
548: PetscInt (*s_idx)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
549: PetscInt Nq = blockDim.y, g_cell = blockIdx.x, grid = 0;
550: PetscScalar *elemMat = NULL; /* my output */
552: while (g_cell >= d_elem_offset[grid+1]) grid++; // yuck search for grid
553: {
554: const PetscInt Nfloc = d_species_offset[grid+1]-d_species_offset[grid], totDim = Nfloc*Nq, elem = g_cell - d_elem_offset[grid];
555: const PetscInt IP_idx = d_ip_offset[grid];
556: const PetscInt myQi = threadIdx.y;
557: const PetscInt jpidx = IP_idx + myQi + elem * Nq;
558: const PetscReal *invJ = &invJj[jpidx*dim*dim];
559: if (d_elem_mats) {
560: elemMat = d_elem_mats; // start at beginning
561: for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
562: PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
563: elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
564: }
565: elemMat += elem*totDim*totDim; // index into local matrix & zero out
566: for (int i = threadIdx.x + threadIdx.y*blockDim.x; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
567: }
568: __syncthreads();
569: if (d_maps) {
570: // reuse the space for fieldMats
571: s_fieldMats = (PetscScalar (*)[LANDAU_MAX_NQ][LANDAU_MAX_NQ]) &smem[size];
572: size += LANDAU_MAX_NQ*LANDAU_MAX_NQ;
573: s_scale = (PetscReal (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
574: size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
575: s_idx = (PetscInt (*)[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE]) &smem[size];
576: size += LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE; // this is too big, idx is an integer
577: } else {
578: s_fieldMats = NULL;
579: }
580: __syncthreads();
581: jac_kernel(myQi, jpidx, nip_global, Nq, grid, dim, xx, yy, ww,
582: invJ, Nftot, nu_alpha, nu_beta, invMass, Eq_m, BB, DD,
583: elemMat, d_maps, d_mat,
584: *s_fieldMats, *s_scale, *s_idx,
585: *s_g2, *s_g3, *s_gg2, *s_gg3,
586: s_nu_alpha, s_nu_beta, s_invMass,
587: s_f, s_dfx, s_dfy, d_f, d_dfdx, d_dfdy,
588: #if LANDAU_DIM==3
589: zz, s_dfz, d_dfdz,
590: #endif
591: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
592: }
593: }
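//
// The mass matrix kernel: same per-cell block layout, no Landau integral. Element mass matrices
// (with the extra 2*pi factor in 2D) are either written to d_elem_mats for host-side assembly or
// scattered directly into the device CSR matrix through the maps with MatSetValuesDevice().
//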
595: __global__
596: void __launch_bounds__(256,4) landau_mass(const PetscInt dim, const PetscInt Nb, const PetscReal d_w[], const PetscReal * const BB, const PetscReal * const DD,
597: PetscScalar d_elem_mats[], P4estVertexMaps *d_maps[], PetscSplitCSRDataStructure d_mat, PetscReal shift,
598: const PetscInt d_numCells[], const PetscInt d_species_offset[], const PetscInt d_mat_offset[], const PetscInt d_ip_offset[], const PetscInt d_ipf_offset[], const PetscInt d_elem_offset[])
599: {
600: const PetscInt Nq = blockDim.y, g_cell = blockIdx.x;
601: __shared__ PetscScalar s_fieldMats[LANDAU_MAX_NQ][LANDAU_MAX_NQ];
602: __shared__ PetscInt s_idx[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
603: __shared__ PetscReal s_scale[LANDAU_MAX_NQ][LANDAU_MAX_Q_FACE];
604: int tid = threadIdx.x + threadIdx.y*blockDim.x;
605: PetscScalar *elemMat = NULL; /* my output */
606: int fieldA,d,qj,q,idx,f,g, grid = 0;
608: while (g_cell >= d_elem_offset[grid+1]) grid++; // yuck search for grid
609: {
610: const PetscInt moffset = d_mat_offset[grid], Nfloc = d_species_offset[grid+1]-d_species_offset[grid], totDim = Nfloc*Nq, elem = g_cell-d_elem_offset[grid];
611: const PetscInt IP_idx = d_ip_offset[grid];
612: if (d_elem_mats) {
613: elemMat = d_elem_mats; // start at beginning
614: for (PetscInt grid2=0 ; grid2<grid ; grid2++) {
615: PetscInt Nfloc2 = d_species_offset[grid2+1] - d_species_offset[grid2], totDim2 = Nfloc2*Nb;
616: elemMat += d_numCells[grid2]*totDim2*totDim2; // jump past grids, could be in an offset
617: }
618: elemMat += elem*totDim*totDim;
619: for (int i = tid; i < totDim*totDim; i += blockDim.x*blockDim.y) elemMat[i] = 0;
620: }
621: __syncthreads();
622: /* FE mass matrix construction */
623: for (fieldA = 0; fieldA < Nfloc; fieldA++) {
624: PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE];
625: PetscInt nr,nc;
626: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
627: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
628: PetscScalar t = 0;
629: for (qj = 0 ; qj < Nq ; qj++) {
630: const PetscReal *BJq = &BB[qj*Nb];
631: const PetscInt jpidx = IP_idx + qj + elem * Nq;
632: if (dim==2) {
633: t += BJq[f] * d_w[jpidx]*shift * BJq[g] * 2. * PETSC_PI;
634: } else {
635: t += BJq[f] * d_w[jpidx]*shift * BJq[g];
636: }
637: }
638: if (elemMat) {
639: const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
640: elemMat[fOff] += t; // ????
641: } else s_fieldMats[f][g] = t;
642: }
643: }
644: if (!elemMat) {
645: const LandauIdx *const Idxs = &d_maps[grid]->gIdx[elem][fieldA][0];
646: __syncthreads();
647: if (threadIdx.y == 0) {
648: for (f = threadIdx.x; f < Nb ; f += blockDim.x) {
649: idx = Idxs[f];
650: if (idx >= 0) {
651: s_idx[f][0] = idx + moffset;
652: s_scale[f][0] = 1.;
653: } else {
654: idx = -idx - 1;
655: for (q = 0; q < d_maps[grid]->num_face; q++) {
656: s_idx[f][q] = d_maps[grid]->c_maps[idx][q].gid + moffset;
657: s_scale[f][q] = d_maps[grid]->c_maps[idx][q].scale;
658: }
659: }
660: }
661: }
662: __syncthreads();
663: for (f = threadIdx.y; f < Nb ; f += blockDim.y) {
664: idx = Idxs[f];
665: if (idx >= 0) {
666: nr = 1;
667: } else {
668: nr = d_maps[grid]->num_face;
669: }
670: for (g = threadIdx.x; g < Nb; g += blockDim.x) {
671: idx = Idxs[g];
672: if (idx >= 0) {
673: nc = 1;
674: } else {
675: nc = d_maps[grid]->num_face;
676: }
677: for (q = 0; q < nr; q++) {
678: for (d = 0; d < nc; d++) {
679: vals[q*nc + d] = s_scale[f][q]*s_scale[g][d]*s_fieldMats[f][g];
680: }
681: }
682: MatSetValuesDevice(d_mat,nr,s_idx[f],nc,s_idx[g],vals,ADD_VALUES);
683: }
684: }
685: }
686: __syncthreads();
687: }
688: }
689: }
691: PetscErrorCode LandauCUDAJacobian(DM plex[], const PetscInt Nq, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[],
692: const PetscInt N, const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscInt num_sub_blocks, const PetscReal shift,
693: const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
694: {
695: PetscErrorCode ierr;
696: cudaError_t cerr;
697: PetscInt Nb=Nq,dim,nip_global,num_cells_tot,elem_mat_size_tot;
698: PetscInt *d_numCells, *d_species_offset, *d_mat_offset, *d_ip_offset, *d_ipf_offset, *d_elem_offset;
699: PetscInt szf=sizeof(PetscReal),szs=sizeof(PetscScalar),Nftot=a_species_offset[num_grids];
700: PetscReal *d_BB=NULL,*d_DD=NULL,*d_invJj=NULL,*d_nu_alpha=NULL,*d_nu_beta=NULL,*d_invMass=NULL,*d_Eq_m=NULL,*d_x=NULL,*d_y=NULL,*d_w=NULL;
701: PetscScalar *d_elem_mats=NULL,*d_vertex_f=NULL;
702: PetscReal *d_f=NULL,*d_dfdx=NULL,*d_dfdy=NULL;
703: #if LANDAU_DIM==3
704: PetscReal *d_dfdz=NULL, *d_z = NULL;
705: #endif
706: LandauCtx *ctx;
707: PetscSplitCSRDataStructure d_mat=NULL;
708: P4estVertexMaps **d_maps,*maps[LANDAU_MAX_GRIDS];
709: int nnn = 256/Nq; // machine dependent
710: PetscContainer container;
712: PetscLogEventBegin(events[3],0,0,0,0);
713: while (nnn & nnn - 1) nnn = nnn & nnn - 1; // round nnn down to a power of two (blockDim.x, derived from nnn, is used as a warp-shuffle width in jac_kernel)
714: if (nnn>4) nnn = 4; // 16 debug
715: DMGetApplicationContext(plex[0], &ctx);
716: if (!ctx) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "no context");
717: DMGetDimension(plex[0], &dim);
718: if (dim!=LANDAU_DIM) SETERRQ2(PETSC_COMM_SELF, PETSC_ERR_PLIB, "LANDAU_DIM %D != dim %d",LANDAU_DIM,dim);
719: if (ctx->gpu_assembly) {
720: PetscObjectQuery((PetscObject) JacP, "assembly_maps", (PetscObject *) &container);
721: if (container) { // not present on first call
722: static int init = 0; // hack. just do every time, or put in setup (but that is in base class code), or add init_maps flag
723: if (!init++) {
724: P4estVertexMaps *h_maps=NULL;
725: PetscContainerGetPointer(container, (void **) &h_maps);
726: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
727: if (h_maps[grid].d_self) {
728: maps[grid] = h_maps[grid].d_self;
729: } else {
730: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
731: }
732: }
733: cerr = cudaMemcpy(SData_d->maps, maps, num_grids*sizeof(P4estVertexMaps*), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
734: }
735: d_maps = (P4estVertexMaps**)SData_d->maps;
736: // this does the setup the first time called
737: MatCUSPARSEGetDeviceMatWrite(JacP,&d_mat);
738: } else {
739: d_maps = NULL;
740: }
741: } else {
742: container = NULL;
743: d_maps = NULL;
744: }
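  // Two assembly paths from here on: with maps (gpu_assembly and the "assembly_maps" container) the
  // kernels add directly into the device CSR matrix d_mat; otherwise element matrices are returned
  // in d_elem_mats and assembled on the host below with DMPlexMatSetClosure()/MatSetValues().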
745: PetscLogEventEnd(events[3],0,0,0,0);
746: {
747: PetscInt elem_mat_size = 0;
748: nip_global = num_cells_tot = 0;
749: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
750: PetscInt Nfloc = a_species_offset[grid+1] - a_species_offset[grid], totDim = Nfloc*Nb;
751: nip_global += a_numCells[grid]*Nq;
752: num_cells_tot += a_numCells[grid]; // is in d_elem_offset
753: elem_mat_size += a_numCells[grid]*totDim*totDim; // could be in an offset
754: }
755: elem_mat_size_tot = d_maps ? 0 : elem_mat_size;
756: }
757: if (elem_mat_size_tot) {
758: cerr = cudaMalloc((void **)&d_elem_mats, elem_mat_size_tot*szs);CHKERRCUDA(cerr); // kernel output - first call is on CPU
759: } else d_elem_mats = NULL;
760: // create data
761: d_BB = (PetscReal*)SData_d->B;
762: d_DD = (PetscReal*)SData_d->D;
763: if (a_elem_closure || a_xarray) { // form f and df
764: PetscLogEventBegin(events[1],0,0,0,0);
765: cerr = cudaMemcpy(SData_d->Eq_m, a_Eq_m, Nftot*szf, cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
766: d_invJj = (PetscReal*)SData_d->invJ;
767: d_nu_alpha = (PetscReal*)SData_d->alpha;
768: d_nu_beta = (PetscReal*)SData_d->beta;
769: d_invMass = (PetscReal*)SData_d->invMass;
770: d_x = (PetscReal*)SData_d->x;
771: d_y = (PetscReal*)SData_d->y;
772: d_w = (PetscReal*)SData_d->w;
773: d_Eq_m = (PetscReal*)SData_d->Eq_m;
774: d_dfdx = (PetscReal*)SData_d->dfdx;
775: d_dfdy = (PetscReal*)SData_d->dfdy;
776: #if LANDAU_DIM==3
777: d_dfdz = (PetscReal*)SData_d->dfdz;
778: d_z = (PetscReal*)SData_d->z;
779: #endif
780: d_f = (PetscReal*)SData_d->f;
781: // get a d_vertex_f
782: if (a_elem_closure) {
783: PetscInt closure_sz = 0; // argh, don't have this on the host!!!
784: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
785: PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
786: closure_sz += Nq*nfloc*a_numCells[grid];
787: }
788: cerr = cudaMalloc((void **)&d_vertex_f, closure_sz * sizeof(*a_elem_closure));CHKERRCUDA(cerr);
789: cerr = cudaMemcpy(d_vertex_f, a_elem_closure, closure_sz*sizeof(*a_elem_closure), cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
790: } else {
791: d_vertex_f = (PetscScalar*)a_xarray;
792: }
793: PetscLogEventEnd(events[1],0,0,0,0);
794: } else {
795: d_w = (PetscReal*)SData_d->w;
796: }
797: //
798: d_numCells = (PetscInt*)SData_d->NCells; // redundant -- remove
799: d_species_offset = (PetscInt*)SData_d->species_offset;
800: d_mat_offset = (PetscInt*)SData_d->mat_offset;
801: d_ip_offset = (PetscInt*)SData_d->ip_offset;
802: d_ipf_offset = (PetscInt*)SData_d->ipf_offset;
803: d_elem_offset = (PetscInt*)SData_d->elem_offset;
804: if (a_elem_closure || a_xarray) { // form f and df
805: dim3 dimBlockFDF(nnn>Nftot ? Nftot : nnn, Nq), dimBlock((nip_global>nnn) ? nnn : nip_global , Nq);
806: PetscLogEventBegin(events[8],0,0,0,0);
807: PetscLogGpuTimeBegin();
808: PetscInfo2(plex[0], "Form F and dF/dx vectors: nip_global=%D num_grids=%D\n",nip_global,num_grids);
809: landau_form_fdf<<<num_cells_tot,dimBlockFDF>>>(dim, Nb, d_invJj, d_BB, d_DD, d_vertex_f, d_maps, d_f, d_dfdx, d_dfdy,
810: #if LANDAU_DIM==3
811: d_dfdz,
812: #endif
813: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
814: CHECK_LAUNCH_ERROR();
815: PetscLogGpuFlops(nip_global*(PetscLogDouble)(2*Nb*(1+dim)));
816: if (a_elem_closure) {
817: cerr = cudaFree(d_vertex_f);CHKERRCUDA(cerr);
818: d_vertex_f = NULL;
819: }
820: PetscLogGpuTimeEnd();
821: PetscLogEventEnd(events[8],0,0,0,0);
822: // Jacobian
823: PetscLogEventBegin(events[4],0,0,0,0);
824: PetscLogGpuTimeBegin();
825: PetscLogGpuFlops(nip_global*(PetscLogDouble)(a_elem_closure ? (nip_global*(11*Nftot+ 4*dim*dim) + 6*Nftot*dim*dim*dim + 10*Nftot*dim*dim + 4*Nftot*dim + Nb*Nftot*Nb*Nq*dim*dim*5) : Nb*Nftot*Nb*Nq*4));
826: PetscInt ii = 2*LANDAU_MAX_NQ*LANDAU_MAX_SPECIES*LANDAU_DIM*(1+LANDAU_DIM) + 3*LANDAU_MAX_SPECIES + (1+LANDAU_DIM)*dimBlock.x*LANDAU_MAX_SPECIES + LANDAU_MAX_NQ*LANDAU_MAX_NQ + 2*LANDAU_MAX_NQ*LANDAU_MAX_Q_FACE;
827: if (ii*szf >= 49152) {
828: cerr = cudaFuncSetAttribute(landau_jacobian,
829: cudaFuncAttributeMaxDynamicSharedMemorySize,
830: 98304);CHKERRCUDA(cerr);
831: }
832: PetscInfo3(plex[0], "Jacobian shared memory size: %D bytes, d_elem_mats=%p d_maps=%p\n",ii*szf,d_elem_mats,d_maps);
833: landau_jacobian<<<num_cells_tot,dimBlock,ii*szf>>>(nip_global, dim, Nb, d_invJj, Nftot, d_nu_alpha, d_nu_beta, d_invMass, d_Eq_m,
834: d_BB, d_DD, d_x, d_y, d_w,
835: d_elem_mats, d_maps, d_mat, d_f, d_dfdx, d_dfdy,
836: #if LANDAU_DIM==3
837: d_z, d_dfdz,
838: #endif
839: d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
840: CHECK_LAUNCH_ERROR(); // has sync
841: PetscLogGpuTimeEnd();
842: PetscLogEventEnd(events[4],0,0,0,0);
843: } else { // mass
844: dim3 dimBlock(nnn,Nq);
845: PetscInfo4(plex[0], "Mass d_maps = %p. Nq=%d, vector size %d num_cells_tot=%d\n",d_maps,Nq,nnn,num_cells_tot);
846: PetscLogEventBegin(events[4],0,0,0,0);
847: PetscLogGpuTimeBegin();
848: landau_mass<<<num_cells_tot,dimBlock>>>(dim, Nb, d_w, d_BB, d_DD, d_elem_mats,
849: d_maps, d_mat, shift, d_numCells, d_species_offset, d_mat_offset, d_ip_offset, d_ipf_offset, d_elem_offset);
850: CHECK_LAUNCH_ERROR(); // has sync
851: PetscLogGpuTimeEnd();
852: PetscLogEventEnd(events[4],0,0,0,0);
853: }
854: // First time assembly with or without GPU assembly
855: if (d_elem_mats) {
856: for (PetscInt grid=0, elem_mats_idx = 0 ; grid<num_grids ; grid++) { // elem_mats_idx += totDim*totDim*a_numCells[grid];
857: const PetscInt Nfloc = a_species_offset[grid+1]-a_species_offset[grid], totDim = Nfloc*Nq;
858: PetscScalar *elemMats = NULL, *elMat;
859: PetscSection section, globalSection;
860: PetscInt cStart,cEnd,ej;
861: PetscLogEventBegin(events[5],0,0,0,0);
862: DMPlexGetHeightStratum(plex[grid],0,&cStart,&cEnd);
863: DMGetLocalSection(plex[grid], &section);
864: DMGetGlobalSection(plex[grid], &globalSection);
865: PetscMalloc1(totDim*totDim*a_numCells[grid],&elemMats);
866: cerr = cudaMemcpy(elemMats, &d_elem_mats[elem_mats_idx], totDim*totDim*a_numCells[grid]*sizeof(*elemMats), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
867: PetscLogEventEnd(events[5],0,0,0,0);
868: PetscLogEventBegin(events[6],0,0,0,0);
869: for (ej = cStart, elMat = elemMats ; ej < cEnd; ++ej, elMat += totDim*totDim) {
870: DMPlexMatSetClosure(plex[grid], section, globalSection, subJ[grid], ej, elMat, ADD_VALUES);
871: if (ej==-1) {
872: int d,f;
873: PetscPrintf(PETSC_COMM_SELF,"GPU Element matrix\n");
874: for (d = 0; d < totDim; ++d) {
875: for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF," %12.5e", PetscRealPart(elMat[d*totDim + f]));
876: PetscPrintf(PETSC_COMM_SELF,"\n");
877: }
878: //exit(14);
879: }
880: }
881: PetscFree(elemMats);
882: elem_mats_idx += totDim*totDim*a_numCells[grid]; // this can be a stored offset?
883: //
884: if (!container) { // move nest matrix to global JacP
885: PetscInt moffset = a_mat_offset[grid], nloc, nzl, colbuf[1024], row;
886: const PetscInt *cols;
887: const PetscScalar *vals;
888: Mat B = subJ[grid];
889: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
890: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
891: MatGetSize(B, &nloc, NULL);
892: if (nloc != a_mat_offset[grid+1] - moffset) SETERRQ2(PetscObjectComm((PetscObject) B), PETSC_ERR_PLIB, "nloc %D != mat_offset[grid+1] - moffset = %D",nloc, a_mat_offset[grid+1] - moffset);
893: for (int i=0 ; i<nloc ; i++) {
894: MatGetRow(B,i,&nzl,&cols,&vals);
895: if (nzl>1024) SETERRQ1(PetscObjectComm((PetscObject) B), PETSC_ERR_PLIB, "Row too big: %D",nzl);
896: for (int j=0; j<nzl; j++) colbuf[j] = cols[j] + moffset;
897: row = i + moffset;
898: MatSetValues(JacP,1,&row,nzl,colbuf,vals,ADD_VALUES);
899: MatRestoreRow(B,i,&nzl,&cols,&vals);
900: }
901: MatDestroy(&subJ[grid]);
902: } else exit(34);
903: PetscLogEventEnd(events[6],0,0,0,0);
904: } // grids
905: cerr = cudaFree(d_elem_mats);CHKERRCUDA(cerr);
906: if (ctx->gpu_assembly) {
907: // transition to use of maps for VecGetClosure
908: if (!(a_elem_closure || a_xarray)) SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "transition without Jacobian");
909: }
910: }
912: return(0);
913: }