Actual source code: landau.kokkos.cxx
1: /*
2: Implements the Kokkos kernel
3: */
4: #include <petscconf.h>
5: #include <petscvec_kokkos.hpp>
6: #include <petsc/private/dmpleximpl.h>
7: #include <petsclandau.h>
8: #include <petscts.h>
10: #include <Kokkos_Core.hpp>
11: #include <cstdio>
12: typedef Kokkos::TeamPolicy<>::member_type team_member;
13: #include "../land_tensors.h"
14: #include <petscaijdevice.h>
16: namespace landau_inner_red { // namespace helps with name resolution in reduction identity
17: template< class ScalarType >
18: struct array_type {
19: ScalarType gg2[LANDAU_DIM];
20: ScalarType gg3[LANDAU_DIM][LANDAU_DIM];
22: KOKKOS_INLINE_FUNCTION // Default constructor - Initialize to 0's
23: array_type() {
24: for (int j = 0; j < LANDAU_DIM; j++) {
25: gg2[j] = 0;
26: for (int k = 0; k < LANDAU_DIM; k++) {
27: gg3[j][k] = 0;
28: }
29: }
30: }
31: KOKKOS_INLINE_FUNCTION // Copy Constructor
32: array_type(const array_type & rhs) {
33: for (int j = 0; j < LANDAU_DIM; j++) {
34: gg2[j] = rhs.gg2[j];
35: for (int k = 0; k < LANDAU_DIM; k++) {
36: gg3[j][k] = rhs.gg3[j][k];
37: }
38: }
39: }
40: KOKKOS_INLINE_FUNCTION // add operator
41: array_type& operator += (const array_type& src)
42: {
43: for (int j = 0; j < LANDAU_DIM; j++) {
44: gg2[j] += src.gg2[j];
45: for (int k = 0; k < LANDAU_DIM; k++) {
46: gg3[j][k] += src.gg3[j][k];
47: }
48: }
49: return *this;
50: }
51: KOKKOS_INLINE_FUNCTION // volatile add operator
52: void operator += (const volatile array_type& src) volatile
53: {
54: for (int j = 0; j < LANDAU_DIM; j++) {
55: gg2[j] += src.gg2[j];
56: for (int k = 0; k < LANDAU_DIM; k++) {
57: gg3[j][k] += src.gg3[j][k];
58: }
59: }
60: }
61: };
62: typedef array_type<PetscReal> TensorValueType; // used to simplify code below
63: }
65: namespace Kokkos { //reduction identity must be defined in Kokkos namespace
66: template<>
67: struct reduction_identity< landau_inner_red::TensorValueType > {
68: KOKKOS_FORCEINLINE_FUNCTION static landau_inner_red::TensorValueType sum() {
69: return landau_inner_red::TensorValueType();
70: }
71: };
72: }
74: extern "C" {
75: PetscErrorCode LandauKokkosCreateMatMaps(P4estVertexMaps maps[], pointInterpolationP4est (*pointMaps)[LANDAU_MAX_Q_FACE], PetscInt Nf[], PetscInt Nq, PetscInt grid)
76: {
77: P4estVertexMaps h_maps; /* host container */
78: const Kokkos::View<pointInterpolationP4est*[LANDAU_MAX_Q_FACE], Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_points ((pointInterpolationP4est*)pointMaps, maps[grid].num_reduced);
79: const Kokkos::View< LandauIdx*[LANDAU_MAX_SPECIES][LANDAU_MAX_NQ], Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_gidx ((LandauIdx*)maps[grid].gIdx, maps[grid].num_elements);
80: Kokkos::View<pointInterpolationP4est*[LANDAU_MAX_Q_FACE], Kokkos::LayoutRight> *d_points = new Kokkos::View<pointInterpolationP4est*[LANDAU_MAX_Q_FACE], Kokkos::LayoutRight>("points", maps[grid].num_reduced);
81: Kokkos::View<LandauIdx*[LANDAU_MAX_SPECIES][LANDAU_MAX_NQ], Kokkos::LayoutRight> *d_gidx = new Kokkos::View<LandauIdx*[LANDAU_MAX_SPECIES][LANDAU_MAX_NQ], Kokkos::LayoutRight>("gIdx", maps[grid].num_elements);
83: Kokkos::deep_copy (*d_gidx, h_gidx);
84: Kokkos::deep_copy (*d_points, h_points);
85: h_maps.num_elements = maps[grid].num_elements;
86: h_maps.num_face = maps[grid].num_face;
87: h_maps.num_reduced = maps[grid].num_reduced;
88: h_maps.deviceType = maps[grid].deviceType;
89: h_maps.numgrids = maps[grid].numgrids;
90: h_maps.Nf = Nf[grid];
91: h_maps.c_maps = (pointInterpolationP4est (*)[LANDAU_MAX_Q_FACE]) d_points->data();
92: maps[grid].vp1 = (void*)d_points;
93: h_maps.gIdx = (LandauIdx (*)[LANDAU_MAX_SPECIES][LANDAU_MAX_NQ]) d_gidx->data();
94: maps[grid].vp2 = (void*)d_gidx;
95: {
96: Kokkos::View<P4estVertexMaps, Kokkos::HostSpace> h_maps_k(&h_maps);
97: Kokkos::View<P4estVertexMaps> *d_maps_k = new Kokkos::View<P4estVertexMaps>(Kokkos::create_mirror(Kokkos::DefaultExecutionSpace::memory_space(),h_maps_k));
98: Kokkos::deep_copy (*d_maps_k, h_maps_k);
99: maps[grid].d_self = d_maps_k->data();
100: maps[grid].vp3 = (void*)d_maps_k;
101: }
102: return 0;
103: }
104: PetscErrorCode LandauKokkosDestroyMatMaps(P4estVertexMaps maps[], PetscInt num_grids)
105: {
106: for (PetscInt grid=0;grid<num_grids;grid++) {
107: auto a = static_cast<Kokkos::View<pointInterpolationP4est*[LANDAU_MAX_Q_FACE], Kokkos::LayoutRight>*>(maps[grid].vp1);
108: auto b = static_cast<Kokkos::View<LandauIdx*[LANDAU_MAX_SPECIES][LANDAU_MAX_NQ], Kokkos::LayoutRight>*>(maps[grid].vp2);
109: auto c = static_cast<Kokkos::View<P4estVertexMaps*>*>(maps[grid].vp3);
110: delete a; delete b; delete c;
111: }
112: return 0;
113: }
115: PetscErrorCode LandauKokkosStaticDataSet(DM plex, const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids,
116: PetscInt a_numCells[], PetscInt a_species_offset[], PetscInt a_mat_offset[],
117: PetscReal a_nu_alpha[], PetscReal a_nu_beta[], PetscReal a_invMass[], PetscReal a_invJ[],
118: PetscReal a_x[], PetscReal a_y[], PetscReal a_z[], PetscReal a_w[], LandauStaticData *SData_d)
119: {
120: PetscReal *BB,*DD;
121: PetscTabulation *Tf;
122: PetscInt dim;
123: PetscInt Nb=Nq,ip_offset[LANDAU_MAX_GRIDS+1],ipf_offset[LANDAU_MAX_GRIDS+1],elem_offset[LANDAU_MAX_GRIDS+1],nip,IPf_sz,Nftot;
124: PetscDS prob;
126: DMGetDimension(plex, &dim);
127: DMGetDS(plex, &prob);
129: PetscDSGetTabulation(prob, &Tf);
130: BB = Tf[0]->T[0]; DD = Tf[0]->T[1];
131: ip_offset[0] = ipf_offset[0] = elem_offset[0] = 0;
132: nip = 0;
133: IPf_sz = 0;
134: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
135: PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
136: elem_offset[grid+1] = elem_offset[grid] + a_numCells[grid];
137: nip += a_numCells[grid]*Nq;
138: ip_offset[grid+1] = nip;
139: IPf_sz += Nq*nfloc*a_numCells[grid];
140: ipf_offset[grid+1] = IPf_sz;
141: }
142: Nftot = a_species_offset[num_grids];
143: PetscKokkosInitializeCheck();
144: {
145: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_alpha (a_nu_alpha, Nftot);
146: auto alpha = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("alpha", Nftot);
147: SData_d->alpha = static_cast<void*>(alpha);
148: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_beta (a_nu_beta, Nftot);
149: auto beta = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("beta", Nftot);
150: SData_d->beta = static_cast<void*>(beta);
151: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_invMass (a_invMass,Nftot);
152: auto invMass = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("invMass", Nftot);
153: SData_d->invMass = static_cast<void*>(invMass);
154: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_BB (BB,Nq*Nb);
155: auto B = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("B", Nq*Nb);
156: SData_d->B = static_cast<void*>(B);
157: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_DD (DD,Nq*Nb*dim);
158: auto D = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("D", Nq*Nb*dim);
159: SData_d->D = static_cast<void*>(D);
160: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_invJ (a_invJ, nip*dim*dim);
161: auto invJ = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("invJ", nip*dim*dim);
162: SData_d->invJ = static_cast<void*>(invJ);
163: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_x (a_x, nip);
164: auto x = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("x", nip);
165: SData_d->x = static_cast<void*>(x);
166: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_y (a_y, nip);
167: auto y = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("y", nip);
168: SData_d->y = static_cast<void*>(y);
169: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_w (a_w, nip);
170: auto w = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("w", nip);
171: SData_d->w = static_cast<void*>(w);
173: Kokkos::deep_copy (*alpha, h_alpha);
174: Kokkos::deep_copy (*beta, h_beta);
175: Kokkos::deep_copy (*invMass, h_invMass);
176: Kokkos::deep_copy (*B, h_BB);
177: Kokkos::deep_copy (*D, h_DD);
178: Kokkos::deep_copy (*invJ, h_invJ);
179: Kokkos::deep_copy (*x, h_x);
180: Kokkos::deep_copy (*y, h_y);
181: Kokkos::deep_copy (*w, h_w);
183: if (dim==3) {
184: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_z (a_z , dim==3 ? nip : 0);
185: auto z = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("z", nip);
186: SData_d->z = static_cast<void*>(z);
187: Kokkos::deep_copy (*z, h_z);
188: } else SData_d->z = NULL;
190: //
191: const Kokkos::View<PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_NCells (a_numCells, num_grids);
192: auto nc = new Kokkos::View<PetscInt*, Kokkos::LayoutLeft> ("NCells",num_grids);
193: SData_d->NCells = static_cast<void*>(nc);
194: Kokkos::deep_copy (*nc, h_NCells);
196: const Kokkos::View<PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_species_offset (a_species_offset, num_grids+1);
197: auto soff = new Kokkos::View<PetscInt*, Kokkos::LayoutLeft> ("species_offset",num_grids+1);
198: SData_d->species_offset = static_cast<void*>(soff);
199: Kokkos::deep_copy (*soff, h_species_offset);
201: const Kokkos::View<PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_mat_offset (a_mat_offset, num_grids+1);
202: auto moff = new Kokkos::View<PetscInt*, Kokkos::LayoutLeft> ("mat_offset",num_grids+1);
203: SData_d->mat_offset = static_cast<void*>(moff);
204: Kokkos::deep_copy (*moff, h_mat_offset);
206: const Kokkos::View<PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ip_offset (ip_offset, num_grids+1);
207: auto ipoff = new Kokkos::View<PetscInt*, Kokkos::LayoutLeft> ("ip_offset",num_grids+1);
208: SData_d->ip_offset = static_cast<void*>(ipoff);
209: Kokkos::deep_copy (*ipoff, h_ip_offset);
211: const Kokkos::View<PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_elem_offset (elem_offset, num_grids+1);
212: auto eoff = new Kokkos::View<PetscInt*, Kokkos::LayoutLeft> ("elem_offset",num_grids+1);
213: SData_d->elem_offset = static_cast<void*>(eoff);
214: Kokkos::deep_copy (*eoff, h_elem_offset);
216: const Kokkos::View<PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ipf_offset (ipf_offset, num_grids+1);
217: auto ipfoff = new Kokkos::View<PetscInt*, Kokkos::LayoutLeft> ("ipf_offset",num_grids+1);
218: SData_d->ipf_offset = static_cast<void*>(ipfoff);
219: Kokkos::deep_copy (*ipfoff, h_ipf_offset);
220: #if defined(LANDAU_LAYOUT_LEFT) // preallocate dynamic data, no copy
221: auto ipfdf_data = new Kokkos::View<PetscReal***, Kokkos::LayoutLeft > ("fdf", batch_sz, dim+1, IPf_sz);
222: #else
223: auto ipfdf_data = new Kokkos::View<PetscReal***, Kokkos::LayoutRight > ("fdf",batch_sz, dim+1, IPf_sz);
224: #endif
225: SData_d->ipfdf_data = static_cast<void*>(ipfdf_data);
226: auto Eq_m = new Kokkos::View<PetscReal*, Kokkos::LayoutLeft> ("Eq_m",Nftot); // allocate but do not set, same for whole batch
227: SData_d->Eq_m = static_cast<void*>(Eq_m);
228: const Kokkos::View<LandauIdx*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_coo_elem_offsets ((LandauIdx*)SData_d->coo_elem_offsets, SData_d->coo_n_cellsTot+1);
229: auto coo_elem_offsets = new Kokkos::View<LandauIdx*, Kokkos::LayoutLeft> ("coo_elem_offsets",SData_d->coo_n_cellsTot+1);
230: const Kokkos::View<LandauIdx*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_coo_elem_fullNb ((LandauIdx*)SData_d->coo_elem_fullNb, SData_d->coo_n_cellsTot);
231: auto coo_elem_fullNb = new Kokkos::View<LandauIdx*, Kokkos::LayoutLeft> ("coo_elem_offsets",SData_d->coo_n_cellsTot);
232: const Kokkos::View<LandauIdx*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_coo_elem_point_offsets ((LandauIdx*)SData_d->coo_elem_point_offsets, SData_d->coo_n_cellsTot*(LANDAU_MAX_NQ+1));
233: auto coo_elem_point_offsets = new Kokkos::View<LandauIdx*, Kokkos::LayoutLeft> ("coo_elem_point_offsets",SData_d->coo_n_cellsTot*(LANDAU_MAX_NQ+1));
234: // assign & copy
235: Kokkos::deep_copy (*coo_elem_offsets, h_coo_elem_offsets);
236: Kokkos::deep_copy (*coo_elem_fullNb, h_coo_elem_fullNb);
237: Kokkos::deep_copy (*coo_elem_point_offsets, h_coo_elem_point_offsets);
238: // need to free this now and use pointer space
239: PetscFree3(SData_d->coo_elem_offsets,SData_d->coo_elem_fullNb,SData_d->coo_elem_point_offsets);
240: SData_d->coo_elem_offsets = static_cast<void*>(coo_elem_offsets);
241: SData_d->coo_elem_fullNb = static_cast<void*>(coo_elem_fullNb);
242: SData_d->coo_elem_point_offsets = static_cast<void*>(coo_elem_point_offsets);
243: }
244: SData_d->maps = NULL; // not used
245: return 0;
246: }
248: PetscErrorCode LandauKokkosStaticDataClear(LandauStaticData *SData_d)
249: {
250: if (SData_d->alpha) {
251: auto alpha = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->alpha);
252: delete alpha;
253: SData_d->alpha = NULL;
254: auto beta = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->beta);
255: delete beta;
256: auto invMass = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->invMass);
257: delete invMass;
258: auto B = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->B);
259: delete B;
260: auto D = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->D);
261: delete D;
262: auto invJ = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->invJ);
263: delete invJ;
264: auto x = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->x);
265: delete x;
266: auto y = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->y);
267: delete y;
268: if (SData_d->z) {
269: auto z = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->z);
270: delete z;
271: }
272: #if defined(LANDAU_LAYOUT_LEFT) // preallocate dynamic data, no copy
273: auto z = static_cast<Kokkos::View<PetscReal***, Kokkos::LayoutLeft>*>(SData_d->ipfdf_data);
274: #else
275: auto z = static_cast<Kokkos::View<PetscReal***, Kokkos::LayoutRight>*>(SData_d->ipfdf_data);
276: #endif
277: delete z;
278: auto w = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->w);
279: delete w;
280: auto Eq_m = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->Eq_m);
281: delete Eq_m;
282: // offset
283: auto nc = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->NCells);
284: delete nc;
285: auto soff = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->species_offset);
286: delete soff;
287: auto moff = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->mat_offset);
288: delete moff;
289: auto ipoff = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->ip_offset);
290: delete ipoff;
291: auto eoff = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->elem_offset);
292: delete eoff;
293: auto ipfoff = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->ipf_offset);
294: delete ipfoff;
295: auto coo_elem_offsets = static_cast<Kokkos::View<LandauIdx*, Kokkos::LayoutLeft>*>((void*)SData_d->coo_elem_offsets);
296: delete coo_elem_offsets;
297: auto coo_elem_fullNb = static_cast<Kokkos::View<LandauIdx*, Kokkos::LayoutLeft>*>((void*)SData_d->coo_elem_fullNb);
298: delete coo_elem_fullNb;
299: auto coo_elem_point_offsets = static_cast<Kokkos::View<LandauIdx*, Kokkos::LayoutLeft>*>((void*)SData_d->coo_elem_point_offsets);
300: delete coo_elem_point_offsets;
301: SData_d->coo_elem_offsets = NULL;
302: SData_d->coo_elem_point_offsets = NULL;
303: SData_d->coo_elem_fullNb = NULL;
304: }
305: return 0;
306: }
308: #define KOKKOS_SHARED_LEVEL 1
309: KOKKOS_INLINE_FUNCTION
310: PetscErrorCode landau_mat_assemble(PetscSplitCSRDataStructure d_mat, PetscScalar *coo_vals, const PetscScalar Aij, const PetscInt f, const PetscInt g, const PetscInt Nb,
311: PetscInt moffset, const PetscInt elem, const PetscInt fieldA, const P4estVertexMaps *d_maps,
312: const LandauIdx coo_elem_offsets[], const LandauIdx coo_elem_fullNb[], const LandauIdx (*coo_elem_point_offsets)[LANDAU_MAX_NQ+1],
313: const PetscInt glb_elem_idx, const PetscInt bid_coo_sz_batch)
314: {
315: PetscInt idx,q,nr,nc,d;
316: const LandauIdx *const Idxs = &d_maps->gIdx[elem][fieldA][0];
317: PetscScalar row_scale[LANDAU_MAX_Q_FACE]={0},col_scale[LANDAU_MAX_Q_FACE]={0};
318: if (coo_vals) { // mirror (i,j) in CreateStaticGPUData
319: const int fullNb = coo_elem_fullNb[glb_elem_idx],fullNb2=fullNb*fullNb;
320: idx = Idxs[f];
321: if (idx >= 0) {
322: nr = 1;
323: row_scale[0] = 1.;
324: } else {
325: idx = -idx - 1;
326: for (q = 0, nr = 0; q < d_maps->num_face; q++, nr++) {
327: if (d_maps->c_maps[idx][q].gid < 0) break;
328: row_scale[q] = d_maps->c_maps[idx][q].scale;
329: }
330: }
331: idx = Idxs[g];
332: if (idx >= 0) {
333: nc = 1;
334: col_scale[0] = 1.;
335: } else {
336: idx = -idx - 1;
337: nc = d_maps->num_face;
338: for (q = 0, nc = 0; q < d_maps->num_face; q++, nc++) {
339: if (d_maps->c_maps[idx][q].gid < 0) break;
340: col_scale[q] = d_maps->c_maps[idx][q].scale;
341: }
342: }
343: const int idx0 = bid_coo_sz_batch + coo_elem_offsets[glb_elem_idx] + fieldA*fullNb2 + fullNb * coo_elem_point_offsets[glb_elem_idx][f] + nr * coo_elem_point_offsets[glb_elem_idx][g];
344: for (int q = 0, idx2 = idx0; q < nr; q++) {
345: for (int d = 0; d < nc; d++, idx2++) {
346: coo_vals[idx2] = row_scale[q]*col_scale[d]*Aij;
347: }
348: }
349: } else {
350: PetscScalar vals[LANDAU_MAX_Q_FACE*LANDAU_MAX_Q_FACE]={0};
351: PetscInt rows[LANDAU_MAX_Q_FACE],cols[LANDAU_MAX_Q_FACE];
352: idx = Idxs[f];
353: if (idx >= 0) {
354: nr = 1;
355: rows[0] = idx;
356: row_scale[0] = 1.;
357: } else {
358: idx = -idx - 1;
359: for (q = 0, nr = 0; q < d_maps->num_face; q++, nr++) {
360: if (d_maps->c_maps[idx][q].gid < 0) break;
361: rows[q] = d_maps->c_maps[idx][q].gid;
362: row_scale[q] = d_maps->c_maps[idx][q].scale;
363: }
364: }
365: idx = Idxs[g];
366: if (idx >= 0) {
367: nc = 1;
368: cols[0] = idx;
369: col_scale[0] = 1.;
370: } else {
371: idx = -idx - 1;
372: for (q = 0, nc = 0; q < d_maps->num_face; q++, nc++) {
373: if (d_maps->c_maps[idx][q].gid < 0) break;
374: cols[q] = d_maps->c_maps[idx][q].gid;
375: col_scale[q] = d_maps->c_maps[idx][q].scale;
376: }
377: }
379: for (q = 0; q < nr; q++) rows[q] = rows[q] + moffset;
380: for (q = 0; q < nc; q++) cols[q] = cols[q] + moffset;
381: for (q = 0; q < nr; q++) {
382: for (d = 0; d < nc; d++) {
383: vals[q*nc + d] = row_scale[q]*col_scale[d]*Aij;
384: }
385: }
386: MatSetValuesDevice(d_mat,nr,rows,nc,cols,vals,ADD_VALUES);
387: }
388: return 0;
389: }
391: PetscErrorCode LandauKokkosJacobian(DM plex[], const PetscInt Nq, const PetscInt batch_sz, const PetscInt num_grids, const PetscInt a_numCells[], PetscReal a_Eq_m[], PetscScalar a_elem_closure[],
392: const PetscScalar a_xarray[], const LandauStaticData *SData_d, const PetscReal shift,
393: const PetscLogEvent events[], const PetscInt a_mat_offset[], const PetscInt a_species_offset[], Mat subJ[], Mat JacP)
394: {
395: using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space;
396: using real2_scr_t = Kokkos::View<PetscScalar**, Kokkos::LayoutRight, scr_mem_t>;
397: using g2_scr_t = Kokkos::View<PetscReal***, Kokkos::LayoutRight, scr_mem_t>;
398: using g3_scr_t = Kokkos::View<PetscReal****, Kokkos::LayoutRight, scr_mem_t>;
399: PetscInt Nb=Nq,dim,num_cells_max,Nf_max,num_cells_batch;
400: int nfaces=0;
401: LandauCtx *ctx;
402: PetscReal *d_Eq_m=NULL;
403: PetscScalar *d_vertex_f=NULL;
404: P4estVertexMaps *maps[LANDAU_MAX_GRIDS]; // this gets captured
405: PetscSplitCSRDataStructure d_mat;
406: PetscContainer container;
407: const int conc = Kokkos::DefaultExecutionSpace().concurrency(), openmp = !!(conc < 1000), team_size = (openmp==0) ? Nq : 1;
408: const PetscInt coo_sz_batch = SData_d->coo_size/batch_sz; // capture
409: auto d_alpha_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->alpha); //static data
410: const PetscReal *d_alpha = d_alpha_k->data();
411: const PetscInt Nftot = d_alpha_k->size(); // total number of species
412: auto d_beta_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->beta);
413: const PetscReal *d_beta = d_beta_k->data();
414: auto d_invMass_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->invMass);
415: const PetscReal *d_invMass = d_invMass_k->data();
416: auto d_B_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->B);
417: const PetscReal *d_BB = d_B_k->data();
418: auto d_D_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->D);
419: const PetscReal *d_DD = d_D_k->data();
420: auto d_invJ_k = *static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->invJ); // use Kokkos vector in kernels
421: auto d_x_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->x); //static data
422: const PetscReal *d_x = d_x_k->data();
423: auto d_y_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->y); //static data
424: const PetscReal *d_y = d_y_k->data();
425: auto d_z_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->z); //static data
426: const PetscReal *d_z = (LANDAU_DIM==3) ? d_z_k->data() : NULL;
427: auto d_w_k = *static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->w); //static data
428: const PetscReal *d_w = d_w_k.data();
429: // grid offsets - single vertex grid data
430: auto d_numCells_k = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->NCells);
431: const PetscInt *d_numCells = d_numCells_k->data();
432: auto d_species_offset_k = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->species_offset);
433: const PetscInt *d_species_offset = d_species_offset_k->data();
434: auto d_mat_offset_k = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->mat_offset);
435: const PetscInt *d_mat_offset = d_mat_offset_k->data();
436: auto d_ip_offset_k = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->ip_offset);
437: const PetscInt *d_ip_offset = d_ip_offset_k->data();
438: auto d_ipf_offset_k = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->ipf_offset);
439: const PetscInt *d_ipf_offset = d_ipf_offset_k->data();
440: auto d_elem_offset_k = static_cast<Kokkos::View<PetscInt*, Kokkos::LayoutLeft>*>(SData_d->elem_offset);
441: const PetscInt *d_elem_offset = d_elem_offset_k->data();
442: #if defined(LANDAU_LAYOUT_LEFT) // preallocate dynamic data, including batched vertices
443: Kokkos::View<PetscReal***, Kokkos::LayoutLeft > d_fdf_k = *static_cast<Kokkos::View<PetscReal***, Kokkos::LayoutLeft >*>(SData_d->ipfdf_data);
444: #else
445: Kokkos::View<PetscReal***, Kokkos::LayoutRight > d_fdf_k = *static_cast<Kokkos::View<PetscReal***, Kokkos::LayoutRight >*>(SData_d->ipfdf_data);
446: #endif
447: auto d_Eq_m_k = static_cast<Kokkos::View<PetscReal*, Kokkos::LayoutLeft>*>(SData_d->Eq_m); // static storage, dynamci data - E(t), copy later, single vertex
448: // COO
449: auto d_coo_elem_offsets_k = static_cast<Kokkos::View<LandauIdx*, Kokkos::LayoutLeft>*>(SData_d->coo_elem_offsets);
450: LandauIdx *d_coo_elem_offsets = (SData_d->coo_size==0) ? NULL : d_coo_elem_offsets_k->data();
451: auto d_coo_elem_fullNb_k = static_cast<Kokkos::View<LandauIdx*, Kokkos::LayoutLeft>*>(SData_d->coo_elem_fullNb);
452: LandauIdx *d_coo_elem_fullNb = (SData_d->coo_size==0) ? NULL : d_coo_elem_fullNb_k->data();
453: auto d_coo_elem_point_offsets_k = static_cast<Kokkos::View<LandauIdx*, Kokkos::LayoutLeft>*>(SData_d->coo_elem_point_offsets);
454: LandauIdx (*d_coo_elem_point_offsets)[LANDAU_MAX_NQ+1] = (SData_d->coo_size==0) ? NULL : (LandauIdx (*)[LANDAU_MAX_NQ+1])d_coo_elem_point_offsets_k->data();
456: Kokkos::View<PetscScalar*, Kokkos::LayoutRight,Kokkos::DefaultExecutionSpace> d_coo_vals_k("coo_vals", SData_d->coo_size); // device data (default space)
457: PetscScalar* d_coo_vals = (SData_d->coo_size==0) ? NULL : d_coo_vals_k.data();
459: PetscLogEventBegin(events[3],0,0,0,0);
460: DMGetApplicationContext(plex[0], &ctx);
462: DMGetDimension(plex[0], &dim);
464: if (ctx->gpu_assembly) {
465: PetscObjectQuery((PetscObject) JacP, "assembly_maps", (PetscObject *) &container);
466: if (container) {
467: P4estVertexMaps *h_maps=NULL;
468: PetscContainerGetPointer(container, (void **) &h_maps);
469: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
470: if (h_maps[grid].d_self) {
471: maps[grid] = h_maps[grid].d_self;
472: nfaces = h_maps[grid].num_face; // nface=0 for CPU assembly
473: } else {
474: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
475: }
476: }
477: if (!d_coo_vals) {
478: // this does the setup the first time called
479: MatKokkosGetDeviceMatWrite(JacP,&d_mat);
480: } else {
481: d_mat = NULL;
482: }
483: } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_PLIB, "GPU assembly but no metadata in container");
484: } else {
485: for (PetscInt grid=0 ; grid<num_grids ; grid++) maps[grid] = NULL;
486: nfaces = 0;
487: d_mat = NULL;
488: container = NULL;
489: }
490: num_cells_batch = Nf_max = num_cells_max = 0;
491: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
492: int Nfloc = a_species_offset[grid+1] - a_species_offset[grid];
493: if (Nfloc > Nf_max) Nf_max = Nfloc;
494: if (a_numCells[grid] > num_cells_max) num_cells_max = a_numCells[grid];
495: num_cells_batch += a_numCells[grid]; // we don't have a host element offset here (but in ctx)
496: }
497: const PetscInt totDim_max = Nf_max*Nq, elem_mat_size_max = totDim_max*totDim_max;
498: const PetscInt elem_mat_num_cells_max_grid = container ? 0 : num_cells_max;
499: Kokkos::View<PetscScalar****, Kokkos::LayoutRight> d_elem_mats("element matrices", batch_sz, num_grids, elem_mat_num_cells_max_grid, elem_mat_size_max); // first call have large set of global (Jac) element matrices
500: const Kokkos::View<PetscReal*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_Eq_m_k (a_Eq_m, Nftot);
501: if (a_elem_closure || a_xarray) {
502: Kokkos::deep_copy (*d_Eq_m_k, h_Eq_m_k);
503: d_Eq_m = d_Eq_m_k->data();
504: } else d_Eq_m = NULL;
505: PetscKokkosInitializeCheck();
506: PetscLogEventEnd(events[3],0,0,0,0);
507: if (a_elem_closure || a_xarray) { // Jacobian, create f & df
508: Kokkos::View<PetscScalar*, Kokkos::LayoutLeft> *d_vertex_f_k = NULL;
509: PetscLogEventBegin(events[1],0,0,0,0);
510: if (a_elem_closure) {
511: PetscInt closure_sz = 0; // argh, don't have this on the host!!!
512: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
513: PetscInt nfloc = a_species_offset[grid+1] - a_species_offset[grid];
514: closure_sz += Nq*nfloc*a_numCells[grid];
515: }
516: closure_sz *= batch_sz;
517: d_vertex_f_k = new Kokkos::View<PetscScalar*, Kokkos::LayoutLeft> ("closure",closure_sz);
518: const Kokkos::View<PetscScalar*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_closure_k (a_elem_closure, closure_sz); // Vertex data for each element
519: Kokkos::deep_copy (*d_vertex_f_k, h_closure_k);
520: d_vertex_f = d_vertex_f_k->data();
521: } else {
522: d_vertex_f = (PetscScalar*)a_xarray;
523: }
524: PetscLogEventEnd(events[1],0,0,0,0);
525: PetscLogEventBegin(events[8],0,0,0,0);
526: PetscLogGpuTimeBegin();
528: const int scr_bytes_fdf = real2_scr_t::shmem_size(Nf_max,Nb);
529: PetscInfo(plex[0], "Jacobian shared memory size: %d bytes in level %d num cells total=%D team size=%D #face=%D Nf_max=%D\n",scr_bytes_fdf,KOKKOS_SHARED_LEVEL,num_cells_batch*batch_sz,team_size,nfaces,Nf_max);
530: Kokkos::parallel_for("f, df", Kokkos::TeamPolicy<>(num_cells_batch*batch_sz, team_size, /* Kokkos::AUTO */ 16).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerTeam(scr_bytes_fdf)), KOKKOS_LAMBDA (const team_member team) {
531: const PetscInt b_Nelem = d_elem_offset[num_grids], b_elem_idx = team.league_rank()%b_Nelem, b_id = team.league_rank()/b_Nelem, IPf_sz_glb = d_ipf_offset[num_grids];
532: // find my grid
533: PetscInt grid = 0;
534: while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
535: {
536: const PetscInt loc_nip = d_numCells[grid]*Nq, loc_Nf = d_species_offset[grid+1] - d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
537: const PetscInt moffset = LAND_MOFFSET(b_id,grid,batch_sz,num_grids,d_mat_offset);
538: {
539: real2_scr_t s_coef_k(team.team_scratch(KOKKOS_SHARED_LEVEL),Nf_max,Nb);
540: PetscScalar *coef;
541: const PetscReal *invJe = &d_invJ_k((d_ip_offset[grid] + loc_elem*Nq)*dim*dim);
542: // un pack IPData
543: if (!maps[grid]) {
544: coef = &d_vertex_f[b_id*IPf_sz_glb + d_ipf_offset[grid] + loc_elem*Nb*loc_Nf]; // closure and IP indexing are the same
545: } else {
546: coef = s_coef_k.data();
547: Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,(int)loc_Nf), [=] (int f) {
548: //for (int f = 0; f < loc_Nf; ++f) {
549: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,0,(int)Nb), [=] (int b) {
550: //for (int b = 0; b < Nb; ++b) {
551: LandauIdx idx = maps[grid]->gIdx[loc_elem][f][b];
552: if (idx >= 0) {
553: coef[f*Nb+b] = d_vertex_f[idx+moffset]; // xarray data, not IP, need raw array to deal with CPU assemble case (not used)
554: } else {
555: idx = -idx - 1;
556: coef[f*Nb+b] = 0;
557: for (int q = 0; q < maps[grid]->num_face; q++) {
558: PetscInt id = maps[grid]->c_maps[idx][q].gid;
559: PetscScalar scale = maps[grid]->c_maps[idx][q].scale;
560: if (id >= 0) coef[f*Nb+b] += scale*d_vertex_f[id+moffset];
561: }
562: }
563: });
564: });
565: }
566: team.team_barrier();
567: Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,Nq), [=] (int myQi) {
568: const PetscReal *const invJ = &invJe[myQi*dim*dim]; // b_elem_idx: batch element index
569: const PetscReal *Bq = &d_BB[myQi*Nb], *Dq = &d_DD[myQi*Nb*dim];
570: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,0,(int)loc_Nf), [=] (int f) {
571: PetscInt b, e, d;
572: PetscReal refSpaceDer[LANDAU_DIM];
573: const PetscInt idx = d_ipf_offset[grid] + f*loc_nip + loc_elem*Nq + myQi;
574: d_fdf_k(b_id,0,idx) = 0.0;
575: for (d = 0; d < LANDAU_DIM; ++d) refSpaceDer[d] = 0.0;
576: for (b = 0; b < Nb; ++b) {
577: d_fdf_k(b_id,0,idx) += Bq[b]*PetscRealPart(coef[f*Nb+b]);
578: for (d = 0; d < dim; ++d) refSpaceDer[d] += Dq[b*dim+d]*PetscRealPart(coef[f*Nb+b]);
579: }
580: for (d = 0; d < dim; ++d) {
581: for (e = 0, d_fdf_k(b_id,d+1,idx) = 0.0; e < dim; ++e) {
582: d_fdf_k(b_id,d+1,idx) += invJ[e*dim+d]*refSpaceDer[e];
583: }
584: }
585: }); // f
586: }); // myQi
587: }
588: team.team_barrier();
589: } // 'grid' scope
590: }); // global elems - fdf
591: Kokkos::fence();
592: PetscLogGpuTimeEnd(); // is this a fence?
593: PetscLogEventEnd(events[8],0,0,0,0);
594: // Jacobian
595: auto jac_lambda = KOKKOS_LAMBDA (const team_member team) {
596: const PetscInt b_Nelem = d_elem_offset[num_grids], b_elem_idx = team.league_rank()%b_Nelem, b_id = team.league_rank()/b_Nelem;
597: // find my grid
598: PetscInt grid = 0;
599: while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
600: {
601: const PetscInt loc_Nf = d_species_offset[grid+1]-d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid];
602: const PetscInt moffset = LAND_MOFFSET(b_id,grid,batch_sz,num_grids,d_mat_offset);
603: const PetscInt f_off = d_species_offset[grid], totDim = loc_Nf*Nq;
604: g2_scr_t g2(team.team_scratch(KOKKOS_SHARED_LEVEL),dim,loc_Nf,Nq);
605: g3_scr_t g3(team.team_scratch(KOKKOS_SHARED_LEVEL),dim,dim,loc_Nf,Nq);
606: g2_scr_t gg2(team.team_scratch(KOKKOS_SHARED_LEVEL),dim,loc_Nf,Nq);
607: g3_scr_t gg3(team.team_scratch(KOKKOS_SHARED_LEVEL),dim,dim,loc_Nf,Nq);
608: // get g2[] & g3[]
609: Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,Nq), [=] (int myQi) {
610: using Kokkos::parallel_reduce;
611: const PetscInt jpidx_glb = d_ip_offset[grid] + loc_elem * Nq + myQi;
612: const PetscReal *invJ = &d_invJ_k(jpidx_glb*dim*dim);
613: const PetscReal vj[3] = {d_x[jpidx_glb], d_y[jpidx_glb], d_z ? d_z[jpidx_glb] : 0}, wj = d_w[jpidx_glb];
614: landau_inner_red::TensorValueType gg_temp; // reduce on part of gg2 and g33 for IP jpidx_g
615: Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (team, (int)d_ip_offset[num_grids]), [=] (const int& ipidx, landau_inner_red::TensorValueType & ggg) {
616: const PetscReal wi = d_w[ipidx], x = d_x[ipidx], y = d_y[ipidx];
617: PetscReal temp1[3] = {0, 0, 0}, temp2 = 0;
618: PetscInt fieldA,d2,d3,f_off_r,grid_r,ipidx_g,nip_loc_r,loc_Nf_r;
619: #if LANDAU_DIM==2
620: PetscReal Ud[2][2], Uk[2][2], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
621: LandauTensor2D(vj, x, y, Ud, Uk, mask);
622: #else
623: PetscReal U[3][3], z = d_z[ipidx], mask = (PetscAbs(vj[0]-x) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[1]-y) < 100*PETSC_SQRT_MACHINE_EPSILON && PetscAbs(vj[2]-z) < 100*PETSC_SQRT_MACHINE_EPSILON) ? 0. : 1.;
624: LandauTensor3D(vj, x, y, z, U, mask);
625: #endif
626: grid_r = 0;
627: while (ipidx >= d_ip_offset[grid_r+1]) grid_r++; // yuck search for grid
628: f_off_r = d_species_offset[grid_r];
629: ipidx_g = ipidx - d_ip_offset[grid_r];
630: nip_loc_r = d_numCells[grid_r]*Nq;
631: loc_Nf_r = d_species_offset[grid_r+1] - d_species_offset[grid_r];
632: for (fieldA = 0; fieldA < loc_Nf_r; ++fieldA) {
633: const PetscInt idx = d_ipf_offset[grid_r] + fieldA*nip_loc_r + ipidx_g;
634: temp1[0] += d_fdf_k(b_id,1,idx)*d_beta[fieldA+f_off_r]*d_invMass[fieldA+f_off_r];
635: temp1[1] += d_fdf_k(b_id,2,idx)*d_beta[fieldA+f_off_r]*d_invMass[fieldA+f_off_r];
636: #if LANDAU_DIM==3
637: temp1[2] += d_fdf_k(b_id,3,idx)*d_beta[fieldA+f_off_r]*d_invMass[fieldA+f_off_r];
638: #endif
639: temp2 += d_fdf_k(b_id,0,idx)*d_beta[fieldA+f_off_r];
640: }
641: temp1[0] *= wi;
642: temp1[1] *= wi;
643: #if LANDAU_DIM==3
644: temp1[2] *= wi;
645: #endif
646: temp2 *= wi;
647: #if LANDAU_DIM==2
648: for (d2 = 0; d2 < 2; d2++) {
649: for (d3 = 0; d3 < 2; ++d3) {
650: /* K = U * grad(f): g2=e: i,A */
651: ggg.gg2[d2] += Uk[d2][d3]*temp1[d3];
652: /* D = -U * (I \kron (fx)): g3=f: i,j,A */
653: ggg.gg3[d2][d3] += Ud[d2][d3]*temp2;
654: }
655: }
656: #else
657: for (d2 = 0; d2 < 3; ++d2) {
658: for (d3 = 0; d3 < 3; ++d3) {
659: /* K = U * grad(f): g2 = e: i,A */
660: ggg.gg2[d2] += U[d2][d3]*temp1[d3];
661: /* D = -U * (I \kron (fx)): g3 = f: i,j,A */
662: ggg.gg3[d2][d3] += U[d2][d3]*temp2;
663: }
664: }
665: #endif
666: }, Kokkos::Sum<landau_inner_red::TensorValueType>(gg_temp));
667: // add alpha and put in gg2/3
668: Kokkos::parallel_for(Kokkos::ThreadVectorRange (team, (int)loc_Nf), [&] (const int& fieldA) {
669: PetscInt d2,d3;
670: for (d2 = 0; d2 < dim; d2++) {
671: gg2(d2,fieldA,myQi) = gg_temp.gg2[d2]*d_alpha[fieldA+f_off];
672: for (d3 = 0; d3 < dim; d3++) {
673: gg3(d2,d3,fieldA,myQi) = -gg_temp.gg3[d2][d3]*d_alpha[fieldA+f_off]*d_invMass[fieldA+f_off];
674: }
675: }
676: });
677: /* add electric field term once per IP */
678: Kokkos::parallel_for(Kokkos::ThreadVectorRange (team, (int)loc_Nf), [&] (const int& fieldA) {
679: gg2(dim-1,fieldA,myQi) += d_Eq_m[fieldA+f_off];
680: });
681: Kokkos::parallel_for(Kokkos::ThreadVectorRange (team, (int)loc_Nf), [=] (const int& fieldA) {
682: int d,d2,d3,dp;
683: /* Jacobian transform - g2, g3 - per thread (2D) */
684: for (d = 0; d < dim; ++d) {
685: g2(d,fieldA,myQi) = 0;
686: for (d2 = 0; d2 < dim; ++d2) {
687: g2(d,fieldA,myQi) += invJ[d*dim+d2]*gg2(d2,fieldA,myQi);
688: g3(d,d2,fieldA,myQi) = 0;
689: for (d3 = 0; d3 < dim; ++d3) {
690: for (dp = 0; dp < dim; ++dp) {
691: g3(d,d2,fieldA,myQi) += invJ[d*dim + d3]*gg3(d3,dp,fieldA,myQi)*invJ[d2*dim + dp];
692: }
693: }
694: g3(d,d2,fieldA,myQi) *= wj;
695: }
696: g2(d,fieldA,myQi) *= wj;
697: }
698: });
699: }); // Nq
700: team.team_barrier();
701: { /* assemble */
702: for (PetscInt fieldA = 0; fieldA < loc_Nf; fieldA++) {
703: /* assemble */
704: Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,Nb), [=] (int f) {
705: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,0,Nb), [=] (int g) {
706: PetscScalar t = 0;
707: for (int qj = 0 ; qj < Nq ; qj++) { // look at others integration points
708: const PetscReal *BJq = &d_BB[qj*Nb], *DIq = &d_DD[qj*Nb*dim];
709: for (int d = 0; d < dim; ++d) {
710: t += DIq[f*dim+d]*g2(d,fieldA,qj)*BJq[g];
711: for (int d2 = 0; d2 < dim; ++d2) {
712: t += DIq[f*dim + d]*g3(d,d2,fieldA,qj)*DIq[g*dim + d2];
713: }
714: }
715: }
716: if (elem_mat_num_cells_max_grid) { // CPU assembly
717: const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
718: d_elem_mats(b_id,grid,loc_elem,fOff) = t;
719: } else {
720: landau_mat_assemble (d_mat, d_coo_vals, t, f, g, Nb, moffset, loc_elem, fieldA, maps[grid], d_coo_elem_offsets, d_coo_elem_fullNb, d_coo_elem_point_offsets, b_elem_idx, b_id*coo_sz_batch);
721: }
722: });
723: });
724: }
725: }
726: } // scope with 'grid'
727: };
728: PetscLogEventBegin(events[4],0,0,0,0);
729: PetscLogGpuTimeBegin();
730: const int scr_bytes = 2*(g2_scr_t::shmem_size(dim,Nf_max,Nq) + g3_scr_t::shmem_size(dim,dim,Nf_max,Nq));
731: Kokkos::parallel_for("Jacobian", Kokkos::TeamPolicy<>(num_cells_batch*batch_sz, /*team_size*/1, 1).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerTeam(scr_bytes)), jac_lambda);
732: PetscLogGpuTimeEnd();
733: PetscLogEventEnd(events[4],0,0,0,0);
734: if (d_vertex_f_k) delete d_vertex_f_k;
735: } else { // mass - 2*Nb is guess at max size (2D Q3 is 31 =< 2*Nb = 32)
736: using fieldMats_scr_t = Kokkos::View<PetscScalar*, Kokkos::LayoutRight, scr_mem_t>;
737: PetscInt loc_ass_sz = 1; // captured
738: PetscLogEventBegin(events[16],0,0,0,0);
739: PetscLogGpuTimeBegin();
740: if (loc_ass_sz) loc_ass_sz = ctx->SData_d.coo_max_fullnb*ctx->SData_d.coo_max_fullnb;
741: int scr_bytes = fieldMats_scr_t::shmem_size(loc_ass_sz); // + idx_scr_t::shmem_size(Nb,nfaces) + scale_scr_t::shmem_size(Nb,nfaces);
742: PetscInfo(plex[0], "Mass shared memory size: %d bytes in level %d conc=%D team size=%D #face=%D Nb=%D, %s assembly\n",scr_bytes,KOKKOS_SHARED_LEVEL,conc,team_size,nfaces,Nb, d_coo_vals ? (loc_ass_sz==0 ? "COO" : "optimized COO") : "CSR");
743: Kokkos::parallel_for("Mass", Kokkos::TeamPolicy<>(num_cells_batch*batch_sz, team_size, /* Kokkos::AUTO */ 16).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerTeam(scr_bytes)), KOKKOS_LAMBDA (const team_member team) {
744: fieldMats_scr_t s_fieldMats(team.team_scratch(KOKKOS_SHARED_LEVEL),loc_ass_sz); // Only used for GPU assembly (ie, not first pass)
745: const PetscInt b_Nelem = d_elem_offset[num_grids], b_elem_idx = team.league_rank()%b_Nelem, b_id = team.league_rank()/b_Nelem;
746: // find my grid
747: PetscInt grid = 0;
748: while (b_elem_idx >= d_elem_offset[grid+1]) grid++;
749: {
750: const PetscInt loc_Nf = d_species_offset[grid+1]-d_species_offset[grid], loc_elem = b_elem_idx - d_elem_offset[grid], totDim = loc_Nf*Nq, jpidx_0 = d_ip_offset[grid] + loc_elem * Nq;
751: const PetscInt moffset = LAND_MOFFSET(b_id,grid,batch_sz,num_grids,d_mat_offset);
752: for (int fieldA = 0; fieldA < loc_Nf; fieldA++) {
753: /* assemble */
754: Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,Nb), [=] (int f) {
755: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,0,Nb), [=] (int g) {
756: PetscScalar t = 0;
757: for (int qj = 0 ; qj < Nq ; qj++) { // look at others integration points
758: const PetscReal *BJq = &d_BB[qj*Nb];
759: const PetscInt jpidx_glb = jpidx_0 + qj;
760: if (dim==2) {
761: t += BJq[f] * d_w_k(jpidx_glb) * shift * BJq[g] * 2. * PETSC_PI;
762: } else {
763: t += BJq[f] * d_w_k(jpidx_glb) * shift * BJq[g];
764: }
765: }
766: if (elem_mat_num_cells_max_grid) {
767: const PetscInt fOff = (fieldA*Nb + f)*totDim + fieldA*Nb + g;
768: d_elem_mats(b_id,grid,loc_elem,fOff) = t;
769: } else {
770: if (d_coo_vals && loc_ass_sz) {
771: PetscInt idx,q,nr,nc;
772: const LandauIdx *const Idxs = &maps[grid]->gIdx[loc_elem][fieldA][0];
773: PetscScalar row_scale[LANDAU_MAX_Q_FACE]={0},col_scale[LANDAU_MAX_Q_FACE]={0};
774: const int fullNb = d_coo_elem_fullNb[b_elem_idx], num_face = maps[grid]->num_face;
775: const PetscScalar Aij = t;
776: idx = Idxs[f];
777: if (idx >= 0) {
778: nr = 1;
779: row_scale[0] = 1.;
780: } else {
781: idx = -idx - 1;
782: for (q = 0, nr = 0; q < num_face; q++, nr++) {
783: if (maps[grid]->c_maps[idx][q].gid < 0) break;
784: row_scale[q] = maps[grid]->c_maps[idx][q].scale;
785: }
786: }
787: idx = Idxs[g];
788: if (idx >= 0) {
789: nc = 1;
790: col_scale[0] = 1.;
791: } else {
792: idx = -idx - 1;
793: for (q = 0, nc = 0; q < num_face; q++, nc++) {
794: if (maps[grid]->c_maps[idx][q].gid < 0) break;
795: col_scale[q] = maps[grid]->c_maps[idx][q].scale;
796: }
797: }
798: const int idx0 = fullNb * d_coo_elem_point_offsets[b_elem_idx][f] + nr * d_coo_elem_point_offsets[b_elem_idx][g];
799: for (int q = 0, idx2 = idx0; q < nr; q++) {
800: for (int d = 0; d < nc; d++, idx2++) {
801: s_fieldMats(idx2) = row_scale[q]*col_scale[d]*Aij;
802: }
803: }
804: } else {
805: landau_mat_assemble (d_mat, d_coo_vals, t, f, g, Nb, moffset, loc_elem, fieldA, maps[grid], d_coo_elem_offsets, d_coo_elem_fullNb, d_coo_elem_point_offsets, b_elem_idx, b_id*coo_sz_batch);
806: }
807: }
808: });
809: });
810: if (!elem_mat_num_cells_max_grid) {
811: if (d_coo_vals && loc_ass_sz) {
812: const int fullNb = d_coo_elem_fullNb[b_elem_idx], fullNb2=fullNb*fullNb;
813: const int idx0 = b_id*coo_sz_batch + d_coo_elem_offsets[b_elem_idx] + fieldA*fullNb2;
814: team.team_barrier();
815: #if 0
816: Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,0,fullNb), [=] (int gg) {
817: Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,fullNb), [=] (int ff) {
818: const int idx = fullNb * ff + gg;
819: d_coo_vals[idx0 + idx] = s_fieldMats(idx);
820: });
821: });
822: #else
823: Kokkos::parallel_for(Kokkos::TeamVectorRange(team,0,fullNb2), [=] (int idx) { d_coo_vals[idx0 + idx] = s_fieldMats(idx); });
824: #endif
825: }
826: }
827: } // field
828: } // grid
829: });
830: PetscLogGpuTimeEnd();
831: PetscLogEventEnd(events[16],0,0,0,0);
832: }
833: Kokkos::fence();
834: if (d_coo_vals) {
835: MatSetValuesCOO(JacP,d_coo_vals,ADD_VALUES);
836: } else if (elem_mat_num_cells_max_grid) { // CPU assembly
837: Kokkos::View<PetscScalar****, Kokkos::LayoutRight>::HostMirror h_elem_mats = Kokkos::create_mirror_view(d_elem_mats);
838: Kokkos::deep_copy (h_elem_mats, d_elem_mats);
840: for (PetscInt b_id = 0 ; b_id < batch_sz ; b_id++) { // OpenMP (once)
841: for (PetscInt grid=0 ; grid<num_grids ; grid++) {
842: PetscSection section, globalSection;
843: PetscInt cStart,cEnd;
844: Mat B = subJ[ LAND_PACK_IDX(b_id,grid) ];
845: PetscInt moffset = LAND_MOFFSET(b_id,grid,batch_sz,num_grids,a_mat_offset), nloc, nzl, colbuf[1024], row;
846: const PetscInt *cols;
847: const PetscScalar *vals;
848: PetscLogEventBegin(events[5],0,0,0,0);
849: DMPlexGetHeightStratum(plex[grid],0,&cStart,&cEnd);
850: DMGetLocalSection(plex[grid], §ion);
851: DMGetGlobalSection(plex[grid], &globalSection);
852: PetscLogEventEnd(events[5],0,0,0,0);
853: PetscLogEventBegin(events[6],0,0,0,0);
854: for (PetscInt ej = cStart ; ej < cEnd; ++ej) {
855: const PetscScalar *elMat = &h_elem_mats(b_id,grid,ej-cStart,0);
856: DMPlexMatSetClosure(plex[grid], section, globalSection, B, ej, elMat, ADD_VALUES);
857: if (grid==0 && ej==-1) {
858: const PetscInt loc_Nf = a_species_offset[grid+1]-a_species_offset[grid], totDim = loc_Nf*Nq;
859: int d,f;
860: PetscPrintf(PETSC_COMM_SELF,"Kokkos Element matrix %d/%d\n",1,(int)a_numCells[grid]);
861: for (d = 0; d < totDim; ++d){
862: for (f = 0; f < totDim; ++f) PetscPrintf(PETSC_COMM_SELF," %12.5e", PetscRealPart(elMat[d*totDim + f]));
863: PetscPrintf(PETSC_COMM_SELF,"\n");
864: }
865: exit(14);
866: }
867: }
868: // move nest matrix to global JacP
869: MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY);
870: MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY);
871: MatGetSize(B, &nloc, NULL);
872: for (int i=0 ; i<nloc ; i++) {
873: MatGetRow(B,i,&nzl,&cols,&vals);
875: for (int j=0; j<nzl; j++) colbuf[j] = cols[j] + moffset;
876: row = i + moffset;
877: MatSetValues(JacP,1,&row,nzl,colbuf,vals,ADD_VALUES);
878: MatRestoreRow(B,i,&nzl,&cols,&vals);
879: }
880: MatDestroy(&subJ[ LAND_PACK_IDX(b_id,grid) ]);
881: PetscLogEventEnd(events[6],0,0,0,0);
882: } // grids
883: }
884: }
885: return 0;
886: }
887: } // extern "C"