Actual source code: cupminit.inc
1: /* A template file for the CUDA Programming Model (CUPM) initialization, to be included in init.c. CUPM is either CUDA or HIP. */
3: PetscBool PetscCUPMSynchronize = PETSC_FALSE; /* Device synchronization flag; bound to the synchronize option in PetscOptionsCheckCUPM() and assigned logView in PetscCUPMInitializeAndView() */
4: PetscBool PetscCUPMInitialized = PETSC_FALSE; /* Becomes PETSC_TRUE once the BLAS and dense-solver handles have been initialized by PetscCUPMInitialize() or PetscCUPMInitializeCheck() */
6: cupmStream_t PetscDefaultCupmStream = NULL; /* PETSc's default stream; stays NULL (the default stream) unless -petsc_default_use_null_stream is false, in which case a stream is created */
8: static PetscBool PetscNotUseCUPM = PETSC_FALSE; /* Assert the code will not use this type of devices; set by PetscCUPMInitialize() for device NONE (-3) and makes PetscCUPMInitializeCheck() error out */
10: cupmEvent_t petsc_gputimer_begin = NULL; /* The GPU event for begin; created with cupmEventCreate() during device initialization */
11: cupmEvent_t petsc_gputimer_end = NULL; /* The GPU event for end; created with cupmEventCreate() during device initialization */
13: /* Device validation after it is lazily initialized */
14: static PetscErrorCode PetscCUPMValidate(void)
15: {
16: PetscBool mpi_gpu_awareness;
19: if (use_gpu_aware_mpi) {
20: /* For OpenMPI, we could do a compile time check with "defined(PETSC_HAVE_OMPI_MAJOR_VERSION) && defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT"
21: to see if it is CUDA-aware. However, recent versions of IBM Spectrum MPI (e.g., 10.3.1) on Summit meet above conditions, but one has to use
22: jsrun --smpiargs=-gpu to really enable GPU-aware MPI. So we do the check at runtime with a code that works only with GPU-aware MPI.
23: */
24: mpi_gpu_awareness = PetscMPICUPMAwarenessCheck();
25: if (!mpi_gpu_awareness) {
26: (*PetscErrorPrintf)("PETSc is configured with GPU support, but your MPI is not GPU-aware. For better performance, please use a GPU-aware MPI.\n");
27: (*PetscErrorPrintf)("If you do not care, add option -use_gpu_aware_mpi 0. To not see the message again, add the option to your .petscrc, OR add it to the env var PETSC_OPTIONS.\n");
28: (*PetscErrorPrintf)("If you do care, for IBM Spectrum MPI on OLCF Summit, you may need jsrun --smpiargs=-gpu.\n");
29: (*PetscErrorPrintf)("For OpenMPI, you need to configure it --with-cuda (https://www.open-mpi.org/faq/?category=buildcuda)\n");
30: (*PetscErrorPrintf)("For MVAPICH2-GDR, you need to set MV2_USE_CUDA=1 (http://mvapich.cse.ohio-state.edu/userguide/gdr/)\n");
31: (*PetscErrorPrintf)("For Cray-MPICH, you need to set MPICH_RDMA_ENABLED_CUDA=1 (https://www.olcf.ornl.gov/tutorials/gpudirect-mpich-enabled-cuda/)\n");
32: PETSCABORT(PETSC_COMM_SELF,PETSC_ERR_LIB);
33: }
34: }
35: return(0);
36: }
38: /*@C
39: PetscCUDAInitializeCheck - Check if CUDA is initialized. If not, initialize it.
41: Logically collective
43: Level: beginner
45: Notes:
46: In PETSc lazy device initialization, PETSc calls this function right before creating the first CUDA/HIP object.
47: It can be used by application developers who want to lazily initialize CUDA/HIP when they start to use it (which may be before a PETSc CUDA/HIP object is created).
49: .seealso: PetscCUDAInitialize(), PetscHIPInitialize(), PetscHIPInitializeCheck()
50: @*/
51: PETSC_EXTERN PetscErrorCode PetscCUDAInitializeCheck(void);
54: /*@C
55: PetscHIPInitializeCheck - Check if HIP is initialized. If not, initialize it.
57: Logically collective
59: Level: beginner
61: Notes:
62: See notes of PetscCUDAInitializeCheck() for details.
64: .seealso: PetscHIPInitialize(), PetscCUDAInitialize(), PetscCUDAInitializeCheck()
65: @*/
66: PETSC_EXTERN PetscErrorCode PetscHIPInitializeCheck(void);
68: PetscErrorCode PetscCUPMInitializeCheck(void)
69: {
70: PetscErrorCode ierr;
71: cupmError_t cerr;
72: int devId,devCount;
73: PetscMPIInt rank;
74: static PetscBool cupmValdidateChecked = PETSC_FALSE;
75: PetscBool useNull = PETSC_TRUE; /* use the default (NULL) stream as petsc's default stream */
78: if (PetscNotUseCUPM) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"You asserted the code wouldn't use devices with -device_set none, but now trying to create a device object. Remove this option or see manpage of PetscCUPMInitialize().");
79: if (!PetscCUPMInitialized) {
80: cerr = cupmGetDeviceCount(&devCount);
81: cupmGetLastError(); /* Reset the last error */
82: if (cerr != cupmSuccess) devCount = 0;
83: if (devCount > 0) {
84: cerr = cupmSetDeviceFlags(cupmDeviceMapHost);
85: cupmGetLastError(); /* Reset the last error */
86: if (cerr == cupmSuccess) { /* It implies device runtime has not been initialized? */
87: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
88: devId = rank % devCount;
89: for (int i=0; i<3; i++) {
90: cerr = cupmSetDevice(devId);
91: if (cerr == cupmSuccess) break;
92: if (cerr != cupmErrorMemoryAllocation && cerr != cupmErrorLaunchOutOfResources) CHKERRCUPM(cerr);
93: if (i < 2) {PetscSleep(3);}
94: }
95: if (cerr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU_RESOURCE,"Unable to initialize the GPU");
96: #if defined(PETSC_CUDA_GENERATION)
97: {
98: struct cudaDeviceProp dp;
99: int gen;
100: cerr = cudaGetDeviceProperties(&dp,devId);CHKERRCUPM(cerr);
101: gen = 10*dp.major+dp.minor;
102: if (gen < PETSC_CUDA_GENERATION) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP_SYS,"PETSc compiled for NVIDIA generation %d but hardware is generation %d",PETSC_CUDA_GENERATION,gen);
103: }
104: #endif
105: } else if (cerr == cupmErrorSetOnActiveProcess) {
106: /* It implies user has initialized device runtime outside of petsc. We do nothing to respect the device choice. */
107: }
108: }
109: if (!PetscDefaultCupmStream) { /* user may have set the default stream already */
110: PetscOptionsGetBool(NULL,NULL,"-petsc_default_use_null_stream",&useNull,NULL);
111: if (!useNull) {cerr = cupmStreamCreate(&PetscDefaultCupmStream);CHKERRCUPM(cerr);}
112: }
113: PetscCUPMBLASInitializeHandle();
114: PetscCUPMSOLVERDnInitializeHandle();
115: PetscCUPMInitialized = PETSC_TRUE;
116: /* Initialize CUDA event timers */
117: cerr = cupmEventCreate(&petsc_gputimer_begin);CHKERRCUPM(cerr);
118: cerr = cupmEventCreate(&petsc_gputimer_end);CHKERRCUPM(cerr);
119: }
120: if (!cupmValdidateChecked) {
121: PetscCUPMValidate();
122: cupmValdidateChecked = PETSC_TRUE;
123: }
124: PetscCreatedGpuObjects = PETSC_TRUE;
125: return(0);
126: }
128: /*@C
129: PetscCUDAInitialize - Initializes CUDA (eagerly in PetscInitialize() or soon after PetscInitialize()) and cuBLAS/cuSPARSE libraries on the device
131: Logically collective
133: Input Parameters:
134: + comm - the MPI communicator that will utilize the devices
135: - device - the device assigned to current MPI process. Special values like PETSC_DECIDE or PETSC_DEFAULT have special meanings (see details below)
137: Options Database:
138: + -cuda_device <device> - the device assigned to current MPI rank. <device> is case-insensitive and can be:
139: NONE (or none, or -3) : the code will not use any device, otherwise it will error out;
140: PETSC_DEFAULT(or DEFAULT, or -2) : do not explicitly set device, i.e., use whatever device already set by user (probably before PetscInitialize()). Init device runtime etc;
141: PETSC_DECIDE (or DECIDE, or -1) : assign MPI ranks in comm to available devices in round-robin, and init device runtime etc on the selected device;
142: >= 0 integer : assign the device with this id to current MPI process. Error out if <device> is invalid. Init device runtime etc on this device;
143: With PETSC_{DECIDE, DEFAULT}, if there are actually no devices, the code can still run, but it will error out when trying to create device objects.
144: . -cuda_view - view information about the devices.
145: . -cuda_synchronize - wait at the end of asynchronous device calls so that their time gets credited to the current event. With -log_view, the default is true, otherwise false.
146: . -log_view - logging, however if alone or combined with `-cuda_device DEFAULT | DECIDE | >=0 int`, will init device; if combined with `-cuda_device none`, won't init device.
147: . -petsc_default_use_null_stream - If true (default), petsc will use the default NULL stream to launch its kernels and call vendor libraries such as cuBLAS, cuSPARSE etc.
148: - -use_gpu_aware_mpi - assume the MPI is device/GPU-aware when communicating data on devices. Default true.
150: Level: beginner
152: Notes:
153: Unless the input parameter <device> = -3, this routine initializes the CUDA device. It also initializes the cuBLAS/cuSPARSE libraries, which
154: takes a lot of time. Initializing them early helps avoid skewing timings in -log_view.
156: If this routine is triggered by command line options, it is called in PetscInitialize(). If users want to directly call it, they should call it immediately after PetscInitialize().
158: If this is not called then the CUDA initialization is delayed until first creation of a CUDA object and this can affect the timing since they happen asynchronously on different nodes and take a lot of time.
160: .seealso: PetscCUDAInitializeCheck(), PetscHIPInitialize(), PetscHIPInitializeCheck()
161: @*/
162: PETSC_EXTERN PetscErrorCode PetscCUDAInitialize(MPI_Comm comm,PetscInt device);
163: /*@C
164: PetscHIPInitialize - Initializes HIP (eagerly in PetscInitialize() or soon after PetscInitialize()) and hipBLAS/hipSPARSE libraries on the device
166: Logically collective
168: Input Parameter:
169: (see notes)
171: Options Database:
172: (see notes)
174: Level: beginner
176: Notes:
177: The functionality, parameters and options database of this routine are similar to that of PetscCUDAInitialize(), except that the option names
178: are -hip_device, -hip_view, -hip_synchronize instead. See manpage of PetscCUDAInitialize() for details.
180: .seealso: PetscHIPInitializeCheck(), PetscCUDAInitialize(), PetscCUDAInitializeCheck()
181: @*/
182: PETSC_EXTERN PetscErrorCode PetscHIPInitialize(MPI_Comm comm,PetscInt device);
184: PetscErrorCode PetscCUPMInitialize(MPI_Comm comm,PetscInt device)
185: {
186: PetscErrorCode ierr;
187: cupmError_t cerr;
188: int devId,devCount=0;
189: const PetscInt PETSC_NONE=-3; /* Unlike PETSC_DECIDE, we don't have a macro PETSC_NONE in petsc headers */
190: PetscMPIInt rank;
193: if (!PetscCUPMInitialized) {
194: cerr = cupmGetDeviceCount(&devCount);
195: cupmGetLastError(); /* Reset the last error */
196: if (cerr != cupmSuccess) devCount = 0;
197: if (device >= 0) { /* User wants to use this specific device */
198: cerr = cupmSetDeviceFlags(cupmDeviceMapHost); /* Allow it to fail since user might have already initialized the device. */
199: cupmGetLastError(); /* Reset the last error */
200: cerr = cupmSetDevice((int)device);CHKERRCUPM(cerr);
201: } else if (device == PETSC_DECIDE) { /* Assign MPI ranks to available devices in round-robin */
202: if (devCount > 0) { /* Allow no device as long as user does not use devices */
203: /* Set the device flags so that it can map host memory */
204: cerr = cupmSetDeviceFlags(cupmDeviceMapHost);CHKERRCUPM(cerr);
205: MPI_Comm_rank(comm,&rank);
206: devId = rank % devCount;
207: cerr = cupmSetDevice(devId);CHKERRCUPM(cerr);
208: }
209: } else if (device == PETSC_DEFAULT) {
210: /* Do nothing, i.e., use whatever device set by user before PetscInitialize() */
211: } else if (device == PETSC_NONE) {
212: PetscNotUseCUPM = PETSC_TRUE; /* Assert the code won't use devices even there are */
213: } else SETERRQ1(comm,PETSC_ERR_ARG_OUTOFRANGE,"Wrong device (%D) passed to -device_set <dev>. Must be NONE(-3),PETSC_DEFAULT(-2),PETSC_DECIDE(-1) or a non-negative integer.",device);
215: if (devCount > 0 && device != PETSC_NONE) {
216: /* Do costly device handles initialization here to not to distort petsc logging later */
217: PetscBool useNull = PETSC_TRUE; /* use the default (NULL) stream as petsc's default stream */
218: PetscOptionsGetBool(NULL,NULL,"-petsc_default_use_null_stream",&useNull,NULL);
219: if (!useNull) {cerr = cupmStreamCreate(&PetscDefaultCupmStream);CHKERRCUPM(cerr);}
220: PetscCUPMBLASInitializeHandle();
221: PetscCUPMSOLVERDnInitializeHandle();
222: PetscCUPMInitialized = PETSC_TRUE;
223: }
224: /* Initialize CUDA event timers */
225: cerr = cupmEventCreate(&petsc_gputimer_begin);CHKERRCUPM(cerr);
226: cerr = cupmEventCreate(&petsc_gputimer_end);CHKERRCUPM(cerr);
227: }
228: PetscCUPMInitializeCheck();
229: return(0);
230: }
232: /*
233: The routine works as a driver to initialize and view the device
235: Input Parameter:
236: initDevice: True if user explicitly has -cuda/hip_device xxx
237: device: Significant when <initDeivce>. Basically, it is the integer presentation of the xxx above
238: logView: True if -log_view or -log_summary
239: devView: True if -{cuda,hip}_view
240: */
241: static PetscErrorCode PetscCUPMInitializeAndView(PetscBool initDevice,PetscInt device,PetscBool logView,PetscBool devView)
242: {
243: PetscErrorCode ierr;
244: cupmError_t cerr;
245: PetscMPIInt rank;
246: int devId,devCount;
247: cupmDeviceProp prop;
250: PetscCUPMSynchronize = logView;
251: if (initDevice) {PetscCUPMInitialize(PETSC_COMM_WORLD,device);}
252: else if (logView || devView) { /* With -{log,cuda,hip}_view, we want to do costly gpu runtime initialization early so that not to distort the timing later. */
253: devCount = 0;
254: cerr = cupmGetDeviceCount(&devCount);
255: cupmGetLastError(); /* Reset the last error */
256: if (cerr == cupmSuccess && devCount >= 1) { /* There are devices */
257: devId = 0;
258: if (devCount > 1) { /* Decide which device to init when there are multiple */
259: cerr = cupmSetDeviceFlags(cupmDeviceMapHost);
260: cupmGetLastError(); /* Reset the last error */
261: if (cerr == cupmSuccess) { /* It implies gpu runtime has not been initialized */
262: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
263: devId = rank % devCount;
264: cerr = cupmSetDevice(devId);CHKERRCUPM(cerr);
265: } else if (cerr == cupmErrorSetOnActiveProcess) {
266: /* It means user initialized gpu runtime outside of petsc. We respect the device choice. */
267: cerr = cupmGetDevice(&devId);CHKERRCUPM(cerr);
268: }
269: }
270: PetscCUPMInitialize(PETSC_COMM_WORLD,(PetscInt)devId);
271: #if defined(PETSC_HAVE_KOKKOS)
272: if (logView) { /* With -log_view, we always do eager init */
273: PetscKokkosInitializeCheck();
274: }
275: #endif
276: }
277: }
279: if (devView) {
280: MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
281: cerr = cupmGetDeviceCount(&devCount);CHKERRCUPM(cerr);
282: if (rank == 0) {
283: for (devId = 0; devId < devCount; ++devId) {
284: cerr = cupmGetDeviceProperties(&prop,devId);CHKERRCUPM(cerr);
285: PetscPrintf(PETSC_COMM_WORLD, "device %d: %s\n", devId, prop.name);
286: }
287: }
288: cerr = cupmGetDevice(&devId);CHKERRCUPM(cerr);
289: PetscSynchronizedPrintf(PETSC_COMM_WORLD,"[%d] Using device %d.\n",rank,devId);
290: PetscSynchronizedFlush(PETSC_COMM_WORLD,PETSC_STDOUT);
291: }
292: return(0);
293: }
295: /*
296: The routine checks user's device related options and initializes the device if instructed.
298: Input Parameter:
299: logView: True if -log_view or -log_summary
300: */
301: static PetscErrorCode PetscOptionsCheckCUPM(PetscBool logView)
302: {
304: PetscBool initDevice = PETSC_FALSE,devView = PETSC_FALSE,devNone = PETSC_FALSE;
305: PetscInt device = 0;
306: char devStr[32]={0};
307: #if defined(PETSC_HAVE_KOKKOS)
308: PetscBool set,kinited,devDefault;
309: #endif
312: #if defined(PETSC_HAVE_KOKKOS)
313: PetscKokkosIsInitialized_Private(&kinited);
314: if (kinited) { /* Check if Petsc device options conform with Kokkos' device if Kokkos is init'ed before PetscInitialize() */
315: PetscOptionsGetString(NULL,NULL,cupmSetDeviceStr,devStr,sizeof(devStr),&set);
316: if (set) { /* If users have initialized Kokkos themselves, but also had e.g., -cuda_device XXX, for simplicity, make sure XXX is DEFAULT */
317: PetscStrcasecmp("DEFAULT",devStr,&devDefault);
318: if (!devDefault) {PetscStrcasecmp("PETSC_DEFAULT",devStr,&devDefault);}
319: if (!devDefault) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"Kokkos was initialized before PetscInitialize(), but you have %s %s. Remove the option or use %s default.",cupmSetDeviceStr,devStr,cupmSetDeviceStr);
320: } else { /* If users did not have e.g., '-cuda_device XXX', insert one here so that petsc can continue its own device initialization */
321: PetscOptionsSetValue(NULL,cupmSetDeviceStr,"DEFAULT");
322: }
323: }
324: #endif
326: PetscOptionsBegin(PETSC_COMM_WORLD,NULL,cupmOptionsStr,"Sys");
327: PetscOptionsString(cupmSetDeviceStr,NULL,PetscCUPMInitializeStr,devStr,devStr,sizeof(devStr),&initDevice);
328: PetscStrcasecmp("none",devStr,&devNone);
329: if (devNone) device = -3; /* -3 is the locally used PETSC_NONE in Petsc{CUDA/HIP}Initialize() */
330: else {PetscOptionsInt(cupmSetDeviceStr,"Set which MPI ranks to use which devices",PetscCUPMInitializeStr,device,&device,&initDevice);}
331: PetscOptionsBool(cupmSynchronizeStr,"Wait for the device to complete operations before returning to the CPU (on by default with -log_summary or -log_view)",NULL,PetscCUPMSynchronize,&PetscCUPMSynchronize,NULL);
332: PetscOptionsName(cupmViewStr,"Display device information and assignments",NULL,&devView);
333: PetscOptionsEnd();
334: PetscCUPMInitializeAndView(initDevice,device,logView,devView);
335: return(0);
336: }