Actual source code: xmllogevent.c

  1: /*************************************************************************************
  2:  *    M A R I T I M E  R E S E A R C H  I N S T I T U T E  N E T H E R L A N D S     *
  3:  *************************************************************************************
  4:  *    authors: Bas van 't Hof, Koos Huijssen, Christiaan M. Klaij                    *
  5:  *************************************************************************************
  6:  *    content: Support for nested PetscTimers                                        *
  7:  *************************************************************************************/
  8: #include <petsclog.h>
  9: #include <petsc/private/logimpl.h>
 10: #include <petsctime.h>
 11: #include <petscviewer.h>
 12: #include "../src/sys/logging/xmlviewer.h"

 14: #if defined(PETSC_USE_LOG)

 16: /*
 17:  * Support for nested PetscTimers
 18:  *
 19:  * PetscTimers keep track of a lot of useful information: Wall clock times,
 20:  * message passing statistics, flop counts.  Information about the nested structure
 21:  * of the timers is lost. Example:
 22:  *
 23:  * 7:30   Start: awake
 24:  * 7:30      Start: morning routine
 25:  * 7:40         Start: eat
 26:  * 7:49         Done:  eat
 27:  * 7:43      Done:  morning routine
 28:  * 8:15      Start: work
 29:  * 12:15        Start: eat
 30:  * 12:45        Done:  eat
 31:  * 16:00     Done:  work
 32:  * 16:30     Start: evening routine
 33:  * 18:30        Start: eat
 34:  * 19:15        Done:  eat
 35:  * 22:00     Done:  evening routine
 36:  * 22:00  Done:  awake
 37:  *
 38:  * Petsc timers provide the following timer results:
 39:  *
 40:  *    awake:              1 call    14:30 hours
 41:  *    morning routine:    1 call     0:13 hours
 42:  *    eat:                3 calls    1:24 hours
 43:  *    work:               1 call     7:45 hours
 44:  *    evening routine     1 call     5:30 hours
 45:  *
 46:  * Nested timers can be used to get the following table:
 47:  *
 48:  *   [1 call]: awake                14:30 hours
 49:  *   [1 call]:    morning routine         0:13 hours         ( 2 % of awake)
 50:  *   [1 call]:       eat                       0:09 hours         (69 % of morning routine)
 51:  *                   rest (morning routine)    0:04 hours         (31 % of morning routine)
 52:  *   [1 call]:    work                    7:45 hours         (53 % of awake)
 53:  *   [1 call]:       eat                       0:30 hours         ( 6 % of work)
 54:  *                   rest (work)               7:15 hours         (94 % of work)
 55:  *   [1 call]:    evening routine         5:30 hours         (38 % of awake)
 56:  *   [1 call]:       eat                       0:45 hours         (14 % of evening routine)
 57:  *                   rest (evening routine)    4:45 hours         (86 % of morning routine)
 58:  *
 59:  * We ignore the concept of 'stages', because these seem to be conflicting notions, or at least,
 60:  * the nested timers make the stages unnecessary.
 61:  *
 62:  */

 64: /*
 65:  * Data structures for keeping track of nested timers:
 66:  *
 67:  *   nestedEvents: information about the timers that have actually been activated
 68:  *   dftParentActive: if a timer is started now, it is part of (nested inside) the dftParentActive
 69:  *
 70:  * The Default-timers are used to time the nested timers. Every nested timer corresponds to
 71:  * (one or more) default timers, where one of the default timers has the same event-id as the
 72:  * nested one.
 73:  *
 74:  * Because of the risk of confusion between nested timer ids and default timer ids, we
 75:  * introduce a typedef for nested events (NestedEventId) and use the existing type PetscLogEvent
 76:  * only for default events. Also, all nested event variables are prepended with 'nst', and
 77:  * default timers with 'dft'.
 78:  */

 80: #define DFT_ID_AWAKE -1

 82: typedef PetscLogEvent NestedEventId;
 83: typedef struct {
 84:   NestedEventId   nstEvent;         /* event-code for this nested event, argument 'event' in PetscLogEventStartNested */
 85:   int             nParents;         /* number of 'dftParents': the default timer which was the dftParentActive when this nested timer was activated */
 86:   PetscLogEvent  *dftParentsSorted; /* The default timers which were the dftParentActive when this nested event was started */
 87:   PetscLogEvent  *dftEvents;        /* The default timers which represent the different 'instances' of this nested event */

 89:   PetscLogEvent  *dftParents;       /* The default timers which were the dftParentActive when this nested event was started */
 90:   PetscLogEvent  *dftEventsSorted;  /* The default timers which represent the different 'instances' of this nested event */
 91: } PetscNestedEvent;

 93: static PetscLogEvent    dftParentActive        = DFT_ID_AWAKE;
 94: static int              nNestedEvents          = 0;
 95: static int              nNestedEventsAllocated = 0;
 96: static PetscNestedEvent *nestedEvents          = NULL;
 97: static PetscLogDouble   thresholdTime          = 0.01; /* initial value was 0.1 */

 99: #define THRESHOLD (thresholdTime/100.0+1e-12)

101: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
102: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4);
103: PETSC_INTERN PetscErrorCode PetscLogView_Nested(PetscViewer);
104: PETSC_INTERN PetscErrorCode PetscLogView_Flamegraph(PetscViewer);

106: /*@C
107:   PetscLogNestedBegin - Turns on nested logging of objects and events. This logs flop
108:   rates and object creation and should not slow programs down too much.

110:   Logically Collective over PETSC_COMM_WORLD

112:   Options Database Keys:
113: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

115:   Usage:
116: .vb
117:       PetscInitialize(...);
118:       PetscLogNestedBegin();
119:        ... code ...
120:       PetscLogView(viewer);
121:       PetscFinalize();
122: .ve

124:   Level: advanced

126: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin()
127: @*/
128: PetscErrorCode PetscLogNestedBegin(void)
129: {

132:   nNestedEventsAllocated = 10;
133:   PetscMalloc1(nNestedEventsAllocated,&nestedEvents);
134:   dftParentActive = DFT_ID_AWAKE;
135:   nNestedEvents =1;

137:   /* 'Awake' is nested event 0. It has no parents */
138:   nestedEvents[0].nstEvent          = 0;
139:   nestedEvents[0].nParents          = 0;
140:   nestedEvents[0].dftParentsSorted  = NULL;
141:   nestedEvents[0].dftEvents         = NULL;
142:   nestedEvents[0].dftParents        = NULL;
143:   nestedEvents[0].dftEventsSorted   = NULL;

145:   PetscLogSet(PetscLogEventBeginNested,PetscLogEventEndNested);
146:   return 0;
147: }

149: /* Delete the data structures for the nested timers */
150: PetscErrorCode PetscLogNestedEnd(void)
151: {
152:   int            i;

154:   if (!nestedEvents) return 0;
155:   for (i=0; i<nNestedEvents; i++) {
156:     PetscFree4(nestedEvents[i].dftParentsSorted,nestedEvents[i].dftEventsSorted,nestedEvents[i].dftParents,nestedEvents[i].dftEvents);
157:   }
158:   PetscFree(nestedEvents);
159:   nestedEvents           = NULL;
160:   nNestedEvents          = 0;
161:   nNestedEventsAllocated = 0;
162:   return 0;
163: }

165: /*
166:  UTILITIES: FIND STUFF IN SORTED ARRAYS

168:     dftIndex - index to be found
169:     dftArray - sorted array of PetscLogEvent-ids
170:     narray - dimension of dftArray
171:     entry - entry in the array where dftIndex may be found;

173:      if dftArray[entry] != dftIndex, then dftIndex is not part of dftArray
174:      In that case, the dftIndex can be inserted at this entry.
175: */
176: static PetscErrorCode PetscLogEventFindDefaultTimer(PetscLogEvent dftIndex,const PetscLogEvent *dftArray,int narray,int *entry)
177: {
178:   if (narray==0 || dftIndex <= dftArray[0]) {
179:     *entry = 0;
180:   } else if (dftIndex > dftArray[narray-1]) {
181:     *entry = narray;
182:   } else {
183:     int ihigh = narray-1, ilow=0;
184:     while (ihigh>ilow) {
185:       const int imiddle = (ihigh+ilow)/2;
186:       if (dftArray[imiddle] > dftIndex) {
187:         ihigh = imiddle;
188:       } else if (dftArray[imiddle]<dftIndex) {
189:         ilow = imiddle+1;
190:       } else {
191:         ihigh = imiddle;
192:         ilow  = imiddle;
193:       }
194:     }
195:     *entry = ihigh;
196:   }
197:   return 0;
198: }

200: /*
201:     Utility: find the nested event with given identification

203:     nstEvent - Nested event to be found
204:     entry - entry in the nestedEvents where nstEvent may be found;

206:     if nestedEvents[entry].nstEvent != nstEvent, then index is not part of iarray
207: */
208: static PetscErrorCode PetscLogEventFindNestedTimer(NestedEventId nstEvent,int *entry)
209: {
210:   if (nNestedEvents==0 || nstEvent <= nestedEvents[0].nstEvent) {
211:     *entry = 0;
212:   } else if (nstEvent > nestedEvents[nNestedEvents-1].nstEvent) {
213:     *entry = nNestedEvents;
214:   } else {
215:     int ihigh = nNestedEvents-1,  ilow = 0;
216:     while (ihigh>ilow) {
217:       const int imiddle = (ihigh+ilow)/2;
218:       if (nestedEvents[imiddle].nstEvent > nstEvent) {
219:         ihigh = imiddle;
220:       } else if (nestedEvents[imiddle].nstEvent<nstEvent) {
221:         ilow = imiddle+1;
222:       } else {
223:         ihigh = imiddle;
224:         ilow  = imiddle;
225:       }
226:     }
227:     *entry = ihigh;
228:   }
229:   return 0;
230: }

232: /*
233:  Nested logging is not prepared yet to support user-defined logging stages, so for now we force logging on the main stage.
234:  Using PetscLogStage{Push/Pop}() would be more appropriate, but these two calls do extra bookkeeping work we don't need.
235: */

237: #define MAINSTAGE 0

239: static PetscLogStage savedStage = 0;

241: static inline PetscErrorCode PetscLogStageOverride(void)
242: {
243:   PetscStageLog  stageLog = petsc_stageLog;

245:   if (stageLog->curStage == MAINSTAGE) return 0;
246:   savedStage = stageLog->curStage;
247:   stageLog->curStage = MAINSTAGE;
248:   PetscIntStackPush(stageLog->stack, MAINSTAGE);
249:   return 0;
250: }

252: static inline PetscErrorCode PetscLogStageRestore(void)
253: {
254:   PetscStageLog  stageLog = petsc_stageLog;

256:   if (savedStage == MAINSTAGE) return 0;
257:   stageLog->curStage = savedStage;
258:   PetscIntStackPop(stageLog->stack, &savedStage);
259:   return 0;
260: }

262: /******************************************************************************************/
263: /* Start a nested event */
264: static PetscErrorCode PetscLogEventBeginNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
265: {
266:   int             entry, pentry, tentry,i;
267:   PetscLogEvent   dftEvent;

269:   PetscLogEventFindNestedTimer(nstEvent, &entry);
270:   if (entry>=nNestedEvents || nestedEvents[entry].nstEvent != nstEvent) {
271:     /* Nested event doesn't exist yet: create it */

273:     if (nNestedEvents==nNestedEventsAllocated) {
274:       /* Enlarge and re-allocate nestedEvents if needed */
275:       PetscNestedEvent *tmp = nestedEvents;
276:       PetscMalloc1(2*nNestedEvents,&nestedEvents);
277:       nNestedEventsAllocated*=2;
278:       PetscArraycpy(nestedEvents, tmp, nNestedEvents);
279:       PetscFree(tmp);
280:     }

282:     /* Clear space in nestedEvents for new nested event */
283:     nNestedEvents++;
284:     for (i = nNestedEvents-1; i>entry; i--) {
285:       nestedEvents[i] = nestedEvents[i-1];
286:     }

288:     /* Create event in nestedEvents */
289:     nestedEvents[entry].nstEvent = nstEvent;
290:     nestedEvents[entry].nParents=1;
291:     PetscMalloc4(1,&nestedEvents[entry].dftParentsSorted,1,&nestedEvents[entry].dftEventsSorted,1,&nestedEvents[entry].dftParents,1,&nestedEvents[entry].dftEvents);

293:     /* Fill in new event */
294:     pentry = 0;
295:     dftEvent = (PetscLogEvent) nstEvent;

297:     nestedEvents[entry].nstEvent                 = nstEvent;
298:     nestedEvents[entry].dftParents[pentry]       = dftParentActive;
299:     nestedEvents[entry].dftEvents[pentry]        = dftEvent;
300:     nestedEvents[entry].dftParentsSorted[pentry] = dftParentActive;
301:     nestedEvents[entry].dftEventsSorted[pentry]  = dftEvent;

303:   } else {
304:     /* Nested event exists: find current dftParentActive among parents */
305:     PetscLogEvent *dftParentsSorted = nestedEvents[entry].dftParentsSorted;
306:     PetscLogEvent *dftEvents        = nestedEvents[entry].dftEvents;
307:     int           nParents          = nestedEvents[entry].nParents;

309:     PetscLogEventFindDefaultTimer( dftParentActive, dftParentsSorted, nParents, &pentry);

311:     if (pentry>=nParents || dftParentActive != dftParentsSorted[pentry]) {
312:       /* dftParentActive not in the list: add it to the list */
313:       int           i;
314:       PetscLogEvent *dftParents      = nestedEvents[entry].dftParents;
315:       PetscLogEvent *dftEventsSorted = nestedEvents[entry].dftEventsSorted;
316:       char          name[100];

318:       /* Register a new default timer */
319:       sprintf(name, "%d -> %d", (int) dftParentActive, (int) nstEvent);
320:       PetscLogEventRegister(name, 0, &dftEvent);
321:       PetscLogEventFindDefaultTimer( dftEvent, dftEventsSorted, nParents, &tentry);

323:       /* Reallocate parents and dftEvents to make space for new parent */
324:       PetscMalloc4(1+nParents,&nestedEvents[entry].dftParentsSorted,1+nParents,&nestedEvents[entry].dftEventsSorted,1+nParents,&nestedEvents[entry].dftParents,1+nParents,&nestedEvents[entry].dftEvents);
325:       PetscArraycpy(nestedEvents[entry].dftParentsSorted, dftParentsSorted, nParents);
326:       PetscArraycpy(nestedEvents[entry].dftEventsSorted,  dftEventsSorted,  nParents);
327:       PetscArraycpy(nestedEvents[entry].dftParents,       dftParents,       nParents);
328:       PetscArraycpy(nestedEvents[entry].dftEvents,        dftEvents,        nParents);
329:       PetscFree4(dftParentsSorted,dftEventsSorted,dftParents,dftEvents);

331:       dftParents       = nestedEvents[entry].dftParents;
332:       dftEvents        = nestedEvents[entry].dftEvents;
333:       dftParentsSorted = nestedEvents[entry].dftParentsSorted;
334:       dftEventsSorted  = nestedEvents[entry].dftEventsSorted;

336:       nestedEvents[entry].nParents++;
337:       nParents++;

339:       for (i = nParents-1; i>pentry; i--) {
340:         dftParentsSorted[i] = dftParentsSorted[i-1];
341:         dftEvents[i]        = dftEvents[i-1];
342:       }
343:       for (i = nParents-1; i>tentry; i--) {
344:         dftParents[i]      = dftParents[i-1];
345:         dftEventsSorted[i] = dftEventsSorted[i-1];
346:       }

348:       /* Fill in the new default timer */
349:       dftParentsSorted[pentry] = dftParentActive;
350:       dftEvents[pentry]        = dftEvent;
351:       dftParents[tentry]       = dftParentActive;
352:       dftEventsSorted[tentry]  = dftEvent;

354:     } else {
355:       /* dftParentActive was found: find the corresponding default 'dftEvent'-timer */
356:       dftEvent = nestedEvents[entry].dftEvents[pentry];
357:     }
358:   }

360:   /* Start the default 'dftEvent'-timer and update the dftParentActive */
361:   PetscLogStageOverride();
362:   PetscLogEventBeginDefault(dftEvent,t,o1,o2,o3,o4);
363:   PetscLogStageRestore();
364:   dftParentActive = dftEvent;
365:   return 0;
366: }

368: /* End a nested event */
369: static PetscErrorCode PetscLogEventEndNested(NestedEventId nstEvent, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4)
370: {
371:   int             entry, pentry, nParents;
372:   PetscLogEvent  *dftEventsSorted;

374:   /* Find the nested event */
375:   PetscLogEventFindNestedTimer(nstEvent, &entry);
378:   dftEventsSorted = nestedEvents[entry].dftEventsSorted;
379:   nParents        = nestedEvents[entry].nParents;

381:   /* Find the current default timer among the 'dftEvents' of this event */
382:   PetscLogEventFindDefaultTimer( dftParentActive, dftEventsSorted, nParents, &pentry);


387:   /* Stop the default timer and update the dftParentActive */
388:   PetscLogStageOverride();
389:   PetscLogEventEndDefault(dftParentActive,t,o1,o2,o3,o4);
390:   PetscLogStageRestore();
391:   dftParentActive = nestedEvents[entry].dftParents[pentry];
392:   return 0;
393: }

395: /*@
396:    PetscLogSetThreshold - Set the threshold time for logging the events; this is a percentage out of 100, so 1. means any event
397:           that takes 1 or more percent of the time.

399:   Logically Collective over PETSC_COMM_WORLD

401:   Input Parameter:
402: .   newThresh - the threshold to use

404:   Output Parameter:
405: .   oldThresh - the previously set threshold value

407:   Options Database Keys:
408: . -log_view :filename.xml:ascii_xml - Prints an XML summary of flop and timing information to the file

410:   Usage:
411: .vb
412:       PetscInitialize(...);
413:       PetscLogNestedBegin();
414:       PetscLogSetThreshold(0.1,&oldthresh);
415:        ... code ...
416:       PetscLogView(viewer);
417:       PetscFinalize();
418: .ve

420:   Level: advanced

422: .seealso: PetscLogDump(), PetscLogAllBegin(), PetscLogView(), PetscLogTraceBegin(), PetscLogDefaultBegin(),
423:           PetscLogNestedBegin()
424: @*/
425: PetscErrorCode PetscLogSetThreshold(PetscLogDouble newThresh, PetscLogDouble *oldThresh)
426: {
427:   if (oldThresh) *oldThresh = thresholdTime;
428:   if (newThresh == PETSC_DECIDE)  newThresh = 0.01;
429:   if (newThresh == PETSC_DEFAULT) newThresh = 0.01;
430:   thresholdTime = PetscMax(newThresh, 0.0);
431:   return 0;
432: }

434: static PetscErrorCode PetscPrintExeSpecs(PetscViewer viewer)
435: {
436:   char               arch[128],hostname[128],username[128],pname[PETSC_MAX_PATH_LEN],date[128];
437:   char               version[256], buildoptions[128] = "";
438:   PetscMPIInt        size;
439:   size_t             len;

441:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
442:   PetscGetArchType(arch,sizeof(arch));
443:   PetscGetHostName(hostname,sizeof(hostname));
444:   PetscGetUserName(username,sizeof(username));
445:   PetscGetProgramName(pname,sizeof(pname));
446:   PetscGetDate(date,sizeof(date));
447:   PetscGetVersion(version,sizeof(version));

449:   PetscViewerXMLStartSection(viewer, "runspecification", "Run Specification");
450:   PetscViewerXMLPutString(   viewer, "executable"  , "Executable"   , pname);
451:   PetscViewerXMLPutString(   viewer, "architecture", "Architecture" , arch);
452:   PetscViewerXMLPutString(   viewer, "hostname"    , "Host"         , hostname);
453:   PetscViewerXMLPutInt(      viewer, "nprocesses"  , "Number of processes", size);
454:   PetscViewerXMLPutString(   viewer, "user"        , "Run by user"  , username);
455:   PetscViewerXMLPutString(   viewer, "date"        , "Started at"   , date);
456:   PetscViewerXMLPutString(   viewer, "petscrelease", "Petsc Release", version);

458:   if (PetscDefined(USE_DEBUG)) {
459:     PetscStrlcat(buildoptions, "Debug ", sizeof(buildoptions));
460:   }
461:   if (PetscDefined(USE_COMPLEX)) {
462:     PetscStrlcat(buildoptions, "Complex ", sizeof(buildoptions));
463:   }
464:   if (PetscDefined(USE_REAL_SINGLE)) {
465:     PetscStrlcat(buildoptions, "Single ", sizeof(buildoptions));
466:   } else if (PetscDefined(USE_REAL___FLOAT128)) {
467:     PetscStrlcat(buildoptions, "Quadruple ", sizeof(buildoptions));
468:   } else if (PetscDefined(USE_REAL___FP16)) {
469:     PetscStrlcat(buildoptions, "Half ", sizeof(buildoptions));
470:   }
471:   if (PetscDefined(USE_64BIT_INDICES)) {
472:     PetscStrlcat(buildoptions, "Int64 ", sizeof(buildoptions));
473:   }
474: #if defined(__cplusplus)
475:   PetscStrlcat(buildoptions, "C++ ", sizeof(buildoptions));
476: #endif
477:   PetscStrlen(buildoptions,&len);
478:   if (len) {
479:     PetscViewerXMLPutString(viewer, "petscbuildoptions", "Petsc build options", buildoptions);
480:   }
481:   PetscViewerXMLEndSection(viewer, "runspecification");
482:   return 0;
483: }

485: /* Print the global performance: max, max/min, average and total of
486:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
487:  */
488: static PetscErrorCode PetscPrintXMLGlobalPerformanceElement(PetscViewer viewer, const char *name, const char *desc, PetscLogDouble local_val, const PetscBool print_average, const PetscBool print_total)
489: {
490:   PetscLogDouble  min, tot, ratio, avg;
491:   MPI_Comm        comm;
492:   PetscMPIInt     rank, size;
493:   PetscLogDouble  valrank[2], max[2];

495:   PetscObjectGetComm((PetscObject)viewer,&comm);
496:   MPI_Comm_size(PetscObjectComm((PetscObject)viewer),&size);
497:   MPI_Comm_rank(comm, &rank);

499:   valrank[0] = local_val;
500:   valrank[1] = (PetscLogDouble) rank;
501:   MPIU_Allreduce(&local_val, &min, 1, MPIU_PETSCLOGDOUBLE,  MPI_MIN,    comm);
502:   MPIU_Allreduce(valrank,    &max, 1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
503:   MPIU_Allreduce(&local_val, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);
504:   avg  = tot/((PetscLogDouble) size);
505:   if (min != 0.0) ratio = max[0]/min;
506:   else ratio = 0.0;

508:   PetscViewerXMLStartSection(viewer, name, desc);
509:   PetscViewerXMLPutDouble(viewer, "max", NULL, max[0], "%e");
510:   PetscViewerXMLPutInt(   viewer, "maxrank"  , "rank at which max was found" , (PetscMPIInt) max[1]);
511:   PetscViewerXMLPutDouble(viewer, "ratio", NULL, ratio, "%f");
512:   if (print_average) {
513:     PetscViewerXMLPutDouble(viewer, "average", NULL, avg, "%e");
514:   }
515:   if (print_total) {
516:     PetscViewerXMLPutDouble(viewer, "total", NULL, tot, "%e");
517:   }
518:   PetscViewerXMLEndSection(viewer, name);
519:   return 0;
520: }

522: /* Print the global performance: max, max/min, average and total of
523:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
524:  */
525: static PetscErrorCode PetscPrintGlobalPerformance(PetscViewer viewer, PetscLogDouble locTotalTime)
526: {
527:   PetscLogDouble  flops, mem, red, mess;
528:   const PetscBool print_total_yes   = PETSC_TRUE,
529:                   print_total_no    = PETSC_FALSE,
530:                   print_average_no  = PETSC_FALSE,
531:                   print_average_yes = PETSC_TRUE;

533:   /* Must preserve reduction count before we go on */
534:   red = petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct;

536:   /* Calculate summary information */
537:   PetscViewerXMLStartSection(viewer, "globalperformance", "Global performance");

539:   /*   Time */
540:   PetscPrintXMLGlobalPerformanceElement(viewer, "time", "Time (sec)", locTotalTime, print_average_yes, print_total_no);

542:   /*   Objects */
543:   PetscPrintXMLGlobalPerformanceElement(viewer, "objects", "Objects", (PetscLogDouble) petsc_numObjects, print_average_yes, print_total_no);

545:   /*   Flop */
546:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflop", "MFlop", petsc_TotalFlops/1.0E6, print_average_yes, print_total_yes);

548:   /*   Flop/sec -- Must talk to Barry here */
549:   if (locTotalTime != 0.0) flops = petsc_TotalFlops/locTotalTime;
550:   else flops = 0.0;
551:   PetscPrintXMLGlobalPerformanceElement(viewer, "mflops", "MFlop/sec", flops/1.0E6, print_average_yes, print_total_yes);

553:   /*   Memory */
554:   PetscMallocGetMaximumUsage(&mem);
555:   if (mem > 0.0) {
556:     PetscPrintXMLGlobalPerformanceElement(viewer, "memory", "Memory (MiB)", mem/1024.0/1024.0, print_average_yes, print_total_yes);
557:   }
558:   /*   Messages */
559:   mess = 0.5*(petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct);
560:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagetransfers", "MPI Message Transfers", mess, print_average_yes, print_total_yes);

562:   /*   Message Volume */
563:   mess = 0.5*(petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len);
564:   PetscPrintXMLGlobalPerformanceElement(viewer, "messagevolume", "MPI Message Volume (MiB)", mess/1024.0/1024.0, print_average_yes, print_total_yes);

566:   /*   Reductions */
567:   PetscPrintXMLGlobalPerformanceElement(viewer, "reductions", "MPI Reductions", red , print_average_no, print_total_no);
568:   PetscViewerXMLEndSection(viewer, "globalperformance");
569:   return 0;
570: }

572: typedef struct {
573:   PetscLogEvent  dftEvent;
574:   NestedEventId  nstEvent;
575:   PetscLogEvent  dftParent;
576:   NestedEventId  nstParent;
577:   PetscBool      own;
578:   int            depth;
579:   NestedEventId* nstPath;
580: } PetscNestedEventTree;

582: /* Compare timers to sort them in the tree */
583: static int compareTreeItems(const void *item1_, const void *item2_)
584: {
585:   int                  i;
586:   PetscNestedEventTree *item1 = (PetscNestedEventTree *) item1_;
587:   PetscNestedEventTree *item2 = (PetscNestedEventTree *) item2_;

589:   for (i=0; i<PetscMin(item1->depth,item2->depth); i++) {
590:     if (item1->nstPath[i]<item2->nstPath[i]) return -1;
591:     if (item1->nstPath[i]>item2->nstPath[i]) return +1;
592:   }
593:   if (item1->depth < item2->depth) return -1;
594:   if (item1->depth > item2->depth) return 1;
595:   return 0;
596: }
597: /*
598:  * Do MPI communication to get the complete, nested calling tree for all processes: there may be
599:  * calls that happen in some processes, but not in others.
600:  *
601:  * The output, tree[nTimers] is an array of PetscNestedEventTree-structs.
602:  * The tree is sorted so that the timers can be printed in the order of appearance.
603:  *
604:  * For tree-items which appear in the trees of multiple processes (which will be most items), the
605:  * following rule is followed:
606:  * + if information from my own process is available, then that is the information stored in tree.
607:  *   otherwise it is some other process's information.
608:  */
609: static PetscErrorCode PetscLogNestedTreeCreate(PetscViewer viewer, PetscNestedEventTree **p_tree, int *p_nTimers)
610: {
611:   PetscNestedEventTree *tree = NULL, *newTree;
612:   int                  *treeIndices;
613:   int                  nTimers, totalNTimers, i, j, iTimer0, maxDefaultTimer;
614:   int                  yesno;
615:   PetscBool            done;
616:   int                  maxdepth;
617:   int                  depth;
618:   int                  illegalEvent;
619:   int                  iextra;
620:   NestedEventId        *nstPath, *nstMyPath;
621:   MPI_Comm             comm;

623:   PetscObjectGetComm((PetscObject)viewer,&comm);

625:   /* Calculate memory needed to store everybody's information and allocate tree */
626:   nTimers = 0;
627:   for (i=0; i<nNestedEvents; i++) nTimers += nestedEvents[i].nParents;

629:   PetscMalloc1(nTimers,&tree);

631:   /* Fill tree with readily available information */
632:   iTimer0 = 0;
633:   maxDefaultTimer =0;
634:   for (i=0; i<nNestedEvents; i++) {
635:     int           nParents          = nestedEvents[i].nParents;
636:     NestedEventId nstEvent          = nestedEvents[i].nstEvent;
637:     PetscLogEvent *dftParentsSorted = nestedEvents[i].dftParentsSorted;
638:     PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
639:     for (j=0; j<nParents; j++) {
640:       maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);

642:       tree[iTimer0+j].dftEvent   = dftEvents[j];
643:       tree[iTimer0+j].nstEvent   = nstEvent;
644:       tree[iTimer0+j].dftParent  = dftParentsSorted[j];
645:       tree[iTimer0+j].own        = PETSC_TRUE;

647:       tree[iTimer0+j].nstParent  = 0;
648:       tree[iTimer0+j].depth      = 0;
649:       tree[iTimer0+j].nstPath    = NULL;
650:     }
651:     iTimer0 += nParents;
652:   }

654:   /* Calculate the global maximum for the default timer index, so array treeIndices can
655:    * be allocated only once */
656:   MPIU_Allreduce(&maxDefaultTimer, &j, 1, MPI_INT, MPI_MAX, comm);
657:   maxDefaultTimer = j;

659:   /* Find default timer's place in the tree */
660:   PetscCalloc1(maxDefaultTimer+1,&treeIndices);
661:   treeIndices[0] = 0;
662:   for (i=0; i<nTimers; i++) {
663:     PetscLogEvent dftEvent = tree[i].dftEvent;
664:     treeIndices[dftEvent] = i;
665:   }

667:   /* Find each dftParent's nested identification */
668:   for (i=0; i<nTimers; i++) {
669:     PetscLogEvent dftParent = tree[i].dftParent;
670:     if (dftParent!= DFT_ID_AWAKE) {
671:       int j = treeIndices[dftParent];
672:       tree[i].nstParent = tree[j].nstEvent;
673:     }
674:   }

676:   /* Find depths for each timer path */
677:   done = PETSC_FALSE;
678:   maxdepth = 0;
679:   while (!done) {
680:     done = PETSC_TRUE;
681:     for (i=0; i<nTimers; i++) {
682:       if (tree[i].dftParent == DFT_ID_AWAKE) {
683:         tree[i].depth = 1;
684:         maxdepth = PetscMax(1,maxdepth);
685:       } else {
686:         int j = treeIndices[tree[i].dftParent];
687:         depth = 1+tree[j].depth;
688:         if (depth>tree[i].depth) {
689:           done          = PETSC_FALSE;
690:           tree[i].depth = depth;
691:           maxdepth      = PetscMax(depth,maxdepth);
692:         }
693:       }
694:     }
695:   }

697:   /* Allocate the paths in the entire tree */
698:   for (i=0; i<nTimers; i++) {
699:     depth = tree[i].depth;
700:     PetscCalloc1(depth,&tree[i].nstPath);
701:   }

703:   /* Calculate the paths for all timers */
704:   for (depth=1; depth<=maxdepth; depth++) {
705:     for (i=0; i<nTimers; i++) {
706:       if (tree[i].depth==depth) {
707:         if (depth>1) {
708:           int    j = treeIndices[tree[i].dftParent];
709:           PetscArraycpy(tree[i].nstPath,tree[j].nstPath,depth-1);
710:         }
711:         tree[i].nstPath[depth-1] = tree[i].nstEvent;
712:       }
713:     }
714:   }
715:   PetscFree(treeIndices);

717:   /* Sort the tree on basis of the paths */
718:   qsort(tree, nTimers, sizeof(PetscNestedEventTree), compareTreeItems);

720:   /* Allocate an array to store paths */
721:   depth = maxdepth;
722:   MPIU_Allreduce(&depth, &maxdepth, 1, MPI_INT, MPI_MAX, comm);
723:   PetscMalloc1(maxdepth+1, &nstPath);
724:   PetscMalloc1(maxdepth+1, &nstMyPath);

726:   /* Find an illegal nested event index (1+largest nested event index) */
727:   illegalEvent = 1+nestedEvents[nNestedEvents-1].nstEvent;
728:   i = illegalEvent;
729:   MPIU_Allreduce(&i, &illegalEvent, 1, MPI_INT, MPI_MAX, comm);

731:   /* First, detect timers which are not available in this process, but are available in others
732:    *        Allocate a new tree, that can contain all timers
733:    * Then,  fill the new tree with all (own and not-own) timers */
734:   newTree= NULL;
735:   for (yesno=0; yesno<=1; yesno++) {
736:     depth  = 1;
737:     i      = 0;
738:     iextra = 0;
739:     while (depth>0) {
740:       int       j;
741:       PetscBool same;

743:       /* Construct the next path in this process's tree:
744:        * if necessary, supplement with invalid path entries */
745:       depth++;
747:       if (i<nTimers) {
748:         for (j=0;             j<tree[i].depth; j++) nstMyPath[j] = tree[i].nstPath[j];
749:         for (j=tree[i].depth; j<depth;         j++) nstMyPath[j] = illegalEvent;
750:       } else {
751:         for (j=0;             j<depth;         j++) nstMyPath[j] = illegalEvent;
752:       }

754:       /* Communicate with other processes to obtain the next path and its depth */
755:       MPIU_Allreduce(nstMyPath, nstPath, depth, MPI_INT, MPI_MIN, comm);
756:       for (j=depth-1; (int) j>=0; j--) {
757:         if (nstPath[j]==illegalEvent) depth=j;
758:       }

760:       if (depth>0) {
761:         /* If the path exists */

763:         /* check whether the next path is the same as this process's next path */
764:         same = PETSC_TRUE;
765:         for (j=0; same && j<depth; j++) { same = (same &&  nstMyPath[j] == nstPath[j]) ? PETSC_TRUE : PETSC_FALSE;}

767:         if (same) {
768:           /* Register 'own path' */
769:           if (newTree) newTree[i+iextra] = tree[i];
770:           i++;
771:         } else {
772:           /* Register 'not an own path' */
773:           if (newTree) {
774:             newTree[i+iextra].nstEvent   = nstPath[depth-1];
775:             newTree[i+iextra].own        = PETSC_FALSE;
776:             newTree[i+iextra].depth      = depth;
777:             PetscMalloc1(depth, &newTree[i+iextra].nstPath);
778:             for (j=0; j<depth; j++) {newTree[i+iextra].nstPath[j] = nstPath[j];}

780:             newTree[i+iextra].dftEvent  = 0;
781:             newTree[i+iextra].dftParent = 0;
782:             newTree[i+iextra].nstParent = 0;
783:           }
784:           iextra++;
785:         }

787:       }
788:     }

790:     /* Determine the size of the complete tree (with own and not-own timers) and allocate the new tree */
791:     totalNTimers = nTimers + iextra;
792:     if (!newTree) {
793:       PetscMalloc1(totalNTimers, &newTree);
794:     }
795:   }
796:   PetscFree(nstPath);
797:   PetscFree(nstMyPath);
798:   PetscFree(tree);
799:   tree = newTree;
800:   newTree = NULL;

802:   /* Set return value and return */
803:   *p_tree    = tree;
804:   *p_nTimers = totalNTimers;
805:   return 0;
806: }

808: /*
809:  * Delete the nested timer tree
810:  */
811: static PetscErrorCode PetscLogNestedTreeDestroy(PetscNestedEventTree *tree, int nTimers)
812: {
813:   int             i;

815:   for (i=0; i<nTimers; i++) {
816:     PetscFree(tree[i].nstPath);
817:   }
818:   PetscFree(tree);
819:   return 0;
820: }

822: /* Print the global performance: max, max/min, average and total of
823:  *      time, objects, flops, flops/sec, memory, MPI messages, MPI message lengths, MPI reductions.
824:  */
825: static PetscErrorCode PetscPrintXMLNestedLinePerfResults(PetscViewer viewer,const char *name,PetscLogDouble value,PetscLogDouble minthreshold,PetscLogDouble maxthreshold,PetscLogDouble minmaxtreshold)
826: {
827:   MPI_Comm       comm;                          /* MPI communicator in reduction */
828:   PetscMPIInt    rank;                          /* rank of this process */
829:   PetscLogDouble val_in[2], max[2], min[2];
830:   PetscLogDouble minvalue, maxvalue, tot;
831:   PetscMPIInt    size;
832:   PetscMPIInt    minLoc, maxLoc;

834:   PetscObjectGetComm((PetscObject)viewer,&comm);
835:   MPI_Comm_size(comm, &size);
836:   MPI_Comm_rank(comm, &rank);
837:   val_in[0] = value;
838:   val_in[1] = (PetscLogDouble) rank;
839:   MPIU_Allreduce(val_in, max,  1, MPIU_2PETSCLOGDOUBLE, MPI_MAXLOC, comm);
840:   MPIU_Allreduce(val_in, min,  1, MPIU_2PETSCLOGDOUBLE, MPI_MINLOC, comm);
841:   maxvalue = max[0];
842:   maxLoc   = (PetscMPIInt) max[1];
843:   minvalue = min[0];
844:   minLoc   = (PetscMPIInt) min[1];
845:   MPIU_Allreduce(&value, &tot, 1, MPIU_PETSCLOGDOUBLE,  MPI_SUM,    comm);

847:   if (maxvalue<maxthreshold && minvalue>=minthreshold) {
848:     /* One call per parent or NO value: don't print */
849:   } else {
850:      PetscViewerXMLStartSection(viewer, name, NULL);
851:      if (maxvalue>minvalue*minmaxtreshold) {
852:        PetscViewerXMLPutDouble(viewer, "avgvalue", NULL, tot/size, "%g");
853:        PetscViewerXMLPutDouble(viewer, "minvalue", NULL, minvalue, "%g");
854:        PetscViewerXMLPutDouble(viewer, "maxvalue", NULL, maxvalue, "%g");
855:        PetscViewerXMLPutInt(   viewer, "minloc"  , NULL, minLoc);
856:        PetscViewerXMLPutInt(   viewer, "maxloc"  , NULL, maxLoc);
857:      } else {
858:        PetscViewerXMLPutDouble(viewer, "value", NULL, tot/size, "%g");
859:      }
860:      PetscViewerXMLEndSection(viewer, name);
861:   }
862:   return 0;
863: }

865: #define N_COMM 8
866: static PetscErrorCode PetscLogNestedTreePrintLine(PetscViewer viewer,PetscEventPerfInfo perfInfo,PetscLogDouble countsPerCall,int parentCount,int depth,const char *name,PetscLogDouble totalTime,PetscBool *isPrinted)
867: {
868:   PetscLogDouble time = perfInfo.time;
869:   PetscLogDouble timeMx;
870:   MPI_Comm       comm;

872:   PetscObjectGetComm((PetscObject)viewer,&comm);
873:   MPIU_Allreduce(&time, &timeMx, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
874:   *isPrinted = ((timeMx/totalTime) >= THRESHOLD) ? PETSC_TRUE : PETSC_FALSE;
875:   if (*isPrinted) {
876:     PetscViewerXMLStartSection(viewer, "event", NULL);
877:     PetscViewerXMLPutString(viewer, "name", NULL, name);
878:     PetscPrintXMLNestedLinePerfResults(viewer, "time", time/totalTime*100.0, 0, 0, 1.02);
879:     PetscPrintXMLNestedLinePerfResults(viewer, "ncalls", parentCount>0 ? countsPerCall : 0, 0.99, 1.01, 1.02);
880:     PetscPrintXMLNestedLinePerfResults(viewer, "mflops", time>=timeMx*0.001 ? 1e-6*perfInfo.flops/time : 0, 0, 0.01, 1.05);
881:     PetscPrintXMLNestedLinePerfResults(viewer, "mbps",time>=timeMx*0.001 ? perfInfo.messageLength/(1024*1024*time) : 0, 0, 0.01, 1.05);
882:     PetscPrintXMLNestedLinePerfResults(viewer, "nreductsps", time>=timeMx*0.001 ? perfInfo.numReductions/time : 0, 0, 0.01, 1.05);
883:   }
884:   return 0;
885: }

887: /* Count the number of times the parent event was called */

889: static int countParents( const PetscNestedEventTree *tree, PetscEventPerfInfo *eventPerfInfo, int i)
890: {
891:   if (tree[i].depth<=1) {
892:     return 1;  /* Main event: only once */
893:   } else if (!tree[i].own) {
894:     return 1;  /* This event didn't happen in this process, but did in another */
895:   } else {
896:     int iParent;
897:     for (iParent=i-1; iParent>=0; iParent--) {
898:       if (tree[iParent].depth == tree[i].depth-1) break;
899:     }
900:     if (tree[iParent].depth != tree[i].depth-1) {
901:       /* *****  Internal error: cannot find parent */
902:       return -2;
903:     } else {
904:       PetscLogEvent dftEvent  = tree[iParent].dftEvent;
905:       return eventPerfInfo[dftEvent].count;
906:     }
907:   }
908: }

910: typedef struct {
911:   int             id;
912:   PetscLogDouble  val;
913: } PetscSortItem;

915: static int compareSortItems(const void *item1_, const void *item2_)
916: {
917:   PetscSortItem *item1 = (PetscSortItem *) item1_;
918:   PetscSortItem *item2 = (PetscSortItem *) item2_;
919:   if (item1->val > item2->val) return -1;
920:   if (item1->val < item2->val) return +1;
921:   return 0;
922: }

924: /*
925:  * Find the number of child events.
926:  */
927: static PetscErrorCode PetscLogNestedTreeGetChildrenCount(const PetscNestedEventTree *tree,int nTimers,int iStart,int depth,int *nChildren)
928: {
929:   int n=0;

931:   for (int i=iStart+1; i<nTimers; i++) {
932:     if (tree[i].depth <= depth) break;
933:     if (tree[i].depth == depth + 1) n++;
934:   }
935:   *nChildren = n;
936:   return 0;
937: }

939: /*
940:  * Initialize child event sort items with ID and times.
941:  */
942: static PetscErrorCode PetscLogNestedTreeSetChildrenSortItems(const PetscViewer viewer,const PetscNestedEventTree *tree,int nTimers,int iStart,int depth,int nChildren,PetscSortItem **children)
943: {
944:   MPI_Comm        comm;
945:   PetscLogDouble  *times, *maxTimes;
946:   PetscStageLog   stageLog;
947:   PetscEventPerfInfo *eventPerfInfo;
948:   const int          stage = MAINSTAGE;

950:   PetscObjectGetComm((PetscObject)viewer,&comm);
951:   PetscLogGetStageLog(&stageLog);
952:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

954:   if (nChildren>0) {
955:     /* Create an array for the id-s and maxTimes of the children,
956:      *  leaving 2 spaces for self-time and other-time */

958:     PetscMalloc1(nChildren+2,children);
959:     nChildren = 0;
960:     for (int i=iStart+1; i<nTimers; i++) {
961:       if (tree[i].depth<=depth) break;
962:       if (tree[i].depth == depth + 1) {
963:         (*children)[nChildren].id  = i;
964:         (*children)[nChildren].val = eventPerfInfo[tree[i].dftEvent].time ;
965:         nChildren++;
966:       }
967:     }

969:     /* Calculate the children's maximum times, to see whether children will be ignored or printed */
970:     PetscMalloc1(nChildren,&times);
971:     for (int i=0; i<nChildren; i++) { times[i] = (*children)[i].val; }

973:     PetscMalloc1(nChildren,&maxTimes);
974:     MPIU_Allreduce(times, maxTimes, nChildren, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
975:     PetscFree(times);

977:     for (int i=0; i<nChildren; i++) { (*children)[i].val = maxTimes[i]; }
978:     PetscFree(maxTimes);
979:   }
980:   return 0;
981: }

983: /*
984:  * Set 'self' and 'other' performance info.
985:  */
986: static PetscErrorCode PetscLogNestedTreeSetSelfOtherPerfInfo(const PetscNestedEventTree *tree,int iStart,PetscLogDouble totalTime,const PetscSortItem *children,int nChildren,
987:                                                              PetscEventPerfInfo *myPerfInfo,PetscEventPerfInfo *selfPerfInfo,PetscEventPerfInfo *otherPerfInfo,int *parentCount,PetscLogDouble *countsPerCall)
988: {
989:   const int          stage = MAINSTAGE;
990:   PetscStageLog      stageLog;
991:   PetscEventPerfInfo *eventPerfInfo;

993:   PetscLogGetStageLog(&stageLog);
994:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;
995:   if (!tree[iStart].own) {
996:   /* Set values for a timer that was not activated in this process
997:    * (but was, in other processes of this run) */
998:     PetscMemzero(myPerfInfo,sizeof(*myPerfInfo));

1000:     *selfPerfInfo  = *myPerfInfo;
1001:     *otherPerfInfo = *myPerfInfo;

1003:     *parentCount   = 1;
1004:     *countsPerCall = 0;
1005:   } else {
1006:   /* Set the values for a timer that was activated in this process */
1007:     PetscLogEvent dftEvent   = tree[iStart].dftEvent;

1009:     *parentCount    = countParents(tree,eventPerfInfo,iStart);
1010:     *myPerfInfo     = eventPerfInfo[dftEvent];
1011:     *countsPerCall  = (PetscLogDouble) myPerfInfo->count / (PetscLogDouble) *parentCount;

1013:     *selfPerfInfo                = *myPerfInfo;
1014:     otherPerfInfo->time          = 0;
1015:     otherPerfInfo->flops         = 0;
1016:     otherPerfInfo->numMessages   = 0;
1017:     otherPerfInfo->messageLength = 0;
1018:     otherPerfInfo->numReductions = 0;

1020:     for (int i=0; i<nChildren; i++) {
1021:       /* For all child counters: subtract the child values from self-timers */

1023:       PetscLogEvent      dftChild      = tree[children[i].id].dftEvent;
1024:       PetscEventPerfInfo childPerfInfo = eventPerfInfo[dftChild];

1026:       selfPerfInfo->time          -= childPerfInfo.time;
1027:       selfPerfInfo->flops         -= childPerfInfo.flops;
1028:       selfPerfInfo->numMessages   -= childPerfInfo.numMessages;
1029:       selfPerfInfo->messageLength -= childPerfInfo.messageLength;
1030:       selfPerfInfo->numReductions -= childPerfInfo.numReductions;

1032:       if ((children[i].val/totalTime) < THRESHOLD) {
1033:         /* Add them to 'other' if the time is ignored in the output */
1034:         otherPerfInfo->time          += childPerfInfo.time;
1035:         otherPerfInfo->flops         += childPerfInfo.flops;
1036:         otherPerfInfo->numMessages   += childPerfInfo.numMessages;
1037:         otherPerfInfo->messageLength += childPerfInfo.messageLength;
1038:         otherPerfInfo->numReductions += childPerfInfo.numReductions;
1039:       }
1040:     }
1041:   }
1042:   return 0;
1043: }

1045: /*
1046:  * Set max times across ranks for 'self' and 'other'.
1047:  */
1048: static PetscErrorCode PetscLogNestedTreeSetMaxTimes(MPI_Comm comm,int nChildren,const PetscEventPerfInfo selfPerfInfo,const PetscEventPerfInfo otherPerfInfo,PetscSortItem *children)
1049: {
1050:   PetscLogDouble times[2], maxTimes[2];

1052:   times[0] = selfPerfInfo.time;
1053:   times[1] = otherPerfInfo.time;

1055:   MPIU_Allreduce(times,maxTimes,2,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1056:   children[nChildren+0].id = -1;
1057:   children[nChildren+0].val = maxTimes[0];
1058:   children[nChildren+1].id = -2;
1059:   children[nChildren+1].val = maxTimes[1];
1060:   return 0;
1061: }

1063: static PetscErrorCode PetscLogNestedTreePrint(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, int iStart, PetscLogDouble totalTime)
1064: {
1065:   int                depth = tree[iStart].depth;
1066:   const char         *name;
1067:   int                parentCount=1, nChildren;
1068:   PetscSortItem      *children;
1069:   PetscStageLog      stageLog;
1070:   PetscEventRegInfo  *eventRegInfo;
1071:   PetscEventPerfInfo myPerfInfo={0},selfPerfInfo={0},otherPerfInfo={0};
1072:   PetscLogDouble     countsPerCall=0;
1073:   PetscBool          wasPrinted;
1074:   PetscBool          childWasPrinted;
1075:   MPI_Comm           comm;

1077:   /* Look up the name of the event and its PerfInfo */
1078:   PetscLogGetStageLog(&stageLog);
1079:   eventRegInfo  = stageLog->eventLog->eventInfo;
1080:   name = eventRegInfo[(PetscLogEvent)tree[iStart].nstEvent].name;
1081:   PetscObjectGetComm((PetscObject)viewer,&comm);

1083:   PetscLogNestedTreeGetChildrenCount(tree,nTimers,iStart,depth,&nChildren);
1084:   PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,iStart,depth,nChildren,&children);
1085:   PetscLogNestedTreeSetSelfOtherPerfInfo(tree,iStart,totalTime,children,nChildren,&myPerfInfo,&selfPerfInfo,&otherPerfInfo,&parentCount,&countsPerCall);

1087:   /* Main output for this timer */
1088:   PetscLogNestedTreePrintLine(viewer, myPerfInfo, countsPerCall, parentCount, depth, name, totalTime, &wasPrinted);

1090:   /* Now print the lines for the children */
1091:   if (nChildren > 0) {
1092:     int            i;

1094:     /* Calculate max-times for 'self' and 'other' */
1095:     PetscLogNestedTreeSetMaxTimes(comm,nChildren,selfPerfInfo,otherPerfInfo,children);

1097:     /* Now sort the children (including 'self' and 'other') on total time */
1098:     qsort(children, nChildren+2, sizeof(PetscSortItem), compareSortItems);

1100:     /* Print (or ignore) the children in ascending order of total time */
1101:     PetscViewerXMLStartSection(viewer,"events", NULL);
1102:     for (i=0; i<nChildren+2; i++) {
1103:       if ((children[i].val/totalTime) < THRESHOLD) {
1104:         /* ignored: no output */
1105:       } else if (children[i].id==-1) {
1106:         PetscLogNestedTreePrintLine(viewer, selfPerfInfo, 1, parentCount, depth+1, "self", totalTime, &childWasPrinted);
1107:         if (childWasPrinted) {
1108:           PetscViewerXMLEndSection(viewer,"event");
1109:         }
1110:       } else if (children[i].id==-2) {
1111:         size_t  len;
1112:         char    *otherName;

1114:         PetscStrlen(name,&len);
1115:         PetscMalloc1(len+16,&otherName);
1116:         PetscSNPrintf(otherName,len+16,"%s: other-timed",name);
1117:         PetscLogNestedTreePrintLine(viewer, otherPerfInfo, 1, 1, depth+1, otherName, totalTime, &childWasPrinted);
1118:         PetscFree(otherName);
1119:         if (childWasPrinted) {
1120:           PetscViewerXMLEndSection(viewer,"event");
1121:         }
1122:       } else {
1123:         /* Print the child with a recursive call to this function */
1124:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1125:       }
1126:     }
1127:     PetscViewerXMLEndSection(viewer,"events");
1128:     PetscFree(children);
1129:   }

1131:   if (wasPrinted) {
1132:     PetscViewerXMLEndSection(viewer, "event");
1133:   }
1134:   return 0;
1135: }

1137: static PetscErrorCode PetscLogNestedTreePrintTop(PetscViewer viewer, PetscNestedEventTree *tree, int nTimers, PetscLogDouble totalTime)
1138: {
1139:   int                i, nChildren;
1140:   PetscSortItem      *children;
1141:   MPI_Comm           comm;

1143:   PetscObjectGetComm((PetscObject)viewer,&comm);

1145:   PetscLogNestedTreeGetChildrenCount(tree,nTimers,-1,0,&nChildren);
1146:   PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,-1,0,nChildren,&children);

1148:   if (nChildren>0) {
1149:     /* Now sort the children on total time */
1150:     qsort(children, nChildren, sizeof(PetscSortItem), compareSortItems);
1151:     /* Print (or ignore) the children in ascending order of total time */
1152:     PetscViewerXMLStartSection(viewer, "timertree", "Timings tree");
1153:     PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");
1154:     PetscViewerXMLPutDouble(viewer, "timethreshold", NULL, thresholdTime, "%f");

1156:     for (i=0; i<nChildren; i++) {
1157:       if ((children[i].val/totalTime) < THRESHOLD) {
1158:         /* ignored: no output */
1159:       } else {
1160:         /* Print the child with a recursive call to this function */
1161:         PetscLogNestedTreePrint(viewer, tree, nTimers, children[i].id, totalTime);
1162:       }
1163:     }
1164:     PetscViewerXMLEndSection(viewer, "timertree");
1165:     PetscFree(children);
1166:   }
1167:   return 0;
1168: }

1170: typedef struct {
1171:   char           *name;
1172:   PetscLogDouble time;
1173:   PetscLogDouble flops;
1174:   PetscLogDouble numMessages;
1175:   PetscLogDouble messageLength;
1176:   PetscLogDouble numReductions;
1177: } PetscSelfTimer;

1179: static PetscErrorCode PetscCalcSelfTime(PetscViewer viewer, PetscSelfTimer **p_self, int *p_nstMax)
1180: {
1181:   const int          stage = MAINSTAGE;
1182:   PetscStageLog      stageLog;
1183:   PetscEventRegInfo  *eventRegInfo;
1184:   PetscEventPerfInfo *eventPerfInfo;
1185:   PetscSelfTimer     *selftimes;
1186:   PetscSelfTimer     *totaltimes;
1187:   NestedEventId      *nstEvents;
1188:   int                i, j, maxDefaultTimer;
1189:   NestedEventId      nst;
1190:   PetscLogEvent      dft;
1191:   int                nstMax, nstMax_local;
1192:   MPI_Comm           comm;

1194:   PetscObjectGetComm((PetscObject)viewer,&comm);
1195:   PetscLogGetStageLog(&stageLog);
1196:   eventRegInfo  = stageLog->eventLog->eventInfo;
1197:   eventPerfInfo = stageLog->stageInfo[stage].eventLog->eventInfo;

1199:   /* For each default timer, calculate the (one) nested timer that it corresponds to. */
1200:   maxDefaultTimer =0;
1201:   for (i=0; i<nNestedEvents; i++) {
1202:     int           nParents   = nestedEvents[i].nParents;
1203:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1204:     for (j=0; j<nParents; j++) maxDefaultTimer = PetscMax(dftEvents[j],maxDefaultTimer);
1205:   }
1206:   PetscMalloc1(maxDefaultTimer+1,&nstEvents);
1207:   for (dft=0; dft<maxDefaultTimer; dft++) {nstEvents[dft] = 0;}
1208:   for (i=0; i<nNestedEvents; i++) {
1209:     int           nParents   = nestedEvents[i].nParents;
1210:     NestedEventId nstEvent   = nestedEvents[i].nstEvent;
1211:     PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1212:     for (j=0; j<nParents; j++) nstEvents[dftEvents[j]] = nstEvent;
1213:   }

1215:   /* Calculate largest nested event-ID */
1216:   nstMax_local = 0;
1217:   for (i=0; i<nNestedEvents; i++) nstMax_local = PetscMax(nestedEvents[i].nstEvent,nstMax_local);
1218:   MPIU_Allreduce(&nstMax_local, &nstMax, 1, MPI_INT, MPI_MAX, comm);

1220:   /* Initialize all total-times with zero */
1221:   PetscMalloc1(nstMax+1,&selftimes);
1222:   PetscMalloc1(nstMax+1,&totaltimes);
1223:   for (nst=0; nst<=nstMax; nst++) {
1224:     totaltimes[nst].time          = 0;
1225:     totaltimes[nst].flops         = 0;
1226:     totaltimes[nst].numMessages   = 0;
1227:     totaltimes[nst].messageLength = 0;
1228:     totaltimes[nst].numReductions = 0;
1229:     totaltimes[nst].name          = NULL;
1230:   }

1232:   /* Calculate total-times */
1233:   for (i=0; i<nNestedEvents; i++) {
1234:     const int            nParents  = nestedEvents[i].nParents;
1235:     const NestedEventId  nstEvent  = nestedEvents[i].nstEvent;
1236:     const PetscLogEvent *dftEvents = nestedEvents[i].dftEvents;
1237:     for (j=0; j<nParents; j++) {
1238:       const PetscLogEvent dftEvent = dftEvents[j];
1239:       totaltimes[nstEvent].time          += eventPerfInfo[dftEvent].time;
1240:       totaltimes[nstEvent].flops         += eventPerfInfo[dftEvent].flops;
1241:       totaltimes[nstEvent].numMessages   += eventPerfInfo[dftEvent].numMessages;
1242:       totaltimes[nstEvent].messageLength += eventPerfInfo[dftEvent].messageLength;
1243:       totaltimes[nstEvent].numReductions += eventPerfInfo[dftEvent].numReductions;
1244:     }
1245:     totaltimes[nstEvent].name = eventRegInfo[(PetscLogEvent)nstEvent].name;
1246:   }

1248:   /* Initialize: self-times := totaltimes */
1249:   for (nst=0; nst<=nstMax; nst++) { selftimes[nst] = totaltimes[nst]; }

1251:   /* Subtract timed subprocesses from self-times */
1252:   for (i=0; i<nNestedEvents; i++) {
1253:     const int           nParents          = nestedEvents[i].nParents;
1254:     const PetscLogEvent *dftEvents        = nestedEvents[i].dftEvents;
1255:     const NestedEventId *dftParentsSorted = nestedEvents[i].dftParentsSorted;
1256:     for (j=0; j<nParents; j++) {
1257:       if (dftParentsSorted[j] != DFT_ID_AWAKE) {
1258:         const PetscLogEvent dftEvent  = dftEvents[j];
1259:         const NestedEventId nstParent = nstEvents[dftParentsSorted[j]];
1260:         selftimes[nstParent].time          -= eventPerfInfo[dftEvent].time;
1261:         selftimes[nstParent].flops         -= eventPerfInfo[dftEvent].flops;
1262:         selftimes[nstParent].numMessages   -= eventPerfInfo[dftEvent].numMessages;
1263:         selftimes[nstParent].messageLength -= eventPerfInfo[dftEvent].messageLength;
1264:         selftimes[nstParent].numReductions -= eventPerfInfo[dftEvent].numReductions;
1265:       }
1266:     }
1267:   }

1269:   PetscFree(nstEvents);
1270:   PetscFree(totaltimes);

1272:   /* Set outputs */
1273:   *p_self  = selftimes;
1274:   *p_nstMax = nstMax;
1275:   return 0;
1276: }

1278: static PetscErrorCode PetscPrintSelfTime(PetscViewer viewer, const PetscSelfTimer *selftimes, int nstMax, PetscLogDouble totalTime)
1279: {
1280:   int                i;
1281:   NestedEventId      nst;
1282:   PetscSortItem      *sortSelfTimes;
1283:   PetscLogDouble     *times, *maxTimes;
1284:   PetscStageLog      stageLog;
1285:   PetscEventRegInfo  *eventRegInfo;
1286:   const int          dum_depth = 1, dum_count=1, dum_parentcount=1;
1287:   PetscBool          wasPrinted;
1288:   MPI_Comm           comm;

1290:   PetscObjectGetComm((PetscObject)viewer,&comm);
1291:   PetscLogGetStageLog(&stageLog);
1292:   eventRegInfo  = stageLog->eventLog->eventInfo;

1294:   PetscMalloc1(nstMax+1,&times);
1295:   PetscMalloc1(nstMax+1,&maxTimes);
1296:   for (nst=0; nst<=nstMax; nst++) { times[nst] = selftimes[nst].time;}
1297:   MPIU_Allreduce(times, maxTimes, nstMax+1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);
1298:   PetscFree(times);

1300:   PetscMalloc1(nstMax+1,&sortSelfTimes);

1302:   /* Sort the self-timers on basis of the largest time needed */
1303:   for (nst=0; nst<=nstMax; nst++) {
1304:     sortSelfTimes[nst].id  = nst;
1305:     sortSelfTimes[nst].val = maxTimes[nst];
1306:   }
1307:   PetscFree(maxTimes);
1308:   qsort(sortSelfTimes, nstMax+1, sizeof(PetscSortItem), compareSortItems);

1310:   PetscViewerXMLStartSection(viewer, "selftimertable", "Self-timings");
1311:   PetscViewerXMLPutDouble(viewer, "totaltime", NULL, totalTime, "%f");

1313:   for (i=0; i<=nstMax; i++) {
1314:     if ((sortSelfTimes[i].val/totalTime) >= THRESHOLD) {
1315:       NestedEventId      nstEvent = sortSelfTimes[i].id;
1316:       const char         *name    = eventRegInfo[(PetscLogEvent)nstEvent].name;
1317:       PetscEventPerfInfo selfPerfInfo;

1319:       selfPerfInfo.time          = selftimes[nstEvent].time ;
1320:       selfPerfInfo.flops         = selftimes[nstEvent].flops;
1321:       selfPerfInfo.numMessages   = selftimes[nstEvent].numMessages;
1322:       selfPerfInfo.messageLength = selftimes[nstEvent].messageLength;
1323:       selfPerfInfo.numReductions = selftimes[nstEvent].numReductions;

1325:       PetscLogNestedTreePrintLine(viewer, selfPerfInfo, dum_count, dum_parentcount, dum_depth, name, totalTime, &wasPrinted);
1326:       if (wasPrinted) {
1327:         PetscViewerXMLEndSection(viewer, "event");
1328:       }
1329:     }
1330:   }
1331:   PetscViewerXMLEndSection(viewer, "selftimertable");
1332:   PetscFree(sortSelfTimes);
1333:   return 0;
1334: }

1336: PetscErrorCode PetscLogView_Nested(PetscViewer viewer)
1337: {
1338:   PetscLogDouble       locTotalTime, globTotalTime;
1339:   PetscNestedEventTree *tree = NULL;
1340:   PetscSelfTimer       *selftimers = NULL;
1341:   int                  nTimers = 0, nstMax = 0;
1342:   MPI_Comm             comm;

1344:   PetscObjectGetComm((PetscObject)viewer,&comm);
1345:   PetscViewerInitASCII_XML(viewer);
1346:   PetscViewerASCIIPrintf(viewer, "<!-- PETSc Performance Summary: -->\n");
1347:   PetscViewerXMLStartSection(viewer, "petscroot", NULL);

1349:   /* Get the total elapsed time, local and global maximum */
1350:   PetscTime(&locTotalTime);  locTotalTime -= petsc_BaseTime;
1351:   MPIU_Allreduce(&locTotalTime, &globTotalTime, 1, MPIU_PETSCLOGDOUBLE, MPI_MAX, comm);

1353:   /* Print global information about this run */
1354:   PetscPrintExeSpecs(viewer);
1355:   PetscPrintGlobalPerformance(viewer, locTotalTime);
1356:   /* Collect nested timer tree info from all processes */
1357:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1358:   PetscLogNestedTreePrintTop(viewer, tree, nTimers, globTotalTime);
1359:   PetscLogNestedTreeDestroy(tree, nTimers);

1361:   /* Calculate self-time for all (not-nested) events */
1362:   PetscCalcSelfTime(viewer, &selftimers, &nstMax);
1363:   PetscPrintSelfTime(viewer, selftimers, nstMax, globTotalTime);
1364:   PetscFree(selftimers);

1366:   PetscViewerXMLEndSection(viewer, "petscroot");
1367:   PetscViewerFinalASCII_XML(viewer);
1368:   return 0;
1369: }

1371: /*
1372:  * Get the name of a nested event.
1373:  */
1374: static PetscErrorCode PetscGetNestedEventName(const PetscNestedEventTree *tree,int id,char **name)
1375: {
1376:   PetscStageLog   stageLog;

1378:   PetscLogGetStageLog(&stageLog);
1379:   *name = stageLog->eventLog->eventInfo[(PetscLogEvent)tree[id].nstEvent].name;
1380:   return 0;
1381: }

1383: /*
1384:  * Get the total time elapsed.
1385:  */
1386: static PetscErrorCode PetscGetTotalTime(const PetscViewer viewer,PetscLogDouble *totalTime)
1387: {
1388:   PetscLogDouble  locTotalTime;
1389:   MPI_Comm        comm;

1391:   PetscObjectGetComm((PetscObject)viewer,&comm);
1392:   PetscTime(&locTotalTime);
1393:   locTotalTime -= petsc_BaseTime;
1394:   MPIU_Allreduce(&locTotalTime,totalTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1395:   return 0;
1396: }

1398: /*
1399:  * Write a line to the flame graph output and then recurse into child events.
1400:  */
1401: static PetscErrorCode PetscLogNestedTreePrintFlamegraph(PetscViewer viewer,PetscNestedEventTree *tree,int nTimers,int iStart,PetscLogDouble totalTime,PetscIntStack eventStack)
1402: {
1403:   int                 depth=tree[iStart].depth,parentCount=1,i,nChildren;
1404:   char                *name=NULL;
1405:   PetscEventPerfInfo  myPerfInfo={0},selfPerfInfo={0},otherPerfInfo={0};
1406:   PetscLogDouble      countsPerCall=0,locTime,globTime;
1407:   PetscSortItem       *children;
1408:   PetscStageLog       stageLog;
1409:   MPI_Comm            comm;

1411:   PetscLogGetStageLog(&stageLog);
1412:   PetscObjectGetComm((PetscObject)viewer,&comm);

1414:   /* Determine information about the child events as well as 'self' and 'other' */
1415:   PetscLogNestedTreeGetChildrenCount(tree,nTimers,iStart,depth,&nChildren);
1416:   PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,iStart,depth,nChildren,&children);
1417:   PetscLogNestedTreeSetSelfOtherPerfInfo(tree,iStart,totalTime,children,nChildren,&myPerfInfo,&selfPerfInfo,&otherPerfInfo,&parentCount,&countsPerCall);

1419:   /* Write line to the file. The time shown is 'self' + 'other' because each entry in the output
1420:    * is the total time spent in the event minus the amount spent in child events. */
1421:   locTime = selfPerfInfo.time + otherPerfInfo.time;
1422:   MPIU_Allreduce(&locTime,&globTime,1,MPIU_PETSCLOGDOUBLE,MPI_MAX,comm);
1423:   if (globTime/totalTime > THRESHOLD && tree[iStart].own) {
1424:     /* Iterate over parent events in the stack and write them */
1425:     for (i=0; i<=eventStack->top; i++) {
1426:       PetscGetNestedEventName(tree,eventStack->stack[i],&name);
1427:       PetscViewerASCIIPrintf(viewer,"%s;",name);
1428:     }
1429:     PetscGetNestedEventName(tree,iStart,&name);
1430:     /* The output is given as an integer in microseconds because otherwise the file cannot be read
1431:      * by apps such as speedscope (https://speedscope.app/). */
1432:     PetscViewerASCIIPrintf(viewer,"%s %" PetscInt64_FMT "\n",name,(PetscInt64)(globTime*1e6));
1433:   }

1435:   /* Add the current event to the parent stack and write the child events */
1436:   PetscIntStackPush(eventStack, iStart);
1437:   for (i=0; i<nChildren; i++) {
1438:     PetscLogNestedTreePrintFlamegraph(viewer,tree,nTimers,children[i].id,totalTime,eventStack);
1439:   }
1440:   /* Pop the top item from the stack and immediately discard it */
1441:   {
1442:     int tmp;
1443:     PetscIntStackPop(eventStack, &tmp);
1444:   }
1445:   return 0;
1446: }

1448: /*
1449:  * Print nested logging information to a file suitable for reading into a Flame Graph.
1450:  *
1451:  * The format consists of a semicolon-separated list of events and the event duration in microseconds (which must be an integer).
1452:  * An example output would look like:
1453:  *   MatAssemblyBegin 1
1454:  *   MatAssemblyEnd 10
1455:  *   MatView 302
1456:  *   KSPSetUp 98
1457:  *   KSPSetUp;VecSet 5
1458:  *   KSPSolve 150
1459:  *
1460:  * This option may be requested from the command line by passing in the flag `-log_view :<somefile>.txt:ascii_flamegraph`.
1461:  */
1462: PetscErrorCode PetscLogView_Flamegraph(PetscViewer viewer)
1463: {
1464:   int                   nTimers=0,i,nChildren;
1465:   PetscIntStack         eventStack;
1466:   PetscLogDouble        totalTime;
1467:   PetscNestedEventTree  *tree=NULL;
1468:   PetscSortItem         *children;

1470:   PetscGetTotalTime(viewer,&totalTime);
1471:   PetscLogNestedTreeCreate(viewer, &tree, &nTimers);
1472:   /* We use an integer stack to keep track of parent event IDs */
1473:   PetscIntStackCreate(&eventStack);

1475:   /* Initialize the child events and write them recursively */
1476:   PetscLogNestedTreeGetChildrenCount(tree,nTimers,-1,0,&nChildren);
1477:   PetscLogNestedTreeSetChildrenSortItems(viewer,tree,nTimers,-1,0,nChildren,&children);
1478:   for (i=0; i<nChildren; i++) {
1479:     PetscLogNestedTreePrintFlamegraph(viewer,tree,nTimers,children[i].id,totalTime,eventStack);
1480:   }

1482:   PetscLogNestedTreeDestroy(tree, nTimers);
1483:   PetscIntStackDestroy(eventStack);
1484:   return 0;
1485: }

1487: PETSC_EXTERN PetscErrorCode PetscASend(int count, int datatype)
1488: {
1489: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1490: #endif

1492:   petsc_send_ct++;
1493: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1494:   PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_send_len);
1495: #endif
1496:   return 0;
1497: }

1499: PETSC_EXTERN PetscErrorCode PetscARecv(int count, int datatype)
1500: {
1501: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1502: #endif

1504:   petsc_recv_ct++;
1505: #if !defined(MPIUNI_H) && !defined(PETSC_HAVE_BROKEN_RECURSIVE_MACRO) && !defined(PETSC_HAVE_MPI_MISSING_TYPESIZE)
1506:   PetscMPITypeSize(count,MPI_Type_f2c((MPI_Fint) datatype),&petsc_recv_len);
1507: #endif
1508:   return 0;
1509: }

1511: PETSC_EXTERN PetscErrorCode PetscAReduce()
1512: {
1513:   petsc_allreduce_ct++;
1514:   return 0;
1515: }

1517: #endif