public inbox for [email protected]  
help / color / mirror / Atom feed
From: Ilmar Yunusov <[email protected]>
To: [email protected]
Cc: Ilmar Yunusov <[email protected]>
Subject: [RFC PATCH v0 3/7] Attribute EXPLAIN WAITS to plan nodes
Date: Sat,  9 May 2026 04:22:33 +0500
Message-ID: <cb26aa486d42f227209c27df4aef4b07f13e334a.1778280923.git.tanswis42@gmail.com> (raw)
In-Reply-To: <[email protected]>
References: <[email protected]>

---
 src/backend/commands/explain.c             |   6 +-
 src/backend/executor/execAsync.c           |  22 ++++
 src/backend/executor/execParallel.c        | 131 ++++++++++++++++++---
 src/backend/executor/execProcnode.c        |  31 ++++-
 src/backend/executor/instrument.c          |   7 ++
 src/backend/executor/nodeBitmapAnd.c       |   7 ++
 src/backend/executor/nodeBitmapIndexscan.c |   7 ++
 src/backend/executor/nodeBitmapOr.c        |   7 ++
 src/backend/executor/nodeHash.c            |   7 ++
 src/backend/utils/activity/wait_event.c    |  49 +++++++-
 src/include/executor/instrument.h          |   1 +
 src/include/nodes/execnodes.h              |   1 +
 src/include/utils/wait_event.h             |   4 +
 src/test/regress/expected/explain.out      |  52 +++++++-
 src/test/regress/sql/explain.sql           |  29 +++++
 15 files changed, 333 insertions(+), 28 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 9d7372f5415..0575ae8368b 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -530,6 +530,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 		instrument_option |= INSTRUMENT_WAL;
 	if (es->io)
 		instrument_option |= INSTRUMENT_IO;
+	if (es->waits)
+		instrument_option |= INSTRUMENT_WAITS;
 
 	/*
 	 * We always collect timing for the entire statement, even when node-level
@@ -2332,6 +2334,8 @@ ExplainNode(PlanState *planstate, List *ancestors,
 		show_buffer_usage(es, &planstate->instrument->instr.bufusage);
 	if (es->wal && planstate->instrument)
 		show_wal_usage(es, &planstate->instrument->instr.walusage);
+	if (es->waits)
+		show_wait_event_usage(es, planstate->wait_event_usage);
 
 	/* Prepare per-worker buffer/WAL usage */
 	if (es->workers_state && (es->buffers || es->wal) && es->verbose)
@@ -4559,7 +4563,7 @@ show_wait_event_usage(ExplainState *es, const WaitEventUsage *usage)
 	if (usage == NULL)
 		return;
 
-	if (es->format == EXPLAIN_FORMAT_TEXT && usage->nentries == 0)
+	if (usage->nentries == 0)
 		return;
 
 	if (usage->nentries > 0)
diff --git a/src/backend/executor/execAsync.c b/src/backend/executor/execAsync.c
index cf7ddbb01f4..e73ea55cf92 100644
--- a/src/backend/executor/execAsync.c
+++ b/src/backend/executor/execAsync.c
@@ -19,6 +19,7 @@
 #include "executor/instrument.h"
 #include "executor/nodeAppend.h"
 #include "executor/nodeForeignscan.h"
+#include "utils/wait_event.h"
 
 /*
  * Asynchronously request a tuple from a designed async-capable node.
@@ -26,12 +27,17 @@
 void
 ExecAsyncRequest(AsyncRequest *areq)
 {
+	WaitEventUsage *previous_wait_event_usage = NULL;
+
 	if (areq->requestee->chgParam != NULL)	/* something changed? */
 		ExecReScan(areq->requestee);	/* let ReScan handle this */
 
 	/* must provide our own instrumentation support */
 	if (areq->requestee->instrument)
 		InstrStartNode(areq->requestee->instrument);
+	if (areq->requestee->wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(areq->requestee->wait_event_usage);
 
 	switch (nodeTag(areq->requestee))
 	{
@@ -47,6 +53,8 @@ ExecAsyncRequest(AsyncRequest *areq)
 	ExecAsyncResponse(areq);
 
 	/* must provide our own instrumentation support */
+	if (areq->requestee->wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	if (areq->requestee->instrument)
 		InstrStopNode(areq->requestee->instrument,
 					  TupIsNull(areq->result) ? 0.0 : 1.0);
@@ -62,9 +70,14 @@ ExecAsyncRequest(AsyncRequest *areq)
 void
 ExecAsyncConfigureWait(AsyncRequest *areq)
 {
+	WaitEventUsage *previous_wait_event_usage = NULL;
+
 	/* must provide our own instrumentation support */
 	if (areq->requestee->instrument)
 		InstrStartNode(areq->requestee->instrument);
+	if (areq->requestee->wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(areq->requestee->wait_event_usage);
 
 	switch (nodeTag(areq->requestee))
 	{
@@ -78,6 +91,8 @@ ExecAsyncConfigureWait(AsyncRequest *areq)
 	}
 
 	/* must provide our own instrumentation support */
+	if (areq->requestee->wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	if (areq->requestee->instrument)
 		InstrStopNode(areq->requestee->instrument, 0.0);
 }
@@ -88,9 +103,14 @@ ExecAsyncConfigureWait(AsyncRequest *areq)
 void
 ExecAsyncNotify(AsyncRequest *areq)
 {
+	WaitEventUsage *previous_wait_event_usage = NULL;
+
 	/* must provide our own instrumentation support */
 	if (areq->requestee->instrument)
 		InstrStartNode(areq->requestee->instrument);
+	if (areq->requestee->wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(areq->requestee->wait_event_usage);
 
 	switch (nodeTag(areq->requestee))
 	{
@@ -106,6 +126,8 @@ ExecAsyncNotify(AsyncRequest *areq)
 	ExecAsyncResponse(areq);
 
 	/* must provide our own instrumentation support */
+	if (areq->requestee->wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	if (areq->requestee->instrument)
 		InstrStopNode(areq->requestee->instrument,
 					  TupIsNull(areq->result) ? 0.0 : 1.0);
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 8213565a708..f77f539dd27 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -83,6 +83,12 @@ typedef struct FixedParallelExecutorState
 	int			jit_flags;
 } FixedParallelExecutorState;
 
+typedef struct SharedWaitEventUsageWorker
+{
+	int			nentries;
+	dsa_pointer entries;
+} SharedWaitEventUsageWorker;
+
 /*
  * DSM structure for accumulating per-PlanState instrumentation.
  *
@@ -92,6 +98,10 @@ typedef struct FixedParallelExecutorState
  * of the first NodeInstrumentation object.  This will depend on the length of
  * the plan_node_id array.
  *
+ * wait_event_usage_offset: Offset, relative to the start of this structure,
+ * of the first SharedWaitEventUsageWorker object, or 0 if wait event usage is
+ * not being collected.
+ *
  * num_workers: Number of workers.
  *
  * num_plan_nodes: Number of plan nodes.
@@ -103,6 +113,7 @@ struct SharedExecutorInstrumentation
 {
 	int			instrument_options;
 	int			instrument_offset;
+	int			wait_event_usage_offset;
 	int			num_workers;
 	int			num_plan_nodes;
 	int			plan_node_id[FLEXIBLE_ARRAY_MEMBER];
@@ -110,17 +121,17 @@ struct SharedExecutorInstrumentation
 	/*
 	 * Array of num_plan_nodes * num_workers NodeInstrumentation objects
 	 * follows.
+	 *
+	 * If wait_event_usage_offset is non-zero, an array of num_plan_nodes *
+	 * num_workers SharedWaitEventUsageWorker objects follows.
 	 */
 };
 #define GetInstrumentationArray(sei) \
 	(StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
 	 (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset))
-
-typedef struct SharedWaitEventUsageWorker
-{
-	int			nentries;
-	dsa_pointer entries;
-} SharedWaitEventUsageWorker;
+#define GetInstrumentationWaitEventUsageArray(sei) \
+	(StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
+	 (SharedWaitEventUsageWorker *) (((char *) sei) + sei->wait_event_usage_offset))
 
 struct SharedWaitEventUsage
 {
@@ -143,6 +154,12 @@ typedef struct ExecParallelInitializeDSMContext
 	int			nnodes;
 } ExecParallelInitializeDSMContext;
 
+typedef struct ExecParallelRetrieveInstrumentationContext
+{
+	SharedExecutorInstrumentation *instrumentation;
+	dsa_area   *area;
+} ExecParallelRetrieveInstrumentationContext;
+
 /* Helper functions that run in the parallel leader. */
 static char *ExecSerializePlan(Plan *plan, EState *estate);
 static bool ExecParallelEstimate(PlanState *planstate,
@@ -154,8 +171,11 @@ static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt,
 static bool ExecParallelReInitializeDSM(PlanState *planstate,
 										ParallelContext *pcxt);
 static bool ExecParallelRetrieveInstrumentation(PlanState *planstate,
-												SharedExecutorInstrumentation *instrumentation);
+												ExecParallelRetrieveInstrumentationContext *r);
 static void ExecParallelRetrieveWaitEventUsage(ParallelExecutorInfo *pei);
+static void ExecParallelReportWaitEventUsageWorker(SharedWaitEventUsageWorker *worker,
+												   dsa_area *area,
+												   const WaitEventUsage *usage);
 static void ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
 											 dsa_area *area,
 											 const WaitEventUsage *usage);
@@ -691,6 +711,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
 	int			instrumentation_len = 0;
 	int			jit_instrumentation_len = 0;
 	int			instrument_offset = 0;
+	int			wait_event_usage_offset = 0;
 	Size		dsa_minsize = dsa_minimum_size();
 	char	   *query_string;
 	int			query_len;
@@ -798,6 +819,14 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
 		instrumentation_len +=
 			mul_size(sizeof(NodeInstrumentation),
 					 mul_size(e.nnodes, nworkers));
+		if (estate->es_instrument & INSTRUMENT_WAITS)
+		{
+			instrumentation_len = MAXALIGN(instrumentation_len);
+			wait_event_usage_offset = instrumentation_len;
+			instrumentation_len +=
+				mul_size(sizeof(SharedWaitEventUsageWorker),
+						 mul_size(e.nnodes, nworkers));
+		}
 		shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
 		shm_toc_estimate_keys(&pcxt->estimator, 1);
 
@@ -903,11 +932,23 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
 		instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
 		instrumentation->instrument_options = estate->es_instrument;
 		instrumentation->instrument_offset = instrument_offset;
+		instrumentation->wait_event_usage_offset = wait_event_usage_offset;
 		instrumentation->num_workers = nworkers;
 		instrumentation->num_plan_nodes = e.nnodes;
 		instrument = GetInstrumentationArray(instrumentation);
 		for (i = 0; i < nworkers * e.nnodes; ++i)
 			InstrInitNode(&instrument[i], estate->es_instrument, false);
+		if (wait_event_usage_offset != 0)
+		{
+			SharedWaitEventUsageWorker *worker_usage;
+
+			worker_usage = GetInstrumentationWaitEventUsageArray(instrumentation);
+			for (i = 0; i < nworkers * e.nnodes; ++i)
+			{
+				worker_usage[i].nentries = 0;
+				worker_usage[i].entries = InvalidDsaPointer;
+			}
+		}
 		shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
 					   instrumentation);
 		pei->instrumentation = instrumentation;
@@ -1137,9 +1178,11 @@ ExecParallelReInitializeDSM(PlanState *planstate,
  */
 static bool
 ExecParallelRetrieveInstrumentation(PlanState *planstate,
-									SharedExecutorInstrumentation *instrumentation)
+									ExecParallelRetrieveInstrumentationContext *r)
 {
+	SharedExecutorInstrumentation *instrumentation = r->instrumentation;
 	NodeInstrumentation *instrument;
+	SharedWaitEventUsageWorker *wait_event_usage = NULL;
 	int			i;
 	int			n;
 	int			ibytes;
@@ -1159,6 +1202,30 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 	for (n = 0; n < instrumentation->num_workers; ++n)
 		InstrAggNode(planstate->instrument, &instrument[n]);
 
+	/* Accumulate the wait event usage from all workers. */
+	if (instrumentation->wait_event_usage_offset != 0 &&
+		planstate->wait_event_usage != NULL)
+	{
+		wait_event_usage = GetInstrumentationWaitEventUsageArray(instrumentation);
+		wait_event_usage += i * instrumentation->num_workers;
+		for (n = 0; n < instrumentation->num_workers; ++n)
+		{
+			SharedWaitEventUsageWorker *worker = &wait_event_usage[n];
+			WaitEventUsageEntry *entries;
+
+			if (worker->nentries <= 0 || !DsaPointerIsValid(worker->entries))
+				continue;
+
+			entries = dsa_get_address(r->area, worker->entries);
+			pgstat_accumulate_wait_event_usage(planstate->wait_event_usage,
+											   entries,
+											   worker->nentries);
+			dsa_free(r->area, worker->entries);
+			worker->nentries = 0;
+			worker->entries = InvalidDsaPointer;
+		}
+	}
+
 	/*
 	 * Also store the per-worker detail.
 	 *
@@ -1216,7 +1283,7 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 	}
 
 	return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation,
-								 instrumentation);
+								 r);
 }
 
 /*
@@ -1290,25 +1357,21 @@ ExecParallelRetrieveWaitEventUsage(ParallelExecutorInfo *pei)
 }
 
 static void
-ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
-								 dsa_area *area,
-								 const WaitEventUsage *usage)
+ExecParallelReportWaitEventUsageWorker(SharedWaitEventUsageWorker *worker,
+									   dsa_area *area,
+									   const WaitEventUsage *usage)
 {
-	SharedWaitEventUsageWorker *worker;
 	WaitEventUsageEntry *entries;
 	dsa_pointer entries_dsa;
 	Size		entries_size;
 
-	Assert(shared != NULL);
+	Assert(worker != NULL);
 	Assert(area != NULL);
 	Assert(usage != NULL);
-	Assert(IsParallelWorker());
-	Assert(ParallelWorkerNumber < shared->num_workers);
 
 	if (usage->nentries <= 0)
 		return;
 
-	worker = &shared->worker_usage[ParallelWorkerNumber];
 	entries_size = mul_size(sizeof(WaitEventUsageEntry), usage->nentries);
 	entries_dsa = dsa_allocate(area, entries_size);
 	entries = dsa_get_address(area, entries_dsa);
@@ -1320,6 +1383,20 @@ ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
 	worker->entries = entries_dsa;
 }
 
+static void
+ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
+								 dsa_area *area,
+								 const WaitEventUsage *usage)
+{
+	Assert(shared != NULL);
+	Assert(IsParallelWorker());
+	Assert(ParallelWorkerNumber < shared->num_workers);
+
+	ExecParallelReportWaitEventUsageWorker(&shared->worker_usage[ParallelWorkerNumber],
+										   area,
+										   usage);
+}
+
 /*
  * Finish parallel execution.  We wait for parallel workers to finish, and
  * accumulate their buffer/WAL usage.
@@ -1385,8 +1462,13 @@ ExecParallelCleanup(ParallelExecutorInfo *pei)
 {
 	/* Accumulate instrumentation, if any. */
 	if (pei->instrumentation)
-		ExecParallelRetrieveInstrumentation(pei->planstate,
-											pei->instrumentation);
+	{
+		ExecParallelRetrieveInstrumentationContext r;
+
+		r.instrumentation = pei->instrumentation;
+		r.area = pei->area;
+		ExecParallelRetrieveInstrumentation(pei->planstate, &r);
+	}
 
 	/* Accumulate JIT instrumentation, if any. */
 	if (pei->jit_instrumentation)
@@ -1495,6 +1577,17 @@ ExecParallelReportInstrumentation(PlanState *planstate,
 	Assert(IsParallelWorker());
 	Assert(ParallelWorkerNumber < instrumentation->num_workers);
 	InstrAggNode(&instrument[ParallelWorkerNumber], planstate->instrument);
+	if (instrumentation->wait_event_usage_offset != 0 &&
+		planstate->wait_event_usage != NULL)
+	{
+		SharedWaitEventUsageWorker *wait_event_usage;
+
+		wait_event_usage = GetInstrumentationWaitEventUsageArray(instrumentation);
+		wait_event_usage += i * instrumentation->num_workers;
+		ExecParallelReportWaitEventUsageWorker(&wait_event_usage[ParallelWorkerNumber],
+											   planstate->state->es_query_dsa,
+											   planstate->wait_event_usage);
+	}
 
 	return planstate_tree_walker(planstate, ExecParallelReportInstrumentation,
 								 instrumentation);
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 7c4c66e323f..081855b3fed 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -118,6 +118,7 @@
 #include "executor/nodeWindowAgg.h"
 #include "executor/nodeWorktablescan.h"
 #include "miscadmin.h"
+#include "utils/wait_event.h"
 #include "nodes/nodeFuncs.h"
 
 static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
@@ -415,6 +416,16 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
 	if (estate->es_instrument)
 		result->instrument = InstrAllocNode(estate->es_instrument,
 											result->async_capable);
+	if (estate->es_instrument & INSTRUMENT_WAITS)
+	{
+		MemoryContext oldcontext;
+
+		oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+		result->wait_event_usage = palloc_object(WaitEventUsage);
+		pgstat_init_wait_event_usage(result->wait_event_usage,
+									 estate->es_query_cxt);
+		MemoryContextSwitchTo(oldcontext);
+	}
 
 	return result;
 }
@@ -481,7 +492,8 @@ ExecProcNodeFirst(PlanState *node)
  * This has essentially the same responsibilities as ExecProcNode,
  * but it does not do InstrStartNode/InstrStopNode (mainly because
  * it can't tell how many returned tuples to count).  Each per-node
- * function must provide its own instrumentation support.
+ * function must provide its own instrumentation support, including
+ * wait event attribution when enabled.
  * ----------------------------------------------------------------
  */
 Node *
@@ -758,6 +770,9 @@ ExecShutdownNode(PlanState *node)
 static bool
 ExecShutdownNode_walker(PlanState *node, void *context)
 {
+	bool		node_running;
+	WaitEventUsage *previous_wait_event_usage = NULL;
+
 	if (node == NULL)
 		return false;
 
@@ -773,8 +788,14 @@ ExecShutdownNode_walker(PlanState *node, void *context)
 	 * has never been executed, so as to avoid incorrectly making it appear
 	 * that it has.
 	 */
-	if (node->instrument && node->instrument->running)
+	node_running = node->instrument && node->instrument->running;
+	if (node_running)
+	{
 		InstrStartNode(node->instrument);
+		if (node->wait_event_usage)
+			previous_wait_event_usage =
+				pgstat_enter_wait_event_usage(node->wait_event_usage);
+	}
 
 	planstate_tree_walker(node, ExecShutdownNode_walker, context);
 
@@ -803,8 +824,12 @@ ExecShutdownNode_walker(PlanState *node, void *context)
 	}
 
 	/* Stop the node if we started it above, reporting 0 tuples. */
-	if (node->instrument && node->instrument->running)
+	if (node_running)
+	{
+		if (node->wait_event_usage)
+			pgstat_restore_wait_event_usage(previous_wait_event_usage);
 		InstrStopNode(node->instrument, 0);
+	}
 
 	return false;
 }
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index ffbcd572133..a1f22180f82 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -21,6 +21,7 @@
 #include "nodes/execnodes.h"
 #include "portability/instr_time.h"
 #include "utils/guc_hooks.h"
+#include "utils/wait_event.h"
 
 BufferUsage pgBufferUsage;
 static BufferUsage save_pgBufferUsage;
@@ -181,11 +182,17 @@ TupleTableSlot *
 ExecProcNodeInstr(PlanState *node)
 {
 	TupleTableSlot *result;
+	WaitEventUsage *previous_wait_event_usage = NULL;
 
 	InstrStartNode(node->instrument);
+	if (node->wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(node->wait_event_usage);
 
 	result = node->ExecProcNodeReal(node);
 
+	if (node->wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
 
 	return result;
diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c
index 9007dda3802..e2b442bd378 100644
--- a/src/backend/executor/nodeBitmapAnd.c
+++ b/src/backend/executor/nodeBitmapAnd.c
@@ -32,6 +32,7 @@
 #include "executor/instrument.h"
 #include "executor/nodeBitmapAnd.h"
 #include "nodes/tidbitmap.h"
+#include "utils/wait_event.h"
 
 
 /* ----------------------------------------------------------------
@@ -115,10 +116,14 @@ MultiExecBitmapAnd(BitmapAndState *node)
 	int			nplans;
 	int			i;
 	TIDBitmap  *result = NULL;
+	WaitEventUsage *previous_wait_event_usage = NULL;
 
 	/* must provide our own instrumentation support */
 	if (node->ps.instrument)
 		InstrStartNode(node->ps.instrument);
+	if (node->ps.wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(node->ps.wait_event_usage);
 
 	/*
 	 * get information from the node
@@ -162,6 +167,8 @@ MultiExecBitmapAnd(BitmapAndState *node)
 		elog(ERROR, "BitmapAnd doesn't support zero inputs");
 
 	/* must provide our own instrumentation support */
+	if (node->ps.wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	if (node->ps.instrument)
 		InstrStopNode(node->ps.instrument, 0 /* XXX */ );
 
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 7978514e1bc..c2798a81e97 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -28,6 +28,7 @@
 #include "executor/nodeIndexscan.h"
 #include "miscadmin.h"
 #include "nodes/tidbitmap.h"
+#include "utils/wait_event.h"
 
 
 /* ----------------------------------------------------------------
@@ -54,10 +55,14 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
 	IndexScanDesc scandesc;
 	double		nTuples = 0;
 	bool		doscan;
+	WaitEventUsage *previous_wait_event_usage = NULL;
 
 	/* must provide our own instrumentation support */
 	if (node->ss.ps.instrument)
 		InstrStartNode(node->ss.ps.instrument);
+	if (node->ss.ps.wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(node->ss.ps.wait_event_usage);
 
 	/*
 	 * extract necessary information from index scan node
@@ -116,6 +121,8 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
 	}
 
 	/* must provide our own instrumentation support */
+	if (node->ss.ps.wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	if (node->ss.ps.instrument)
 		InstrStopNode(node->ss.ps.instrument, nTuples);
 
diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c
index 148c80fdae6..e9ee934a6f6 100644
--- a/src/backend/executor/nodeBitmapOr.c
+++ b/src/backend/executor/nodeBitmapOr.c
@@ -33,6 +33,7 @@
 #include "executor/nodeBitmapOr.h"
 #include "nodes/tidbitmap.h"
 #include "miscadmin.h"
+#include "utils/wait_event.h"
 
 
 /* ----------------------------------------------------------------
@@ -116,10 +117,14 @@ MultiExecBitmapOr(BitmapOrState *node)
 	int			nplans;
 	int			i;
 	TIDBitmap  *result = NULL;
+	WaitEventUsage *previous_wait_event_usage = NULL;
 
 	/* must provide our own instrumentation support */
 	if (node->ps.instrument)
 		InstrStartNode(node->ps.instrument);
+	if (node->ps.wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(node->ps.wait_event_usage);
 
 	/*
 	 * get information from the node
@@ -180,6 +185,8 @@ MultiExecBitmapOr(BitmapOrState *node)
 		elog(ERROR, "BitmapOr doesn't support zero inputs");
 
 	/* must provide our own instrumentation support */
+	if (node->ps.wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	if (node->ps.instrument)
 		InstrStopNode(node->ps.instrument, 0 /* XXX */ );
 
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8825bb6fa23..6e9e4b4218f 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -105,9 +105,14 @@ ExecHash(PlanState *pstate)
 Node *
 MultiExecHash(HashState *node)
 {
+	WaitEventUsage *previous_wait_event_usage = NULL;
+
 	/* must provide our own instrumentation support */
 	if (node->ps.instrument)
 		InstrStartNode(node->ps.instrument);
+	if (node->ps.wait_event_usage)
+		previous_wait_event_usage =
+			pgstat_enter_wait_event_usage(node->ps.wait_event_usage);
 
 	if (node->parallel_state != NULL)
 		MultiExecParallelHash(node);
@@ -115,6 +120,8 @@ MultiExecHash(HashState *node)
 		MultiExecPrivateHash(node);
 
 	/* must provide our own instrumentation support */
+	if (node->ps.wait_event_usage)
+		pgstat_restore_wait_event_usage(previous_wait_event_usage);
 	if (node->ps.instrument)
 		InstrStopNode(node->ps.instrument, node->hashtable->reportTuples);
 
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index eb01bc3d88c..3d5a10b44c4 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -27,6 +27,7 @@
 #include "storage/shmem.h"
 #include "storage/subsystems.h"
 #include "storage/spin.h"
+#include "utils/memutils.h"
 #include "utils/wait_event.h"
 
 
@@ -47,6 +48,7 @@ uint32	   *my_wait_event_info = &local_my_wait_event_info;
 
 int			pgstat_wait_event_usage_depth = 0;
 static WaitEventUsage *pgstat_wait_event_usage = NULL;
+static WaitEventUsage *pgstat_wait_event_node_usage = NULL;
 static uint32 pgstat_wait_event_usage_current = 0;
 static instr_time pgstat_wait_event_usage_start;
 
@@ -358,6 +360,28 @@ pgstat_reset_wait_event_storage(void)
 	my_wait_event_info = &local_my_wait_event_info;
 }
 
+/*
+ * Initialize a wait event usage accumulator.
+ */
+void
+pgstat_init_wait_event_usage(WaitEventUsage *usage, MemoryContext memcontext)
+{
+	Assert(usage != NULL);
+	Assert(memcontext != NULL);
+
+	memset(usage, 0, sizeof(WaitEventUsage));
+
+	/*
+	 * Wait events may end inside critical sections, for example while
+	 * performing synchronous I/O.  Keep usage entries in a dedicated context
+	 * where the memory manager permits that accounting path to grow.
+	 */
+	usage->memcontext = AllocSetContextCreate(memcontext,
+											  "Wait Event Usage",
+											  ALLOCSET_SMALL_SIZES);
+	MemoryContextAllowInCriticalSection(usage->memcontext, true);
+}
+
 /*
  * Start collecting exact wait event timings in this backend.
  *
@@ -375,9 +399,9 @@ pgstat_begin_wait_event_usage(WaitEventUsage *usage, MemoryContext memcontext)
 
 	if (pgstat_wait_event_usage_depth++ == 0)
 	{
-		memset(usage, 0, sizeof(WaitEventUsage));
-		usage->memcontext = memcontext;
+		pgstat_init_wait_event_usage(usage, memcontext);
 		pgstat_wait_event_usage = usage;
+		pgstat_wait_event_node_usage = NULL;
 		pgstat_wait_event_usage_current = 0;
 		INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
 	}
@@ -398,11 +422,27 @@ pgstat_end_wait_event_usage(WaitEventUsage *usage)
 			pgstat_count_wait_event_end();
 
 		pgstat_wait_event_usage = NULL;
+		pgstat_wait_event_node_usage = NULL;
 		pgstat_wait_event_usage_current = 0;
 		INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
 	}
 }
 
+WaitEventUsage *
+pgstat_enter_wait_event_usage(WaitEventUsage *usage)
+{
+	WaitEventUsage *previous = pgstat_wait_event_node_usage;
+
+	pgstat_wait_event_node_usage = usage;
+	return previous;
+}
+
+void
+pgstat_restore_wait_event_usage(WaitEventUsage *usage)
+{
+	pgstat_wait_event_node_usage = usage;
+}
+
 /*
  * Record the beginning of a wait event for exact EXPLAIN-style accounting.
  */
@@ -444,6 +484,11 @@ pgstat_count_wait_event_end(void)
 					  pgstat_wait_event_usage_current,
 					  1,
 					  &elapsed);
+	if (pgstat_wait_event_node_usage != NULL)
+		WaitEventUsageAdd(pgstat_wait_event_node_usage,
+						  pgstat_wait_event_usage_current,
+						  1,
+						  &elapsed);
 
 	pgstat_wait_event_usage_current = 0;
 	INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index f093a52aae0..f46ee03d2c1 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -65,6 +65,7 @@ typedef enum InstrumentOption
 	INSTRUMENT_ROWS = 1 << 2,	/* needs row count */
 	INSTRUMENT_WAL = 1 << 3,	/* needs WAL usage */
 	INSTRUMENT_IO = 1 << 4,		/* needs IO usage */
+	INSTRUMENT_WAITS = 1 << 5,	/* needs wait event usage */
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 781c8c79132..56f9dcbef88 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1212,6 +1212,7 @@ typedef struct PlanState
 
 	NodeInstrumentation *instrument;	/* Optional runtime stats for this
 										 * node */
+	WaitEventUsage *wait_event_usage; /* Optional wait event usage */
 	WorkerNodeInstrumentation *worker_instrument;	/* per-worker
 													 * instrumentation */
 
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 63992137ee7..64411288403 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -36,12 +36,16 @@ static inline void pgstat_report_wait_start(uint32 wait_event_info);
 static inline void pgstat_report_wait_end(void);
 extern void pgstat_set_wait_event_storage(uint32 *wait_event_info);
 extern void pgstat_reset_wait_event_storage(void);
+extern void pgstat_init_wait_event_usage(WaitEventUsage *usage,
+										 MemoryContext memcontext);
 extern void pgstat_begin_wait_event_usage(WaitEventUsage *usage,
 										  MemoryContext memcontext);
 extern void pgstat_end_wait_event_usage(WaitEventUsage *usage);
 extern void pgstat_accumulate_wait_event_usage(WaitEventUsage *usage,
 											   const WaitEventUsageEntry *entries,
 											   int nentries);
+extern WaitEventUsage *pgstat_enter_wait_event_usage(WaitEventUsage *usage);
+extern void pgstat_restore_wait_event_usage(WaitEventUsage *usage);
 extern void pgstat_count_wait_event_start(uint32 wait_event_info);
 extern void pgstat_count_wait_event_end(void);
 
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index e3847e222be..c086fa48d80 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -101,12 +101,14 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
 
 -- WAITS option
 select explain_filter('explain (analyze, waits, costs off, summary off, timing off, buffers off) select pg_sleep(0.01)');
-            explain_filter             
----------------------------------------
+             explain_filter              
+-----------------------------------------
  Result (actual rows=N.N loops=N)
+   Wait Events:
+     Timeout:PgSleep calls=N time=N.N ms
  Wait Events:
    Timeout:PgSleep calls=N time=N.N ms
-(3 rows)
+(5 rows)
 
 select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Wait Events,0}';
                                      ?column?                                     
@@ -114,6 +116,12 @@ select explain_filter_to_json('explain (analyze, waits, costs off, summary off,
  {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
 (1 row)
 
+select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Plan,Wait Events,0}';
+                                     ?column?                                     
+----------------------------------------------------------------------------------
+ {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
+(1 row)
+
 begin;
 create function pg_temp.parallel_pg_sleep(float8) returns void
   language internal volatile parallel safe as 'pg_sleep';
@@ -130,6 +138,44 @@ select jsonb_path_query_first(
  {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
 (1 row)
 
+select jsonb_path_query_first(
+  explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+                         select pg_temp.parallel_pg_sleep(0.01)
+                         from tenk1 where unique1 = 1') #> '{0,Plan}',
+  '$.**."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
+                              jsonb_path_query_first                              
+----------------------------------------------------------------------------------
+ {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
+(1 row)
+
+rollback;
+begin;
+-- This test deliberately creates a Bitmap Index Scan runtime-key wait.
+-- The STABLE PL/pgSQL wrapper is test scaffolding: STABLE lets the
+-- expression be used as an index runtime key, while PL/pgSQL prevents SQL
+-- inlining from moving pg_sleep() out of the Bitmap Index Scan boundary.
+-- The planner GUCs below are likewise test-only scaffolding to make the
+-- node shape deterministic.
+create function pg_temp.explain_waits_sleep_int(int) returns int
+  language plpgsql stable as $$begin perform pg_sleep(0.01); return $1; end$$;
+create temp table explain_waits_bitmap (a int);
+insert into explain_waits_bitmap select g from generate_series(1, 10) g;
+create index explain_waits_bitmap_a_idx on explain_waits_bitmap(a);
+analyze explain_waits_bitmap;
+set local enable_seqscan = off;
+set local enable_indexscan = off;
+select jsonb_path_query_first(
+  explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+                         select * from explain_waits_bitmap
+                         where a = pg_temp.explain_waits_sleep_int(1)') #> '{0,Plan}',
+  '$.** ? (@."Node Type" == "Bitmap Index Scan")."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
+                              jsonb_path_query_first                              
+----------------------------------------------------------------------------------
+ {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
+(1 row)
+
 rollback;
 explain (waits) select 1;
 ERROR:  EXPLAIN option WAITS requires ANALYZE
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index 8821250bcef..bd196f248db 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -71,6 +71,7 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
 -- WAITS option
 select explain_filter('explain (analyze, waits, costs off, summary off, timing off, buffers off) select pg_sleep(0.01)');
 select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Wait Events,0}';
+select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Plan,Wait Events,0}';
 begin;
 create function pg_temp.parallel_pg_sleep(float8) returns void
   language internal volatile parallel safe as 'pg_sleep';
@@ -82,6 +83,34 @@ select jsonb_path_query_first(
                          from tenk1 where unique1 = 1') #> '{0,Wait Events}',
   '$[*] ? (@."Wait Event" == "PgSleep")'
 );
+select jsonb_path_query_first(
+  explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+                         select pg_temp.parallel_pg_sleep(0.01)
+                         from tenk1 where unique1 = 1') #> '{0,Plan}',
+  '$.**."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
+rollback;
+begin;
+-- This test deliberately creates a Bitmap Index Scan runtime-key wait.
+-- The STABLE PL/pgSQL wrapper is test scaffolding: STABLE lets the
+-- expression be used as an index runtime key, while PL/pgSQL prevents SQL
+-- inlining from moving pg_sleep() out of the Bitmap Index Scan boundary.
+-- The planner GUCs below are likewise test-only scaffolding to make the
+-- node shape deterministic.
+create function pg_temp.explain_waits_sleep_int(int) returns int
+  language plpgsql stable as $$begin perform pg_sleep(0.01); return $1; end$$;
+create temp table explain_waits_bitmap (a int);
+insert into explain_waits_bitmap select g from generate_series(1, 10) g;
+create index explain_waits_bitmap_a_idx on explain_waits_bitmap(a);
+analyze explain_waits_bitmap;
+set local enable_seqscan = off;
+set local enable_indexscan = off;
+select jsonb_path_query_first(
+  explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+                         select * from explain_waits_bitmap
+                         where a = pg_temp.explain_waits_sleep_int(1)') #> '{0,Plan}',
+  '$.** ? (@."Node Type" == "Bitmap Index Scan")."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
 rollback;
 explain (waits) select 1;
 
-- 
2.52.0






reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Reply to all the recipients using the --to and --cc options:
  reply via email

  To: [email protected]
  Cc: [email protected]
  Subject: Re: [RFC PATCH v0 3/7] Attribute EXPLAIN WAITS to plan nodes
  In-Reply-To: <cb26aa486d42f227209c27df4aef4b07f13e334a.1778280923.git.tanswis42@gmail.com>

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox