public inbox for [email protected]
help / color / mirror / Atom feedFrom: Lukas Fittl <[email protected]>
To: PostgreSQL Hackers <[email protected]>
Cc: Andres Freund <[email protected]>
Subject: Stack-based tracking of per-node WAL/buffer usage
Date: Sun, 31 Aug 2025 16:57:01 -0700
Message-ID: <CAP53PkzdBK8VJ1fS4AZ481LgMN8f9mJiC39ZRHqkFUSYq6KWmg@mail.gmail.com> (raw)
Hi,
Please find attached a patch series that introduces a new paradigm for how
per-node WAL/buffer usage is tracked, with two primary goals: (1) reduce
overhead of EXPLAIN ANALYZE, (2) enable future work like tracking estimated
distinct buffer hits [0].
Currently we utilize pgWalUsage/pgBufferUsage as global counters, and in
InstrStopNode we call the rather
expensive BufferUsageAccumDiff/WalUsageAccumDiff to know how much activity
happened within a given node cycle.
This proposal instead uses a stack, where each time we enter a node
(InstrStartNode) we point a new global (pgInstrStack) to the current stack
entry. Whilst we're in that node we increment buffer/WAL usage statistics
to the stack entry. On exit (InstrStopNode) we restore the previous entry.
This change provides about a 10% performance benefit for EXPLAIN ANALYZE on
paths that repeatedly enter InstrStopNode, e.g. SELECT COUNT(*):
CREATE TABLE test(id int);
INSERT INTO test SELECT * FROM generate_series(0, 1000000);
master (124ms, best out of 3):
postgres=# EXPLAIN (ANALYZE) SELECT COUNT(*) FROM test;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=16925.01..16925.02 rows=1 width=8) (actual
time=124.910..124.910 rows=1.00 loops=1)
Buffers: shared hit=752 read=3673
-> Seq Scan on test (cost=0.00..14425.01 rows=1000001 width=0) (actual
time=0.201..62.228 rows=1000001.00 loops=1)
Buffers: shared hit=752 read=3673
Planning Time: 0.116 ms
Execution Time: 124.961 ms
patched (109ms, best out of 3):
postgres=# EXPLAIN (ANALYZE) SELECT COUNT(*) FROM test;
QUERY PLAN
------------------------------------------------------------------------------------------------------------------------
Aggregate (cost=16925.01..16925.02 rows=1 width=8) (actual
time=109.788..109.788 rows=1.00 loops=1)
Buffers: shared hit=940 read=3485
-> Seq Scan on test (cost=0.00..14425.01 rows=1000001 width=0) (actual
time=0.153..69.368 rows=1000001.00 loops=1)
Buffers: shared hit=940 read=3485
Planning Time: 0.134 ms
Execution Time: 109.837 ms
(6 rows)
I have also prototyped a more ambitious approach that completely removes
pgWalUsage/pgBufferUsage (utilizing the stack-collected data for e.g.
pg_stat_statements), but for now this patch set does not include that
change, but instead keeps adding to these legacy globals as well.
Patches attached:
0001: Separate node instrumentation from other use of Instrumentation struct
Previously different places (e.g. query "total time") were repurposing
the per-node Instrumentation struct. Instead, simplify the Instrumentation
struct to only track time, WAL/buffer usage, and tuple counts. Similarly,
drop the use of InstrEndLoop outside of per-node instrumentation. Introduce
the NodeInstrumentation struct to carry forward the per-node
instrumentation information.
0002: Replace direct changes of pgBufferUsage/pgWalUsage with INSTR_* macros
0003: Introduce stack for tracking per-node WAL/buffer usage
Feedback/thoughts welcome!
CCing Andres since he had expressed interest in this off-list.
[0]: See lightning talk slides from PGConf.Dev discussing an HLL-based
EXPLAIN (BUFFERS DISTINCT):
https://resources.pganalyze.com/pganalyze_PGConf.dev_2025_shared_blks_hit_distinct.pdf
Thanks,
Lukas
--
Lukas Fittl
Attachments:
[application/octet-stream] v1-0002-Replace-direct-changes-of-pgBufferUsage-pgWalUsag.patch (9.0K, 3-v1-0002-Replace-direct-changes-of-pgBufferUsage-pgWalUsag.patch)
download | inline diff:
From 942d8eb9b0f6a8d95c7cfd6a995d93ea9c667151 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Sun, 31 Aug 2025 16:34:42 -0700
Subject: [PATCH v1 2/3] Replace direct changes of pgBufferUsage/pgWalUsage
with INSTR_* macros
This encapsulates the ownership of these globals better, and will allow
a subsequent refactoring.
---
src/backend/access/transam/xlog.c | 8 ++++----
src/backend/storage/buffer/bufmgr.c | 24 ++++++++++++------------
src/backend/storage/buffer/localbuf.c | 6 +++---
src/backend/storage/file/buffile.c | 8 ++++----
src/backend/utils/activity/pgstat_io.c | 8 ++++----
src/include/executor/instrument.h | 19 +++++++++++++++++++
6 files changed, 46 insertions(+), 27 deletions(-)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 7ffb2179151..61516f35676 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -1078,9 +1078,9 @@ XLogInsertRecord(XLogRecData *rdata,
/* Report WAL traffic to the instrumentation. */
if (inserted)
{
- pgWalUsage.wal_bytes += rechdr->xl_tot_len;
- pgWalUsage.wal_records++;
- pgWalUsage.wal_fpi += num_fpi;
+ INSTR_WALUSAGE_ADD(wal_bytes, rechdr->xl_tot_len);
+ INSTR_WALUSAGE_INCR(wal_records);
+ INSTR_WALUSAGE_ADD(wal_fpi, num_fpi);
/* Required for the flush of pending stats WAL data */
pgstat_report_fixed = true;
@@ -2060,7 +2060,7 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic)
WriteRqst.Flush = 0;
XLogWrite(WriteRqst, tli, false);
LWLockRelease(WALWriteLock);
- pgWalUsage.wal_buffers_full++;
+ INSTR_WALUSAGE_INCR(wal_buffers_full);
TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
/*
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 350cc0402aa..41f7c729c1d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -705,7 +705,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
{
PinLocalBuffer(bufHdr, true);
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
return true;
}
@@ -737,7 +737,7 @@ ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockN
else
PinBuffer_Locked(bufHdr); /* pin for first time */
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
return true;
}
@@ -1147,14 +1147,14 @@ PinBufferForBlock(Relation rel,
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
if (*foundPtr)
- pgBufferUsage.local_blks_hit++;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
}
else
{
bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
strategy, foundPtr, io_context);
if (*foundPtr)
- pgBufferUsage.shared_blks_hit++;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
}
if (rel)
{
@@ -1888,9 +1888,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
true);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(local_blks_hit);
else
- pgBufferUsage.shared_blks_hit += 1;
+ INSTR_BUFUSAGE_INCR(shared_blks_hit);
if (operation->rel)
pgstat_count_buffer_hit(operation->rel);
@@ -1958,9 +1958,9 @@ AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
io_start, 1, io_buffers_len * BLCKSZ);
if (persistence == RELPERSISTENCE_TEMP)
- pgBufferUsage.local_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(local_blks_read, io_buffers_len);
else
- pgBufferUsage.shared_blks_read += io_buffers_len;
+ INSTR_BUFUSAGE_ADD(shared_blks_read, io_buffers_len);
/*
* Track vacuum cost when issuing IO, not after waiting for it.
@@ -2865,7 +2865,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
}
- pgBufferUsage.shared_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(shared_blks_written, extend_by);
*extended_by = extend_by;
@@ -2983,7 +2983,7 @@ MarkBufferDirty(Buffer buffer)
*/
if (!(old_buf_state & BM_DIRTY))
{
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
@@ -4391,7 +4391,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
IOOP_WRITE, io_start, 1, BLCKSZ);
- pgBufferUsage.shared_blks_written++;
+ INSTR_BUFUSAGE_INCR(shared_blks_written);
/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
@@ -5547,7 +5547,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
if (dirtied)
{
- pgBufferUsage.shared_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(shared_blks_dirtied);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 3c0d20f4659..46bc57812df 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -216,7 +216,7 @@ FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
/* Mark not-dirty */
TerminateLocalBufferIO(bufHdr, true, 0, false);
- pgBufferUsage.local_blks_written++;
+ INSTR_BUFUSAGE_INCR(local_blks_written);
}
static Buffer
@@ -476,7 +476,7 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
*extended_by = extend_by;
- pgBufferUsage.local_blks_written += extend_by;
+ INSTR_BUFUSAGE_ADD(local_blks_written, extend_by);
return first_block;
}
@@ -507,7 +507,7 @@ MarkLocalBufferDirty(Buffer buffer)
buf_state = pg_atomic_read_u32(&bufHdr->state);
if (!(buf_state & BM_DIRTY))
- pgBufferUsage.local_blks_dirtied++;
+ INSTR_BUFUSAGE_INCR(local_blks_dirtied);
buf_state |= BM_DIRTY;
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 366d70d38a1..9d39df998cb 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -474,13 +474,13 @@ BufFileLoadBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_read_time, io_time, io_start);
}
/* we choose not to advance curOffset here */
if (file->nbytes > 0)
- pgBufferUsage.temp_blks_read++;
+ INSTR_BUFUSAGE_INCR(temp_blks_read);
}
/*
@@ -548,13 +548,13 @@ BufFileDumpBuffer(BufFile *file)
if (track_io_timing)
{
INSTR_TIME_SET_CURRENT(io_time);
- INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
+ INSTR_BUFUSAGE_TIME_ACCUM_DIFF(temp_blk_write_time, io_time, io_start);
}
file->curOffset += bytestowrite;
wpos += bytestowrite;
- pgBufferUsage.temp_blks_written++;
+ INSTR_BUFUSAGE_INCR(temp_blks_written);
}
file->dirty = false;
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 13ae57ed649..4f6274eb573 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -135,17 +135,17 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
{
pgstat_count_buffer_write_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_write_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_write_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_write_time, io_time);
}
else if (io_op == IOOP_READ)
{
pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
if (io_object == IOOBJECT_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.shared_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(shared_blk_read_time, io_time);
else if (io_object == IOOBJECT_TEMP_RELATION)
- INSTR_TIME_ADD(pgBufferUsage.local_blk_read_time, io_time);
+ INSTR_BUFUSAGE_TIME_ADD(local_blk_read_time, io_time);
}
}
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 8c563510f4c..3a280f4caae 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -149,4 +149,23 @@ extern void BufferUsageAccumDiff(BufferUsage *dst,
extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
const WalUsage *sub);
+#define INSTR_BUFUSAGE_INCR(fld) do { \
+ pgBufferUsage.fld++; \
+ } while(0)
+#define INSTR_BUFUSAGE_ADD(fld,val) do { \
+ pgBufferUsage.fld += val; \
+ } while(0)
+#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
+ INSTR_TIME_ADD(pgBufferUsage.fld, val); \
+ } while (0)
+#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
+ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \
+ } while (0)
+#define INSTR_WALUSAGE_INCR(fld) do { \
+ pgWalUsage.fld++; \
+ } while(0)
+#define INSTR_WALUSAGE_ADD(fld,val) do { \
+ pgWalUsage.fld += val; \
+ } while(0)
+
#endif /* INSTRUMENT_H */
--
2.47.1
[application/octet-stream] v1-0001-Separate-node-instrumentation-from-other-use-of-I.patch (21.8K, 4-v1-0001-Separate-node-instrumentation-from-other-use-of-I.patch)
download | inline diff:
From 3e35547712f1fb10bce4fb3908912a282196b198 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Sat, 1 Mar 2025 19:31:30 -0800
Subject: [PATCH v1 1/3] Separate node instrumentation from other use of
Instrumentation struct
Previously different places (e.g. query "total time") were repurposing
the Instrumentation struct initially introduced for capturing per-node
statistics during execution. This dual use of the struct is confusing,
e.g. by cluttering calls of InstrStartNode/InstrStopNode in unrelated
code paths, and prevents future refactorings.
Instead, simplify the Instrumentation struct to only track time,
WAL/buffer usage, and tuple counts. Similarly, drop the use of InstrEndLoop
outside of per-node instrumentation. Introduce the NodeInstrumentation
struct to carry forward the per-node instrumentation information.
---
contrib/auto_explain/auto_explain.c | 10 +--
.../pg_stat_statements/pg_stat_statements.c | 10 +--
contrib/postgres_fdw/postgres_fdw.c | 2 +-
src/backend/commands/explain.c | 11 +--
src/backend/commands/trigger.c | 8 +-
src/backend/executor/execMain.c | 10 +--
src/backend/executor/execParallel.c | 22 +++--
src/backend/executor/execProcnode.c | 4 +-
src/backend/executor/instrument.c | 86 +++++++++++++++++--
src/include/executor/instrument.h | 51 ++++++++---
src/include/nodes/execnodes.h | 3 +-
11 files changed, 153 insertions(+), 64 deletions(-)
diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index 1f4badb4928..bd059f12224 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -315,7 +315,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL);
MemoryContextSwitchTo(oldcxt);
}
}
@@ -381,14 +381,8 @@ explain_ExecutorEnd(QueryDesc *queryDesc)
*/
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
/* Log plan if duration is exceeded. */
- msec = queryDesc->totaltime->total * 1000.0;
+ msec = INSTR_TIME_GET_DOUBLE(queryDesc->totaltime->total) * 1000.0;
if (msec >= auto_explain_log_min_duration)
{
ExplainState *es = NewExplainState();
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index 1cb368c8590..a1e69789c73 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -1021,7 +1021,7 @@ pgss_ExecutorStart(QueryDesc *queryDesc, int eflags)
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo(queryDesc->estate->es_query_cxt);
- queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL, false);
+ queryDesc->totaltime = InstrAlloc(1, INSTRUMENT_ALL);
MemoryContextSwitchTo(oldcxt);
}
}
@@ -1080,18 +1080,12 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
if (queryId != INT64CONST(0) && queryDesc->totaltime &&
pgss_enabled(nesting_level))
{
- /*
- * Make sure stats accumulation is done. (Note: it's okay if several
- * levels of hook all do this.)
- */
- InstrEndLoop(queryDesc->totaltime);
-
pgss_store(queryDesc->sourceText,
queryId,
queryDesc->plannedstmt->stmt_location,
queryDesc->plannedstmt->stmt_len,
PGSS_EXEC,
- queryDesc->totaltime->total * 1000.0, /* convert to msec */
+ INSTR_TIME_GET_DOUBLE(queryDesc->totaltime->total) * 1000.0, /* convert to msec */
queryDesc->estate->es_total_processed,
&queryDesc->totaltime->bufusage,
&queryDesc->totaltime->walusage,
diff --git a/contrib/postgres_fdw/postgres_fdw.c b/contrib/postgres_fdw/postgres_fdw.c
index 456b267f70b..7619ac486c0 100644
--- a/contrib/postgres_fdw/postgres_fdw.c
+++ b/contrib/postgres_fdw/postgres_fdw.c
@@ -2778,7 +2778,7 @@ postgresIterateDirectModify(ForeignScanState *node)
if (!resultRelInfo->ri_projectReturning)
{
TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;
- Instrumentation *instr = node->ss.ps.instrument;
+ NodeInstrumentation *instr = node->ss.ps.instrument;
Assert(!dmstate->has_returning);
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 8345bc0264b..6a135e51996 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1102,9 +1102,6 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
char *relname;
char *conname = NULL;
- /* Must clean up instrumentation state */
- InstrEndLoop(instr);
-
/*
* We ignore triggers that were never invoked; they likely aren't
* relevant to the current query type.
@@ -1135,7 +1132,7 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
appendStringInfo(es->str, " on %s", relname);
if (es->timing)
appendStringInfo(es->str, ": time=%.3f calls=%.0f\n",
- 1000.0 * instr->total, instr->ntuples);
+ 1000.0 * INSTR_TIME_GET_DOUBLE(instr->total), instr->ntuples);
else
appendStringInfo(es->str, ": calls=%.0f\n", instr->ntuples);
}
@@ -1146,7 +1143,7 @@ report_triggers(ResultRelInfo *rInfo, bool show_relname, ExplainState *es)
ExplainPropertyText("Constraint Name", conname, es);
ExplainPropertyText("Relation", relname, es);
if (es->timing)
- ExplainPropertyFloat("Time", "ms", 1000.0 * instr->total, 3,
+ ExplainPropertyFloat("Time", "ms", 1000.0 * INSTR_TIME_GET_DOUBLE(instr->total), 3,
es);
ExplainPropertyFloat("Calls", NULL, instr->ntuples, 0, es);
}
@@ -1888,7 +1885,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
double startup_ms;
double total_ms;
@@ -2294,7 +2291,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
for (int n = 0; n < w->num_workers; n++)
{
- Instrumentation *instrument = &w->instrument[n];
+ NodeInstrumentation *instrument = &w->instrument[n];
double nloops = instrument->nloops;
if (nloops <= 0)
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index 579ac8d76ae..9b53dd99e99 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -2344,7 +2344,7 @@ ExecCallTriggerFunc(TriggerData *trigdata,
* If doing EXPLAIN ANALYZE, start charging time to this trigger.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStart(instr + tgindx);
/*
* Do the function evaluation in the per-tuple memory context, so that
@@ -2392,7 +2392,7 @@ ExecCallTriggerFunc(TriggerData *trigdata,
* one "tuple returned" (really the number of firings).
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStop(instr + tgindx, 1);
return (HeapTuple) DatumGetPointer(result);
}
@@ -4381,7 +4381,7 @@ AfterTriggerExecute(EState *estate,
* to include time spent re-fetching tuples in the trigger cost.
*/
if (instr)
- InstrStartNode(instr + tgindx);
+ InstrStart(instr + tgindx);
/*
* Fetch the required tuple(s).
@@ -4607,7 +4607,7 @@ AfterTriggerExecute(EState *estate,
* one "tuple returned" (really the number of firings).
*/
if (instr)
- InstrStopNode(instr + tgindx, 1);
+ InstrStop(instr + tgindx, 1);
}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index b8b9d2a85f7..b83ced9a57a 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -331,7 +331,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
/* Allow instrumentation of Executor overall runtime */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrStart(queryDesc->totaltime);
/*
* extract information from the query descriptor and the query feature.
@@ -383,7 +383,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
dest->rShutdown(dest);
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, estate->es_processed);
+ InstrStop(queryDesc->totaltime, estate->es_processed);
MemoryContextSwitchTo(oldcontext);
}
@@ -433,7 +433,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
/* Allow instrumentation of Executor overall runtime */
if (queryDesc->totaltime)
- InstrStartNode(queryDesc->totaltime);
+ InstrStart(queryDesc->totaltime);
/* Run ModifyTable nodes to completion */
ExecPostprocessPlan(estate);
@@ -443,7 +443,7 @@ standard_ExecutorFinish(QueryDesc *queryDesc)
AfterTriggerEndQuery(estate);
if (queryDesc->totaltime)
- InstrStopNode(queryDesc->totaltime, 0);
+ InstrStop(queryDesc->totaltime, 0);
MemoryContextSwitchTo(oldcontext);
@@ -1247,7 +1247,7 @@ InitResultRelInfo(ResultRelInfo *resultRelInfo,
resultRelInfo->ri_TrigWhenExprs = (ExprState **)
palloc0(n * sizeof(ExprState *));
if (instrument_options)
- resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options, false);
+ resultRelInfo->ri_TrigInstrument = InstrAlloc(n, instrument_options);
}
else
{
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index f098a5557cf..e87810d292e 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -85,7 +85,7 @@ typedef struct FixedParallelExecutorState
* instrument_options: Same meaning here as in instrument.c.
*
* instrument_offset: Offset, relative to the start of this structure,
- * of the first Instrumentation object. This will depend on the length of
+ * of the first NodeInstrumentation object. This will depend on the length of
* the plan_node_id array.
*
* num_workers: Number of workers.
@@ -102,11 +102,15 @@ struct SharedExecutorInstrumentation
int num_workers;
int num_plan_nodes;
int plan_node_id[FLEXIBLE_ARRAY_MEMBER];
- /* array of num_plan_nodes * num_workers Instrumentation objects follows */
+
+ /*
+ * array of num_plan_nodes * num_workers NodeInstrumentation objects
+ * follows
+ */
};
#define GetInstrumentationArray(sei) \
(AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
- (Instrumentation *) (((char *) sei) + sei->instrument_offset))
+ (NodeInstrumentation *) (((char *) sei) + sei->instrument_offset))
/* Context object for ExecParallelEstimate. */
typedef struct ExecParallelEstimateContext
@@ -713,7 +717,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation_len = MAXALIGN(instrumentation_len);
instrument_offset = instrumentation_len;
instrumentation_len +=
- mul_size(sizeof(Instrumentation),
+ mul_size(sizeof(NodeInstrumentation),
mul_size(e.nnodes, nworkers));
shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
shm_toc_estimate_keys(&pcxt->estimator, 1);
@@ -799,7 +803,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
*/
if (estate->es_instrument)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
@@ -809,7 +813,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation->num_plan_nodes = e.nnodes;
instrument = GetInstrumentationArray(instrumentation);
for (i = 0; i < nworkers * e.nnodes; ++i)
- InstrInit(&instrument[i], estate->es_instrument);
+ InstrInitNode(&instrument[i], estate->es_instrument);
shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
instrumentation);
pei->instrumentation = instrumentation;
@@ -1036,7 +1040,7 @@ static bool
ExecParallelRetrieveInstrumentation(PlanState *planstate,
SharedExecutorInstrumentation *instrumentation)
{
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
int i;
int n;
int ibytes;
@@ -1064,7 +1068,7 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
* Switch into per-query memory context.
*/
oldcontext = MemoryContextSwitchTo(planstate->state->es_query_cxt);
- ibytes = mul_size(instrumentation->num_workers, sizeof(Instrumentation));
+ ibytes = mul_size(instrumentation->num_workers, sizeof(NodeInstrumentation));
planstate->worker_instrument =
palloc(ibytes + offsetof(WorkerInstrumentation, instrument));
MemoryContextSwitchTo(oldcontext);
@@ -1296,7 +1300,7 @@ ExecParallelReportInstrumentation(PlanState *planstate,
{
int i;
int plan_node_id = planstate->plan->plan_node_id;
- Instrumentation *instrument;
+ NodeInstrumentation *instrument;
InstrEndLoop(planstate->instrument);
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index f5f9cfbeead..d286471254b 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -413,8 +413,8 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
/* Set up instrumentation for this node if requested */
if (estate->es_instrument)
- result->instrument = InstrAlloc(1, estate->es_instrument,
- result->async_capable);
+ result->instrument = InstrAllocNode(1, estate->es_instrument,
+ result->async_capable);
return result;
}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 56e635f4700..c53480d8030 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -26,9 +26,9 @@ static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
static void WalUsageAdd(WalUsage *dst, WalUsage *add);
-/* Allocate new instrumentation structure(s) */
+/* General purpose instrumentation handling */
Instrumentation *
-InstrAlloc(int n, int instrument_options, bool async_mode)
+InstrAlloc(int n, int instrument_options)
{
Instrumentation *instr;
@@ -41,6 +41,74 @@ InstrAlloc(int n, int instrument_options, bool async_mode)
bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
int i;
+ for (i = 0; i < n; i++)
+ {
+ instr[i].need_bufusage = need_buffers;
+ instr[i].need_walusage = need_wal;
+ instr[i].need_timer = need_timer;
+ }
+ }
+
+ return instr;
+}
+void
+InstrStart(Instrumentation *instr)
+{
+ if (instr->need_timer &&
+ !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
+ elog(ERROR, "InstrStart called twice in a row");
+
+ if (instr->need_bufusage)
+ instr->bufusage_start = pgBufferUsage;
+
+ if (instr->need_walusage)
+ instr->walusage_start = pgWalUsage;
+}
+void
+InstrStop(Instrumentation *instr, double nTuples)
+{
+ instr_time endtime;
+
+ /* count the specified tuples */
+ instr->ntuples += nTuples;
+
+ /* let's update the time only if the timer was requested */
+ if (instr->need_timer)
+ {
+ if (INSTR_TIME_IS_ZERO(instr->starttime))
+ elog(ERROR, "InstrStop called without start");
+
+ INSTR_TIME_SET_CURRENT(endtime);
+ INSTR_TIME_ACCUM_DIFF(instr->total, endtime, instr->starttime);
+
+ INSTR_TIME_SET_ZERO(instr->starttime);
+ }
+
+ /* Add delta of buffer usage since entry to node's totals */
+ if (instr->need_bufusage)
+ BufferUsageAccumDiff(&instr->bufusage,
+ &pgBufferUsage, &instr->bufusage_start);
+
+ if (instr->need_walusage)
+ WalUsageAccumDiff(&instr->walusage,
+ &pgWalUsage, &instr->walusage_start);
+}
+
+/* Allocate new node instrumentation structure(s) */
+NodeInstrumentation *
+InstrAllocNode(int n, int instrument_options, bool async_mode)
+{
+ NodeInstrumentation *instr;
+
+ /* initialize all fields to zeroes, then modify as needed */
+ instr = palloc0(n * sizeof(NodeInstrumentation));
+ if (instrument_options & (INSTRUMENT_BUFFERS | INSTRUMENT_TIMER | INSTRUMENT_WAL))
+ {
+ bool need_buffers = (instrument_options & INSTRUMENT_BUFFERS) != 0;
+ bool need_wal = (instrument_options & INSTRUMENT_WAL) != 0;
+ bool need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
+ int i;
+
for (i = 0; i < n; i++)
{
instr[i].need_bufusage = need_buffers;
@@ -55,9 +123,9 @@ InstrAlloc(int n, int instrument_options, bool async_mode)
/* Initialize a pre-allocated instrumentation structure. */
void
-InstrInit(Instrumentation *instr, int instrument_options)
+InstrInitNode(NodeInstrumentation * instr, int instrument_options)
{
- memset(instr, 0, sizeof(Instrumentation));
+ memset(instr, 0, sizeof(NodeInstrumentation));
instr->need_bufusage = (instrument_options & INSTRUMENT_BUFFERS) != 0;
instr->need_walusage = (instrument_options & INSTRUMENT_WAL) != 0;
instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
@@ -65,7 +133,7 @@ InstrInit(Instrumentation *instr, int instrument_options)
/* Entry to a plan node */
void
-InstrStartNode(Instrumentation *instr)
+InstrStartNode(NodeInstrumentation * instr)
{
if (instr->need_timer &&
!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
@@ -81,7 +149,7 @@ InstrStartNode(Instrumentation *instr)
/* Exit from a plan node */
void
-InstrStopNode(Instrumentation *instr, double nTuples)
+InstrStopNode(NodeInstrumentation * instr, double nTuples)
{
double save_tuplecount = instr->tuplecount;
instr_time endtime;
@@ -129,7 +197,7 @@ InstrStopNode(Instrumentation *instr, double nTuples)
/* Update tuple count */
void
-InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
+InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples)
{
/* count the returned tuples */
instr->tuplecount += nTuples;
@@ -137,7 +205,7 @@ InstrUpdateTupleCount(Instrumentation *instr, double nTuples)
/* Finish a run cycle for a plan node */
void
-InstrEndLoop(Instrumentation *instr)
+InstrEndLoop(NodeInstrumentation * instr)
{
double totaltime;
@@ -166,7 +234,7 @@ InstrEndLoop(Instrumentation *instr)
/* aggregate instrumentation information */
void
-InstrAggNode(Instrumentation *dst, Instrumentation *add)
+InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add)
{
if (!dst->running && add->running)
{
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 03653ab6c6c..8c563510f4c 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -66,7 +66,33 @@ typedef enum InstrumentOption
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
+/*
+ * General purpose instrumentation that can capture time, WAL/buffer usage and tuples
+ *
+ * Initialized through InstrAlloc, followed by one or more calls to a pair of
+ * InstrStart/InstrStop (activity is measured inbetween).
+ */
typedef struct Instrumentation
+{
+ /* Parameters set at creation: */
+ bool need_timer; /* true if we need timer data */
+ bool need_bufusage; /* true if we need buffer usage data */
+ bool need_walusage; /* true if we need WAL usage data */
+ /* Internal state keeping: */
+ instr_time starttime; /* start time of current iteration of node */
+ BufferUsage bufusage_start; /* buffer usage at start */
+ WalUsage walusage_start; /* WAL usage at start */
+ /* Accumulated statistics: */
+ instr_time total; /* total runtime */
+ double ntuples; /* total tuples counted in InstrStop */
+ BufferUsage bufusage; /* total buffer usage */
+ WalUsage walusage; /* total WAL usage */
+} Instrumentation;
+
+/*
+ * Specialized instrumentation for per-node execution statistics
+ */
+typedef struct NodeInstrumentation
{
/* Parameters set at node creation: */
bool need_timer; /* true if we need timer data */
@@ -91,25 +117,30 @@ typedef struct Instrumentation
double nfiltered2; /* # of tuples removed by "other" quals */
BufferUsage bufusage; /* total buffer usage */
WalUsage walusage; /* total WAL usage */
-} Instrumentation;
+} NodeInstrumentation;
typedef struct WorkerInstrumentation
{
int num_workers; /* # of structures that follow */
- Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
+ NodeInstrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
} WorkerInstrumentation;
extern PGDLLIMPORT BufferUsage pgBufferUsage;
extern PGDLLIMPORT WalUsage pgWalUsage;
-extern Instrumentation *InstrAlloc(int n, int instrument_options,
- bool async_mode);
-extern void InstrInit(Instrumentation *instr, int instrument_options);
-extern void InstrStartNode(Instrumentation *instr);
-extern void InstrStopNode(Instrumentation *instr, double nTuples);
-extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples);
-extern void InstrEndLoop(Instrumentation *instr);
-extern void InstrAggNode(Instrumentation *dst, Instrumentation *add);
+extern Instrumentation *InstrAlloc(int n, int instrument_options);
+extern void InstrStart(Instrumentation *instr);
+extern void InstrStop(Instrumentation *instr, double nTuples);
+
+extern NodeInstrumentation * InstrAllocNode(int n, int instrument_options,
+ bool async_mode);
+extern void InstrInitNode(NodeInstrumentation * instr, int instrument_options);
+extern void InstrStartNode(NodeInstrumentation * instr);
+extern void InstrStopNode(NodeInstrumentation * instr, double nTuples);
+extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples);
+extern void InstrEndLoop(NodeInstrumentation * instr);
+extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add);
+
extern void InstrStartParallelQuery(void);
extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
extern void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index de782014b2d..9b3bd66d401 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1169,7 +1169,8 @@ typedef struct PlanState
ExecProcNodeMtd ExecProcNodeReal; /* actual function, if above is a
* wrapper */
- Instrumentation *instrument; /* Optional runtime stats for this node */
+ NodeInstrumentation *instrument; /* Optional runtime stats for this
+ * node */
WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
/* Per-worker JIT instrumentation */
--
2.47.1
[application/octet-stream] v1-0003-Introduce-stack-for-tracking-per-node-WAL-buffer-.patch (12.3K, 5-v1-0003-Introduce-stack-for-tracking-per-node-WAL-buffer-.patch)
download | inline diff:
From 4375fcb4141f18d6cd927659970518553aa3fe94 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <[email protected]>
Date: Sun, 31 Aug 2025 16:37:05 -0700
Subject: [PATCH v1 3/3] Introduce stack for tracking per-node WAL/buffer usage
---
src/backend/commands/explain.c | 8 +-
src/backend/executor/execMain.c | 7 ++
src/backend/executor/execProcnode.c | 9 +++
src/backend/executor/instrument.c | 111 ++++++++++++++++++++++++----
src/include/executor/instrument.h | 42 ++++++++++-
5 files changed, 155 insertions(+), 22 deletions(-)
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 6a135e51996..584f0adbcc1 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2280,9 +2280,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
/* Show buffer/WAL usage */
if (es->buffers && planstate->instrument)
- show_buffer_usage(es, &planstate->instrument->bufusage);
+ show_buffer_usage(es, &planstate->instrument->stack.bufusage);
if (es->wal && planstate->instrument)
- show_wal_usage(es, &planstate->instrument->walusage);
+ show_wal_usage(es, &planstate->instrument->stack.walusage);
/* Prepare per-worker buffer/WAL usage */
if (es->workers_state && (es->buffers || es->wal) && es->verbose)
@@ -2299,9 +2299,9 @@ ExplainNode(PlanState *planstate, List *ancestors,
ExplainOpenWorker(n, es);
if (es->buffers)
- show_buffer_usage(es, &instrument->bufusage);
+ show_buffer_usage(es, &instrument->stack.bufusage);
if (es->wal)
- show_wal_usage(es, &instrument->walusage);
+ show_wal_usage(es, &instrument->stack.walusage);
ExplainCloseWorker(n, es);
}
}
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index b83ced9a57a..1c2268bc608 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -312,6 +312,7 @@ standard_ExecutorRun(QueryDesc *queryDesc,
DestReceiver *dest;
bool sendTuples;
MemoryContext oldcontext;
+ InstrStackResource *res;
/* sanity checks */
Assert(queryDesc != NULL);
@@ -333,6 +334,9 @@ standard_ExecutorRun(QueryDesc *queryDesc,
if (queryDesc->totaltime)
InstrStart(queryDesc->totaltime);
+ /* Start up per-query node level instrumentation */
+ res = InstrStartQuery();
+
/*
* extract information from the query descriptor and the query feature.
*/
@@ -382,6 +386,9 @@ standard_ExecutorRun(QueryDesc *queryDesc,
if (sendTuples)
dest->rShutdown(dest);
+ /* Shut down per-query node level instrumentation */
+ InstrShutdownQuery(res);
+
if (queryDesc->totaltime)
InstrStop(queryDesc->totaltime, estate->es_processed);
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index d286471254b..7436f307994 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -823,8 +823,17 @@ ExecShutdownNode_walker(PlanState *node, void *context)
/* Stop the node if we started it above, reporting 0 tuples. */
if (node->instrument && node->instrument->running)
+ {
InstrStopNode(node->instrument, 0);
+ /*
+ * Propagate WAL/buffer stats to the parent node on the
+ * instrumentation stack (which is where InstrStopNode returned us
+ * to).
+ */
+ InstrNodeAddToCurrent(&node->instrument->stack);
+ }
+
return false;
}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index c53480d8030..040d1fdecbd 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -16,15 +16,40 @@
#include <unistd.h>
#include "executor/instrument.h"
+#include "utils/memutils.h"
BufferUsage pgBufferUsage;
static BufferUsage save_pgBufferUsage;
WalUsage pgWalUsage;
static WalUsage save_pgWalUsage;
+InstrStack *pgInstrStack = NULL;
static void BufferUsageAdd(BufferUsage *dst, const BufferUsage *add);
static void WalUsageAdd(WalUsage *dst, WalUsage *add);
+/*
+ * Node-specific instrumentation handling uses ResourceOwner mechanism to
+ * reset pgInstrStack on abort.
+ */
+static void ResOwnerReleaseInstrStack(Datum res);
+static const ResourceOwnerDesc instr_stack_resowner_desc =
+{
+ .name = "instrumentation stack scope",
+ .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
+ .release_priority = RELEASE_PRIO_FIRST,
+ .ReleaseResource = ResOwnerReleaseInstrStack,
+ .DebugPrint = NULL, /* default message is fine */
+};
+static inline void
+ResourceOwnerRememberInstrStack(ResourceOwner owner, InstrStackResource * scope)
+{
+ ResourceOwnerRemember(owner, PointerGetDatum(scope), &instr_stack_resowner_desc);
+}
+static inline void
+ResourceOwnerForgetInstrStack(ResourceOwner owner, InstrStackResource * scope)
+{
+ ResourceOwnerForget(owner, PointerGetDatum(scope), &instr_stack_resowner_desc);
+}
/* General purpose instrumentation handling */
Instrumentation *
@@ -139,12 +164,17 @@ InstrStartNode(NodeInstrumentation * instr)
!INSTR_TIME_SET_CURRENT_LAZY(instr->starttime))
elog(ERROR, "InstrStartNode called twice in a row");
- /* save buffer usage totals at node entry, if needed */
- if (instr->need_bufusage)
- instr->bufusage_start = pgBufferUsage;
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /*
+ * Ensure that we have an active pgInstrStack (InstrStartQuery must
+ * have been called)
+ */
+ Assert(pgInstrStack != NULL);
- if (instr->need_walusage)
- instr->walusage_start = pgWalUsage;
+ instr->stack.previous = pgInstrStack;
+ pgInstrStack = &instr->stack;
+ }
}
/* Exit from a plan node */
@@ -169,14 +199,12 @@ InstrStopNode(NodeInstrumentation * instr, double nTuples)
INSTR_TIME_SET_ZERO(instr->starttime);
}
- /* Add delta of buffer usage since entry to node's totals */
- if (instr->need_bufusage)
- BufferUsageAccumDiff(&instr->bufusage,
- &pgBufferUsage, &instr->bufusage_start);
-
- if (instr->need_walusage)
- WalUsageAccumDiff(&instr->walusage,
- &pgWalUsage, &instr->walusage_start);
+ if (instr->need_bufusage || instr->need_walusage)
+ {
+ /* Ensure that there is a stack entry above the top-most node */
+ Assert(instr->stack.previous != NULL);
+ pgInstrStack = instr->stack.previous;
+ }
/* Is this the first tuple of this cycle? */
if (!instr->running)
@@ -257,10 +285,65 @@ InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add)
/* Add delta of buffer usage since entry to node's totals */
if (dst->need_bufusage)
- BufferUsageAdd(&dst->bufusage, &add->bufusage);
+ BufferUsageAdd(&dst->stack.bufusage, &add->stack.bufusage);
if (dst->need_walusage)
+ WalUsageAdd(&dst->stack.walusage, &add->stack.walusage);
+}
+
+InstrStackResource *
+InstrStartQuery()
+{
+ InstrStack *usage = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStack));
+ InstrStackResource *usageRes = MemoryContextAllocZero(CurTransactionContext, sizeof(InstrStackResource));
+ ResourceOwner owner = CurrentResourceOwner;
+
+ Assert(owner != NULL);
+
+ usageRes->owner = owner;
+
+ ResourceOwnerEnlarge(owner);
+ ResourceOwnerRememberInstrStack(owner, usageRes);
+
+ usage->previous = pgInstrStack;
+ pgInstrStack = usage;
+
+ return usageRes;
+}
+
+void
+InstrShutdownQuery(InstrStackResource * res)
+{
+ Assert(res != NULL);
+ Assert(res->owner != NULL);
+
+ pgInstrStack = res->previous;
+
+ ResourceOwnerForgetInstrStack(res->owner, res);
+}
+
+static void
+ResOwnerReleaseInstrStack(Datum res)
+{
+ /*
+ * XXX: Registered resources are *not* called in reverse order, i.e. we'll
+ * get what was first registered first at shutdown. To avoid handling
+ * that, we are resetting the stack here on abort (instead of recovering
+ * to previous).
+ */
+ pgInstrStack = NULL;
+}
+
+void
+InstrNodeAddToCurrent(InstrStack * add)
+{
+ if (pgInstrStack != NULL)
+ {
+ InstrStack *dst = pgInstrStack;
+
+ BufferUsageAdd(&dst->bufusage, &add->bufusage);
WalUsageAdd(&dst->walusage, &add->walusage);
+ }
}
/* note current values during parallel executor startup */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 3a280f4caae..a98efab5f93 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -14,6 +14,7 @@
#define INSTRUMENT_H
#include "portability/instr_time.h"
+#include "utils/resowner.h"
/*
@@ -66,6 +67,21 @@ typedef enum InstrumentOption
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
+/* Stack of WAL/buffer usage used for per-node instrumentation */
+typedef struct InstrStack
+{
+ struct InstrStack *previous;
+ BufferUsage bufusage;
+ WalUsage walusage;
+} InstrStack;
+
+/* Used to manage resetting of instrumentation stack on abort. */
+typedef struct InstrStackResource
+{
+ InstrStack *previous;
+ ResourceOwner owner;
+} InstrStackResource;
+
/*
* General purpose instrumentation that can capture time, WAL/buffer usage and tuples
*
@@ -91,6 +107,10 @@ typedef struct Instrumentation
/*
* Specialized instrumentation for per-node execution statistics
+ *
+ * Requires use of InstrStartQuery to initialize the stack used for WAL/buffer
+ * usage statistics, and cleanup through InstrShutdownQuery. Solely intended for
+ * the executor and anyone reporting about its activities (e.g. EXPLAIN ANALYZE).
*/
typedef struct NodeInstrumentation
{
@@ -105,8 +125,6 @@ typedef struct NodeInstrumentation
instr_time counter; /* accumulated runtime for this node */
double firsttuple; /* time for first tuple of this cycle */
double tuplecount; /* # of tuples emitted so far this cycle */
- BufferUsage bufusage_start; /* buffer usage at start */
- WalUsage walusage_start; /* WAL usage at start */
/* Accumulated statistics across all completed cycles: */
double startup; /* total startup time (in seconds) */
double total; /* total time (in seconds) */
@@ -115,8 +133,7 @@ typedef struct NodeInstrumentation
double nloops; /* # of run cycles for this node */
double nfiltered1; /* # of tuples removed by scanqual or joinqual */
double nfiltered2; /* # of tuples removed by "other" quals */
- BufferUsage bufusage; /* total buffer usage */
- WalUsage walusage; /* total WAL usage */
+ InstrStack stack; /* stack tracking buffer/WAL usage */
} NodeInstrumentation;
typedef struct WorkerInstrumentation
@@ -127,6 +144,7 @@ typedef struct WorkerInstrumentation
extern PGDLLIMPORT BufferUsage pgBufferUsage;
extern PGDLLIMPORT WalUsage pgWalUsage;
+extern PGDLLIMPORT InstrStack * pgInstrStack;
extern Instrumentation *InstrAlloc(int n, int instrument_options);
extern void InstrStart(Instrumentation *instr);
@@ -135,11 +153,14 @@ extern void InstrStop(Instrumentation *instr, double nTuples);
extern NodeInstrumentation * InstrAllocNode(int n, int instrument_options,
bool async_mode);
extern void InstrInitNode(NodeInstrumentation * instr, int instrument_options);
+extern InstrStackResource * InstrStartQuery(void);
+extern void InstrShutdownQuery(InstrStackResource * res);
extern void InstrStartNode(NodeInstrumentation * instr);
extern void InstrStopNode(NodeInstrumentation * instr, double nTuples);
extern void InstrUpdateTupleCount(NodeInstrumentation * instr, double nTuples);
extern void InstrEndLoop(NodeInstrumentation * instr);
extern void InstrAggNode(NodeInstrumentation * dst, NodeInstrumentation * add);
+extern void InstrNodeAddToCurrent(InstrStack * stack);
extern void InstrStartParallelQuery(void);
extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage);
@@ -151,21 +172,34 @@ extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add,
#define INSTR_BUFUSAGE_INCR(fld) do { \
pgBufferUsage.fld++; \
+ if (pgInstrStack) \
+ pgInstrStack->bufusage.fld++; \
} while(0)
#define INSTR_BUFUSAGE_ADD(fld,val) do { \
pgBufferUsage.fld += val; \
+ if (pgInstrStack) \
+ pgInstrStack->bufusage.fld += val; \
} while(0)
#define INSTR_BUFUSAGE_TIME_ADD(fld,val) do { \
INSTR_TIME_ADD(pgBufferUsage.fld, val); \
+ if (pgInstrStack) \
+ INSTR_TIME_ADD(pgInstrStack->bufusage.fld, val); \
} while (0)
#define INSTR_BUFUSAGE_TIME_ACCUM_DIFF(fld,endval,startval) do { \
INSTR_TIME_ACCUM_DIFF(pgBufferUsage.fld, endval, startval); \
+ if (pgInstrStack) \
+ INSTR_TIME_ACCUM_DIFF(pgInstrStack->bufusage.fld, endval, startval); \
} while (0)
+
#define INSTR_WALUSAGE_INCR(fld) do { \
pgWalUsage.fld++; \
+ if (pgInstrStack) \
+ pgInstrStack->walusage.fld++; \
} while(0)
#define INSTR_WALUSAGE_ADD(fld,val) do { \
pgWalUsage.fld += val; \
+ if (pgInstrStack) \
+ pgInstrStack->walusage.fld += val; \
} while(0)
#endif /* INSTRUMENT_H */
--
2.47.1
view thread (42+ messages) latest in thread
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected], [email protected], [email protected]
Subject: Re: Stack-based tracking of per-node WAL/buffer usage
In-Reply-To: <CAP53PkzdBK8VJ1fS4AZ481LgMN8f9mJiC39ZRHqkFUSYq6KWmg@mail.gmail.com>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox