From 42dc8cdda3346bedb7839ceda44d61db4e21f72d Mon Sep 17 00:00:00 2001 From: Lukas Fittl Date: Sat, 7 Mar 2026 11:46:19 -0800 Subject: [PATCH v8 7/8] Index scans: Show table buffer accesses separately in EXPLAIN ANALYZE This sets up a separate instrumentation stack that is used whilst an Index Scan does scanning on the table, for example due to additional data being needed. EXPLAIN ANALYZE will now show "Table Buffers" that represent such activity. The activity is also included in regular "Buffers" together with index activity and that of any child nodes. Author: Lukas Fittl Suggested-by: Andres Freund Reviewed-by: Zsolt Parragi Discussion: https://www.postgresql.org/message-id/flat/CAP53PkxrmpECzVFpeeEEHDGe6u625s%2BYkmVv5-gw3L_NDSfbiA%40mail.gmail.com#cb583a08e8e096aa1f093bb178906173 --- doc/src/sgml/perform.sgml | 13 ++- doc/src/sgml/ref/explain.sgml | 1 + src/backend/commands/explain.c | 44 ++++++++-- src/backend/executor/execProcnode.c | 35 ++++++++ src/backend/executor/nodeIndexscan.c | 110 +++++++++++++++++++------ src/include/executor/instrument_node.h | 6 ++ src/include/nodes/execnodes.h | 7 ++ 7 files changed, 182 insertions(+), 34 deletions(-) diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5f6f1db0467..9219625faf6 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -734,6 +734,7 @@ WHERE t1.unique1 < 10 AND t1.unique2 = t2.unique2; -> Index Scan using tenk2_unique2 on tenk2 t2 (cost=0.29..7.90 rows=1 width=244) (actual time=0.003..0.003 rows=1.00 loops=10) Index Cond: (unique2 = t1.unique2) Index Searches: 10 + Table Buffers: shared hit=10 Buffers: shared hit=24 read=6 Planning: Buffers: shared hit=15 dirtied=9 @@ -949,7 +950,8 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; Index Cond: (f1 @> '((0.5,2))'::polygon) Rows Removed by Index Recheck: 1 Index Searches: 1 - Buffers: shared hit=1 + Table Buffers: shared hit=1 + Buffers: shared hit=2 Planning Time: 0.039 ms Execution Time: 0.098 ms @@ -958,7 +960,9 @@ EXPLAIN ANALYZE SELECT * FROM polygon_tbl WHERE f1 @> polygon '(0.5,2.0)'; then rejected by a recheck of the index condition. This happens because a GiST index is lossy for polygon containment tests: it actually returns the rows with polygons that overlap the target, and then we have - to do the exact containment test on those rows. + to do the exact containment test on those rows. The Table Buffers + counts indicate how many operations were performed on the table instead of + the index. This number is included in the Buffers counts. @@ -1147,13 +1151,14 @@ EXPLAIN ANALYZE SELECT * FROM tenk1 WHERE unique1 < 100 AND unique2 > 9000 QUERY PLAN -------------------------------------------------------------------&zwsp;------------------------------------------------------------ Limit (cost=0.29..14.33 rows=2 width=244) (actual time=0.051..0.071 rows=2.00 loops=1) - Buffers: shared hit=16 + Buffers: shared hit=14 -> Index Scan using tenk1_unique2 on tenk1 (cost=0.29..70.50 rows=10 width=244) (actual time=0.051..0.070 rows=2.00 loops=1) Index Cond: (unique2 > 9000) Filter: (unique1 < 100) Rows Removed by Filter: 287 Index Searches: 1 - Buffers: shared hit=16 + Table Buffers: shared hit=11 + Buffers: shared hit=14 Planning Time: 0.077 ms Execution Time: 0.086 ms diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml index 7dee77fd366..912c96f2ff5 100644 --- a/doc/src/sgml/ref/explain.sgml +++ b/doc/src/sgml/ref/explain.sgml @@ -506,6 +506,7 @@ EXPLAIN ANALYZE EXECUTE query(100, 200); -> Index Scan using test_pkey on test (cost=0.29..10.27 rows=99 width=8) (actual time=0.009..0.025 rows=99.00 loops=1) Index Cond: ((id > 100) AND (id < 200)) Index Searches: 1 + Table Buffers: shared hit=1 Buffers: shared hit=4 Planning Time: 0.244 ms Execution Time: 0.073 ms diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index aa76f68bd10..8a641f9d05f 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -143,7 +143,7 @@ static void show_instrumentation_count(const char *qlabel, int which, static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static bool peek_buffer_usage(ExplainState *es, const BufferUsage *usage); -static void show_buffer_usage(ExplainState *es, const BufferUsage *usage); +static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void show_memory_counters(ExplainState *es, const MemoryContextCounters *mem_counters); @@ -603,7 +603,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, } if (bufusage) - show_buffer_usage(es, bufusage); + show_buffer_usage(es, bufusage, NULL); if (mem_counters) show_memory_counters(es, mem_counters); @@ -1020,7 +1020,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) if (es->buffers && peek_buffer_usage(es, &metrics->bufferUsage)) { es->indent++; - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); es->indent--; } } @@ -1034,7 +1034,7 @@ ExplainPrintSerialize(ExplainState *es, SerializeMetrics *metrics) BYTES_TO_KILOBYTES(metrics->bytesSent), es); ExplainPropertyText("Format", format, es); if (es->buffers) - show_buffer_usage(es, &metrics->bufferUsage); + show_buffer_usage(es, &metrics->bufferUsage, NULL); } ExplainCloseGroup("Serialization", "Serialization", true, es); @@ -1962,6 +1962,9 @@ ExplainNode(PlanState *planstate, List *ancestors, show_instrumentation_count("Rows Removed by Filter", 1, planstate, es); show_indexsearches_info(planstate, es); + + if (es->buffers && planstate->instrument) + show_buffer_usage(es, &((IndexScanState *) planstate)->iss_InstrumentTable->instr.bufusage, "Table"); break; case T_IndexOnlyScan: show_scan_qual(((IndexOnlyScan *) plan)->indexqual, @@ -2280,7 +2283,7 @@ ExplainNode(PlanState *planstate, List *ancestors, /* Show buffer/WAL usage */ if (es->buffers && planstate->instrument) - show_buffer_usage(es, &planstate->instrument->instr.bufusage); + show_buffer_usage(es, &planstate->instrument->instr.bufusage, NULL); if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->instr.walusage); @@ -2299,7 +2302,7 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainOpenWorker(n, es); if (es->buffers) - show_buffer_usage(es, &instrument->instr.bufusage); + show_buffer_usage(es, &instrument->instr.bufusage, NULL); if (es->wal) show_wal_usage(es, &instrument->instr.walusage); ExplainCloseWorker(n, es); @@ -4099,7 +4102,7 @@ peek_buffer_usage(ExplainState *es, const BufferUsage *usage) * Show buffer usage details. This better be sync with peek_buffer_usage. */ static void -show_buffer_usage(ExplainState *es, const BufferUsage *usage) +show_buffer_usage(ExplainState *es, const BufferUsage *usage, const char *title) { if (es->format == EXPLAIN_FORMAT_TEXT) { @@ -4124,6 +4127,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared || has_local || has_temp) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "Buffers:"); if (has_shared) @@ -4179,6 +4184,8 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) if (has_shared_timing || has_local_timing || has_temp_timing) { ExplainIndentText(es); + if (title) + appendStringInfo(es->str, "%s ", title); appendStringInfoString(es->str, "I/O Timings:"); if (has_shared_timing) @@ -4220,6 +4227,14 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) } else { + char *buffers_title = NULL; + + if (title) + { + buffers_title = psprintf("%s Buffers", title); + ExplainOpenGroup(buffers_title, buffers_title, true, es); + } + ExplainPropertyInteger("Shared Hit Blocks", NULL, usage->shared_blks_hit, es); ExplainPropertyInteger("Shared Read Blocks", NULL, @@ -4240,8 +4255,20 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) usage->temp_blks_read, es); ExplainPropertyInteger("Temp Written Blocks", NULL, usage->temp_blks_written, es); + + if (buffers_title) + ExplainCloseGroup(buffers_title, buffers_title, true, es); + if (track_io_timing) { + char *timings_title = NULL; + + if (title) + { + timings_title = psprintf("%s I/O Timings", title); + ExplainOpenGroup(timings_title, timings_title, true, es); + } + ExplainPropertyFloat("Shared I/O Read Time", "ms", INSTR_TIME_GET_MILLISEC(usage->shared_blk_read_time), 3, es); @@ -4260,6 +4287,9 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage) ExplainPropertyFloat("Temp I/O Write Time", "ms", INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time), 3, es); + + if (timings_title) + ExplainCloseGroup(timings_title, timings_title, true, es); } } } diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index 6a74ca516ae..5e476939edf 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -414,9 +414,24 @@ ExecInitNode(Plan *node, EState *estate, int eflags) /* Set up instrumentation for this node if requested */ if (estate->es_instrument) + { result->instrument = InstrAllocNode(estate->es_instrument, result->async_capable); + /* IndexScan tracks table access separately from index access. */ + if (IsA(result, IndexScanState) && (estate->es_instrument & INSTRUMENT_BUFFERS) != 0) + { + IndexScanState *iss = castNode(IndexScanState, result); + + /* + * We intentionally don't collect timing here (even if enabled), + * since we don't need it, and IndexNext calls InstrPushStack / + * InstrPopStack (instead of InstrNode*) to reduce overhead. + */ + iss->iss_InstrumentTable = InstrAllocNode(INSTRUMENT_BUFFERS, false); + } + } + return result; } @@ -836,8 +851,19 @@ ExecRememberNodeInstrumentation_walker(PlanState *node, void *context) return false; if (node->instrument) + { InstrQueryRememberNode(parent, node->instrument); + /* IndexScan has a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTable) + InstrQueryRememberNode(parent, iss->iss_InstrumentTable); + } + } + return planstate_tree_walker(node, ExecRememberNodeInstrumentation_walker, context); } @@ -879,6 +905,15 @@ ExecFinalizeNodeInstrumentation_walker(PlanState *node, void *context) if (!node->instrument) return false; + /* IndexScan has a separate entry to track table access */ + if (IsA(node, IndexScanState)) + { + IndexScanState *iss = castNode(IndexScanState, node); + + if (iss->iss_InstrumentTable) + iss->iss_InstrumentTable = InstrFinalizeNode(iss->iss_InstrumentTable, &node->instrument->instr); + } + node->instrument = InstrFinalizeNode(node->instrument, parent); return false; diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 4513b1f7a90..3bd35392168 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -83,7 +83,9 @@ IndexNext(IndexScanState *node) ExprContext *econtext; ScanDirection direction; IndexScanDesc scandesc; + ItemPointer tid; TupleTableSlot *slot; + bool found; /* * extract necessary information from index scan node @@ -128,8 +130,24 @@ IndexNext(IndexScanState *node) /* * ok, now that we have what we need, fetch the next tuple. */ - while (index_getnext_slot(scandesc, direction, slot)) + while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { + if (node->iss_InstrumentTable) + InstrPushStack(&node->iss_InstrumentTable->instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (node->iss_InstrumentTable) + InstrPopStack(&node->iss_InstrumentTable->instr); + + if (unlikely(!found)) + continue; + CHECK_FOR_INTERRUPTS(); /* @@ -257,36 +275,67 @@ IndexNextWithReorder(IndexScanState *node) } /* - * Fetch next tuple from the index. + * Fetch next valid tuple from the index. */ -next_indextuple: - if (!index_getnext_slot(scandesc, ForwardScanDirection, slot)) + for (;;) { + ItemPointer tid; + bool found; + + /* Time to fetch the next TID from the index */ + tid = index_getnext_tid(scandesc, ForwardScanDirection); + + /* If we're out of index entries, we're done */ + if (tid == NULL) + { + /* + * No more tuples from the index. But we still need to drain + * any remaining tuples from the queue before we're done. + */ + node->iss_ReachedEnd = true; + break; + } + + Assert(ItemPointerEquals(tid, &scandesc->xs_heaptid)); + + if (node->iss_InstrumentTable) + InstrPushStack(&node->iss_InstrumentTable->instr); + + for (;;) + { + found = index_fetch_heap(scandesc, slot); + if (found || !scandesc->xs_heap_continue) + break; + } + + if (node->iss_InstrumentTable) + InstrPopStack(&node->iss_InstrumentTable->instr); + /* - * No more tuples from the index. But we still need to drain any - * remaining tuples from the queue before we're done. + * If the index was lossy, we have to recheck the index quals and + * ORDER BY expressions using the fetched tuple. */ - node->iss_ReachedEnd = true; - continue; - } - - /* - * If the index was lossy, we have to recheck the index quals and - * ORDER BY expressions using the fetched tuple. - */ - if (scandesc->xs_recheck) - { - econtext->ecxt_scantuple = slot; - if (!ExecQualAndReset(node->indexqualorig, econtext)) + if (found && scandesc->xs_recheck) { - /* Fails recheck, so drop it and loop back for another */ - InstrCountFiltered2(node, 1); - /* allow this loop to be cancellable */ - CHECK_FOR_INTERRUPTS(); - goto next_indextuple; + econtext->ecxt_scantuple = slot; + if (!ExecQualAndReset(node->indexqualorig, econtext)) + { + /* Fails recheck, so drop it and loop back for another */ + InstrCountFiltered2(node, 1); + /* allow this loop to be cancellable */ + CHECK_FOR_INTERRUPTS(); + continue; + } } + + if (found) + break; } + /* No more index entries, re-run to clear the reorder queue */ + if (node->iss_ReachedEnd) + continue; + if (scandesc->xs_recheckorderby) { econtext->ecxt_scantuple = slot; @@ -812,6 +861,11 @@ ExecEndIndexScan(IndexScanState *node) * which will have a new IndexOnlyScanState and zeroed stats. */ winstrument->nsearches += node->iss_Instrument.nsearches; + if (node->iss_InstrumentTable) + { + BufferUsageAdd(&winstrument->worker_table_bufusage, &node->iss_InstrumentTable->instr.bufusage); + WalUsageAdd(&winstrument->worker_table_walusage, &node->iss_InstrumentTable->instr.walusage); + } } /* @@ -1819,4 +1873,14 @@ ExecIndexScanRetrieveInstrumentation(IndexScanState *node) SharedInfo->num_workers * sizeof(IndexScanInstrumentation); node->iss_SharedInfo = palloc(size); memcpy(node->iss_SharedInfo, SharedInfo, size); + + /* Aggregate workers' table buffer/WAL usage into leader's entry */ + if (node->iss_InstrumentTable) + for (int i = 0; i < node->iss_SharedInfo->num_workers; i++) + { + BufferUsageAdd(&node->iss_InstrumentTable->instr.bufusage, + &node->iss_SharedInfo->winstrument[i].worker_table_bufusage); + WalUsageAdd(&node->iss_InstrumentTable->instr.walusage, + &node->iss_SharedInfo->winstrument[i].worker_table_walusage); + } } diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h index 8847d7f94fa..170b6143ef6 100644 --- a/src/include/executor/instrument_node.h +++ b/src/include/executor/instrument_node.h @@ -18,6 +18,8 @@ #ifndef INSTRUMENT_NODE_H #define INSTRUMENT_NODE_H +#include "executor/instrument.h" + /* --------------------- * Instrumentation information for aggregate function execution @@ -48,6 +50,10 @@ typedef struct IndexScanInstrumentation { /* Index search count (incremented with pgstat_count_index_scan call) */ uint64 nsearches; + + /* Used for passing iss_InstrumentTableStack data from parallel workers */ + BufferUsage worker_table_bufusage; + WalUsage worker_table_walusage; } IndexScanInstrumentation; /* diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 605c7a6cc39..c778641c13d 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -1728,6 +1728,13 @@ typedef struct IndexScanState IndexScanInstrumentation iss_Instrument; SharedIndexScanInstrumentation *iss_SharedInfo; + /* + * Instrumentation utilized for tracking table access. This is separate + * from iss_Instrument since it needs to be allocated in the right context + * and IndexScanInstrumentation shouldn't contain pointers. + */ + NodeInstrumentation *iss_InstrumentTable; + /* These are needed for re-checking ORDER BY expr ordering */ pairingheap *iss_ReorderQueue; bool iss_ReachedEnd; -- 2.47.1