From f772043e2104bf67964418dc80c3abb56bdb069d Mon Sep 17 00:00:00 2001
From: Amit Langote
Date: Thu, 29 Jan 2026 00:57:04 +0900
Subject: [PATCH v5 1/5] Add batch table AM API and heapam implementation

Introduce new table AM callbacks to fetch multiple tuples per call. This
reduces per-tuple call overhead by letting executor nodes work in batches.

Define the batch callbacks and their table_scan_*_batch() wrappers in
tableam.h, and the heap-specific HeapBatch structure in heapam.h. Batches
are limited to tuples from a single page and at most EXEC_BATCH_ROWS
(currently 64) entries.

Provide initial heapam support with heapgettup_pagemode_batch(). No
executor node is switched over yet; a later commit will adapt SeqScan to
use this API. Other nodes may adopt it in the future.

Also add pgstat_count_heap_getnext_batch() to record batched fetches in
pgstat.

Reviewed-by: Daniil Davydov <3danissimo@gmail.com>
Reviewed-by: ChangAo Chen <2624345507@qq.com>
Discussion: https://postgr.es/m/CA+HiwqFfAY_ZFqN8wcAEMw71T9hM_kA8UtyHaZZEZtuT3UyogA@mail.gmail.com
---
 src/backend/access/heap/heapam.c         | 221 +++++++++++++++++++++++
 src/backend/access/heap/heapam_handler.c |   4 +
 src/include/access/heapam.h              |  18 ++
 src/include/access/tableam.h             |  58 ++++++
 src/include/pgstat.h                     |   5 +
 5 files changed, 306 insertions(+)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f30a56ecf55..d8d1bdf5191 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1151,6 +1151,134 @@ continue_page:
 	scan->rs_inited = false;
 }
 
+/*
+ * heapgettup_pagemode_batch
+ *		Collect up to 'maxitems' visible tuples from a single page in page mode.
+ *
+ * This function returns a *batch* of tuples from one heap page. If the
+ * current page (as tracked by the scan desc) has no more tuples left,
+ * it will advance to the next page and prepare it (via heap_prepare_pagescan).
+ * It will not cross a page boundary while filling the batch.
+ *
+ * Return value:
+ *   number of tuples written into 'tdata' (0 at end-of-scan).
+ *
+ * Side effects:
+ *   - Ensures rs_cbuf pins the page from which tuples were produced.
+ *   - Sets rs_cblock, rs_cindex, rs_ntuples consistently (same as
+ *     heapgettup_pagemode's inner-loop effects).
+ *   - Does *not* change buffer pin counts except through normal page
+ *     transitions performed by heap_fetch_next_buffer().
+ */
+static int
+heapgettup_pagemode_batch(HeapScanDesc scan,
+						  ScanDirection dir,
+						  int nkeys, ScanKey key,
+						  HeapTupleData *tdata,
+						  int maxitems)
+{
+	Page		page;
+	uint32		lineindex;		/* next rs_vistuples[] entry to visit */
+	uint32		linesleft;		/* entries not yet visited on this page */
+	int			nout = 0;		/* tuples stored into tdata[] so far */
+	Relation	rel = scan->rs_base.rs_rd;
+	TupleDesc	tupdesc = RelationGetDescr(rel);
+
+	/*
+	 * Current batching limitations (may be relaxed in future):
+	 *
+	 * - Forward scans only: backward scan support would require changes to
+	 *   batch iteration and page advancement logic.
+	 *
+	 * - Pagemode required: batching relies on the pre-built rs_vistuples[]
+	 *   array from heap_prepare_pagescan(). This is guaranteed by
+	 *   ScanCanUseBatching() which only enables batching when SO_ALLOW_PAGEMODE
+	 *   is set. Unlike heap_getnextslot, we don't support dynamic fallback to
+	 *   tuple-at-a-time mode since the batch execution path is selected at
+	 *   ExecInit time.
+	 */
+	Assert(ScanDirectionIsForward(dir));
+	Assert(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE);
+	Assert(maxitems > 0);
+
+	/*
+	 * Loop until we find tuples that pass the scan key, or reach end of scan.
+	 * We never cross page boundaries within a single batch.
+	 */
+	for (;;)
+	{
+		/*
+		 * Resume after rs_cindex on this page; fetch pages until one has tuples.
+		 */
+		if (BufferIsValid(scan->rs_cbuf))
+		{
+			lineindex = scan->rs_cindex + 1;
+			linesleft = (lineindex <= scan->rs_ntuples) ?
+				(scan->rs_ntuples - lineindex) : 0;
+		}
+		else
+			linesleft = 0;		/* no current buffer: force a fetch */
+
+		while (linesleft == 0)
+		{
+			heap_fetch_next_buffer(scan, dir);
+
+			if (!BufferIsValid(scan->rs_cbuf))
+			{
+				/* End of scan */
+				scan->rs_cblock = InvalidBlockNumber;
+				scan->rs_prefetch_block = InvalidBlockNumber;
+				scan->rs_inited = false;
+				return 0;
+			}
+
+			Assert(BufferGetBlockNumber(scan->rs_cbuf) == scan->rs_cblock);
+			heap_prepare_pagescan((TableScanDesc) scan);
+
+			lineindex = 0;
+			linesleft = scan->rs_ntuples;
+		}
+
+		/*
+		 * Walk rs_vistuples[], filling tdata[nout] in place until the page is
+		 * exhausted or the batch is full; key-test failures reuse the slot.
+		 */
+		page = BufferGetPage(scan->rs_cbuf);
+
+		for (; linesleft > 0 && nout < maxitems; linesleft--, lineindex++)
+		{
+			OffsetNumber lineoff;
+			ItemId		lpp;
+			HeapTupleData *dst = &tdata[nout];
+
+			Assert(lineindex < scan->rs_ntuples);
+			lineoff = scan->rs_vistuples[lineindex];
+			lpp = PageGetItemId(page, lineoff);
+			Assert(ItemIdIsNormal(lpp));
+
+			dst->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
+			dst->t_len = ItemIdGetLength(lpp);
+			Assert(dst->t_tableOid == RelationGetRelid(rel));
+			ItemPointerSet(&(dst->t_self), scan->rs_cblock, lineoff);
+
+			if (key != NULL && !HeapKeyTest(dst, tupdesc, nkeys, key))
+				continue;
+
+			scan->rs_cindex = lineindex;	/* resume point for next call */
+			nout++;
+		}
+
+		/* Return if we found any tuples; otherwise try next page */
+		if (nout > 0)
+			return nout;
+
+		/* Mark page exhausted so we advance on next iteration */
+		scan->rs_cindex = scan->rs_ntuples;
+	}
+
+	pg_unreachable();
+	return 0;
+}
 
 /* ----------------------------------------------------------------
  *					 heap access method interface
@@ -1483,6 +1611,99 @@ heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *s
 	return true;
 }
 
+/*---------- Batching support -----------*/
+
+/*
+ * heap_begin_batch
+ *
+ * Allocate a HeapBatch with space for 'maxitems' tuple headers. No pin is
+ * taken here. Memory is palloc'd in the caller's current memory context.
+ */
+void *
+heap_begin_batch(TableScanDesc sscan, int maxitems)
+{
+	HeapBatch  *hb;
+	Oid			relid;
+	Size		alloc_size;
+
+	Assert(maxitems > 0);
+
+	/* Single allocation: HeapBatch header, then the tupdata[] array */
+	alloc_size = sizeof(HeapBatch) + sizeof(HeapTupleData) * maxitems;
+	hb = palloc(alloc_size);
+	hb->tupdata = (HeapTupleData *) ((char *) hb + sizeof(HeapBatch));
+	hb->maxitems = maxitems;
+	hb->nitems = 0;
+	hb->buf = InvalidBuffer;
+
+	/* Set t_tableOid once; the other header fields are set per batch. */
+	relid = RelationGetRelid(sscan->rs_rd);
+	for (int i = 0; i < maxitems; i++)
+		hb->tupdata[i].t_tableOid = relid;
+
+	return hb;
+}
+
+/*
+ * heap_end_batch
+ *
+ * Release any outstanding pin and free the batch allocation. Caller must
+ * not use 'am_batch' after this point.
+ */
+void
+heap_end_batch(TableScanDesc sscan, void *am_batch)
+{
+	HeapBatch  *hb = (HeapBatch *) am_batch;
+
+	if (BufferIsValid(hb->buf))
+		ReleaseBuffer(hb->buf);
+
+	pfree(hb);
+}
+
+int
+heap_getnextbatch(TableScanDesc sscan, void *am_batch, ScanDirection dir)
+{
+	HeapScanDesc scan = (HeapScanDesc) sscan;
+	HeapBatch  *hb = (HeapBatch *) am_batch;
+	Buffer		curbuf;
+	int			n;
+
+	Assert(ScanDirectionIsForward(dir));
+	Assert(sscan->rs_flags & SO_ALLOW_PAGEMODE);
+	Assert(hb->maxitems > 0);
+
+	/* Drop the previous batch's pin, if any, before refilling the batch. */
+	if (BufferIsValid(hb->buf))
+	{
+		ReleaseBuffer(hb->buf);
+		hb->buf = InvalidBuffer;
+	}
+
+	hb->nitems = 0;
+
+	/* Fill tupdata[] from one page; returns 0 only at end of scan. */
+	n = heapgettup_pagemode_batch(scan, dir,
+								  sscan->rs_nkeys, sscan->rs_key,
+								  hb->tupdata, hb->maxitems);
+
+	if (n == 0)
+		return 0;				/* end of scan */
+
+	/* Take an extra pin for the batch lifetime so t_data stays valid. */
+	curbuf = scan->rs_cbuf;
+	IncrBufferRefCount(curbuf);
+	hb->buf = curbuf;
+
+	/* Count all n tuples as returned with a single stats update. */
+	pgstat_count_heap_getnext_batch(sscan->rs_rd, n);
+
+	hb->nitems = n;
+	return n;
+}
+
+/*----- End of batching support -----*/
+
 void
 heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
 				  ItemPointer maxtid)
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index cbef73e5d4b..e4cf7fc296b 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2637,6 +2637,10 @@ static const TableAmRoutine heapam_methods = {
 	.scan_rescan = heap_rescan,
 	.scan_getnextslot = heap_getnextslot,
 
+	.scan_begin_batch = heap_begin_batch,
+	.scan_getnextbatch = heap_getnextbatch,
+	.scan_end_batch = heap_end_batch,
+
 	.scan_set_tidrange = heap_set_tidrange,
 	.scan_getnextslot_tidrange = heap_getnextslot_tidrange,
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 3c0961ab36b..e2417650c5f 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -101,6 +101,19 @@ typedef struct HeapScanDescData
 } HeapScanDescData;
 typedef struct HeapScanDescData *HeapScanDesc;
 
+/*
+ * HeapBatch -- heap AM's batch payload, reused for every getnextbatch call.
+ * A filled batch holds one extra pin on a single page and exposes up to
+ * maxitems HeapTupleData headers whose t_data point into that page.
+ */
+typedef struct HeapBatch
+{
+	HeapTupleData *tupdata;		/* maxitems headers, stored after this struct */
+	int			nitems;			/* tuples produced in last getnextbatch() */
+	int			maxitems;		/* fixed capacity set at begin_batch() */
+	Buffer		buf;			/* extra pin for this batch, or InvalidBuffer */
+} HeapBatch;
+
 typedef struct BitmapHeapScanDescData
 {
 	HeapScanDescData rs_heap_base;
@@ -337,6 +350,11 @@ extern void heap_endscan(TableScanDesc sscan);
 extern HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction);
 extern bool heap_getnextslot(TableScanDesc sscan,
 							 ScanDirection direction, TupleTableSlot *slot);
+
+extern void *heap_begin_batch(TableScanDesc sscan, int maxitems);
+extern void heap_end_batch(TableScanDesc sscan, void *am_batch);
+extern int	heap_getnextbatch(TableScanDesc sscan, void *am_batch, ScanDirection dir);
+
 extern void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
 							  ItemPointer maxtid);
 extern bool heap_getnextslot_tidrange(TableScanDesc sscan,
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index e2ec5289d4d..584b580f7a1 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -351,6 +351,16 @@ typedef struct TableAmRoutine
 									 ScanDirection direction,
 									 TupleTableSlot *slot);
 
+	/*
+	 * Batched scans: scan_begin_batch returns an AM-private batch object,
+	 * scan_getnextbatch fills it and returns the tuple count (0 at end of
+	 * scan), scan_end_batch releases it.
+	 */
+
+	void	   *(*scan_begin_batch)(TableScanDesc sscan, int maxitems);
+	int			(*scan_getnextbatch)(TableScanDesc sscan, void *am_batch,
+									 ScanDirection dir);
+	void		(*scan_end_batch)(TableScanDesc sscan, void *am_batch);
+
 	/*-----------
 	 * Optional functions to provide scanning for ranges of ItemPointers.
 	 * Implementations must either provide both of these functions, or neither
@@ -1036,6 +1046,54 @@ table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableS
 	return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot);
 }
 
+/*
+ * table_scan_begin_batch
+ *		Allocate an AM-owned batch payload holding up to 'maxitems' tuples.
+ */
+static inline void *
+table_scan_begin_batch(TableScanDesc sscan, int maxitems)
+{
+	const TableAmRoutine *tam = sscan->rs_rd->rd_tableam;
+
+	Assert(tam->scan_begin_batch != NULL);
+
+	return tam->scan_begin_batch(sscan, maxitems);
+}
+
+/*
+ * table_scan_getnextbatch
+ *		Fill 'am_batch' from the AM; returns the tuple count, 0 at end of scan.
+ *		Batches never span a page; only forward scans are supported for now.
+ */
+static inline int
+table_scan_getnextbatch(TableScanDesc sscan, void *am_batch, ScanDirection dir)
+{
+	const TableAmRoutine *tam = sscan->rs_rd->rd_tableam;
+
+	/* Only forward scans are supported in the batched mode. */
+	Assert(ScanDirectionIsForward(dir));
+	Assert(tam->scan_getnextbatch != NULL);
+
+	return tam->scan_getnextbatch(sscan, am_batch, dir);
+}
+
+/*
+ * table_scan_end_batch
+ *		Release the AM-owned batch payload; a NULL 'am_batch' is a no-op.
+ */
+static inline void
+table_scan_end_batch(TableScanDesc sscan, void *am_batch)
+{
+	const TableAmRoutine *tam = sscan->rs_rd->rd_tableam;
+
+	if (am_batch == NULL)
+		return;
+
+	Assert(tam->scan_end_batch != NULL);
+
+	tam->scan_end_batch(sscan, am_batch);
+}
+
 /* ----------------------------------------------------------------------------
  * TID Range scanning related functions.
* ---------------------------------------------------------------------------- diff --git a/src/include/pgstat.h b/src/include/pgstat.h index fff7ecc2533..48e4e034a33 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -697,6 +697,11 @@ extern void pgstat_report_analyze(Relation rel, if (pgstat_should_count_relation(rel)) \ (rel)->pgstat_info->counts.tuples_returned++; \ } while (0) +#define pgstat_count_heap_getnext_batch(rel, n) \ + do { \ + if (pgstat_should_count_relation(rel)) \ + (rel)->pgstat_info->counts.tuples_returned += n; \ + } while (0) #define pgstat_count_heap_fetch(rel) \ do { \ if (pgstat_should_count_relation(rel)) \ -- 2.47.3