From a06abff86337483ddcd4cd2a49ffbc03c30df966 Mon Sep 17 00:00:00 2001 From: Shinya Kato Date: Fri, 6 Mar 2026 16:10:59 +0900 Subject: [PATCH v3 1/2] Fix spurious NULL lag in pg_stat_replication Previously, ProcessStandbyReplyMessage() cleared replication lag times whenever the standby reported fully-applied WAL in two consecutive reply messages. This heuristic was too aggressive: in bursty reply patterns one message could consume all lag tracker samples, and the next message -- arriving before new samples accumulated -- would see no samples and trigger clearing, even though the standby was still actively replaying WAL. Add two additional conditions before clearing lag times: (1) all three LagTrackerRead() calls must return -1, indicating no new lag samples, and (2) write/flush/apply positions must be unchanged from the previous reply. Together with the existing fully-applied check, this ensures lag is only cleared when the standby is truly idle. Author: Shinya Kato Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CAOzEurTzcUrEzrH97DD7+Yz=HGPU81kzWQonKZvqBwYhx2G9_A@mail.gmail.com --- src/backend/replication/walsender.c | 34 ++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 79fc192b171..e0b2ac29d74 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2457,11 +2457,16 @@ ProcessStandbyReplyMessage(void) TimeOffset writeLag, flushLag, applyLag; - bool clearLagTimes; + bool clearLagTimes, + noLagSamples, + positionsUnchanged; TimestampTz now; TimestampTz replyTime; static bool fullyAppliedLastTime = false; + static XLogRecPtr prevWritePtr = InvalidXLogRecPtr; + static XLogRecPtr prevFlushPtr = InvalidXLogRecPtr; + static XLogRecPtr prevApplyPtr = InvalidXLogRecPtr; /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); @@ -2493,16 +2498,25 @@ ProcessStandbyReplyMessage(void) flushLag = LagTrackerRead(SYNC_REP_WAIT_FLUSH, flushPtr, now); applyLag = LagTrackerRead(SYNC_REP_WAIT_APPLY, applyPtr, now); + /* Precompute inputs for clearLagTimes decision below. */ + noLagSamples = (writeLag == -1 && flushLag == -1 && applyLag == -1); + positionsUnchanged = (writePtr == prevWritePtr && + flushPtr == prevFlushPtr && + applyPtr == prevApplyPtr); + /* - * If the standby reports that it has fully replayed the WAL in two - * consecutive reply messages, then the second such message must result - * from wal_receiver_status_interval expiring on the standby. This is a - * convenient time to forget the lag times measured when it last - * wrote/flushed/applied a WAL record, to avoid displaying stale lag data - * until more WAL traffic arrives. + * If the standby reports that it has fully replayed the WAL, there are + * no new lag samples, and positions remain unchanged across two + * consecutive reply messages, forget the lag times measured when it last + * wrote/flushed/applied a WAL record. This avoids displaying stale lag + * data until more WAL traffic arrives. + * + * The position-unchanged check prevents spuriously clearing lag in + * bursty reply patterns, where one reply consumes all lag tracker + * samples and the next arrives before new samples accumulate. */ clearLagTimes = false; - if (applyPtr == sentPtr) + if (applyPtr == sentPtr && noLagSamples && positionsUnchanged) { if (fullyAppliedLastTime) clearLagTimes = true; @@ -2511,6 +2525,10 @@ ProcessStandbyReplyMessage(void) else fullyAppliedLastTime = false; + prevWritePtr = writePtr; + prevFlushPtr = flushPtr; + prevApplyPtr = applyPtr; + /* Send a reply if the standby requested one. */ if (replyRequested) WalSndKeepalive(false, InvalidXLogRecPtr); -- 2.47.3