From 92ac4ada1e4833f81ce30164b48868dc1ade102f Mon Sep 17 00:00:00 2001
From: Manni Wood <manni.wood@enterprisedb.com>
Date: Fri, 5 Dec 2025 18:33:46 -0600
Subject: [PATCH v4.2 2/3] Speed up COPY FROM text/CSV parsing using SIMD

Authors: Shinya Kato <shinya11.kato@gmail.com>,
Nazir Bilal Yavuz <byavuz81@gmail.com>,
Ayoub Kazar <ma_kazar@esi.dz>
Reviewers: Andrew Dunstan <andrew@dunslane.net>
Discussion:
https://www.postgresql.org/message-id/flat/CAOzEurSW8cNr6TPKsjrstnPfhf4QyQqB4tnPXGGe8N4e_v7Jig@mail.gmail.com
---
 src/include/commands/copyfrom_internal.h | 11 +++++++++
 src/backend/commands/copyfrom.c          |  3 +++
 src/backend/commands/copyfromparse.c     | 29 +++++++++++++++++++++++-
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index c8b22af22d8..215215f909f 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -181,6 +181,17 @@ typedef struct CopyFromStateData
 #define RAW_BUF_BYTES(cstate) ((cstate)->raw_buf_len - (cstate)->raw_buf_index)
 
 	uint64		bytes_processed;	/* number of bytes processed so far */
+
+	/* the number of bytes to read before checking whether we should try SIMD */
+#define BYTES_PROCESSED_UNTIL_SIMD_CHECK 100000
+	/* the number of special chars read below which we use simd */
+#define SPECIAL_CHAR_SIMD_THRESHOLD 20000
+	uint64		special_chars_encountered;	/* number of special chars
+											 * encountered so far */
+	bool		checked_simd;	/* we read BYTES_PROCESSED_UNTIL_SIMD_CHECK
+								 * and checked if we should use SIMD on the
+								 * rest of the file */
+	bool		use_simd;		/* use simd to speed up copying */
 } CopyFromStateData;
 
 extern void ReceiveCopyBegin(CopyFromState cstate);
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 2ae3d2ba86e..6711c0cfcdd 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -1720,6 +1720,9 @@ BeginCopyFrom(ParseState *pstate,
 	cstate->cur_attname = NULL;
 	cstate->cur_attval = NULL;
 	cstate->relname_only = false;
+	cstate->special_chars_encountered = 0;
+	cstate->checked_simd = false;
+	cstate->use_simd = false;
 
 	/*
 	 * Allocate buffers for the input pipeline.
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 673d6683a72..d548674c8ff 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -1346,6 +1346,28 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 
 #ifndef USE_NO_SIMD
 
+		/*
+		 * Wait until we have read more than BYTES_PROCESSED_UNTIL_SIMD_CHECK.
+		 * cstate->bytes_processed will grow an unpredictable amount with each
+		 * call to this function, so just wait until we have crossed the
+		 * threshold.
+		 */
+		if (!cstate->checked_simd && cstate->bytes_processed > BYTES_PROCESSED_UNTIL_SIMD_CHECK)
+		{
+			cstate->checked_simd = true;
+
+			/*
+			 * If we have not read too many special characters
+			 * (SPECIAL_CHAR_SIMD_THRESHOLD) then start using SIMD to speed up
+			 * processing. This heuristic assumes that input does not vary too
+			 * much from line to line and that the special characters seen in
+			 * the first BYTES_PROCESSED_UNTIL_SIMD_CHECK bytes are indicative
+			 * of the whole file.
+			 */
+			if (cstate->special_chars_encountered < SPECIAL_CHAR_SIMD_THRESHOLD)
+				cstate->use_simd = true;
+		}
+
 		/*
 		 * Use SIMD instructions to efficiently scan the input buffer for
 		 * special characters (e.g., newline, carriage return, quote, and
@@ -1358,7 +1380,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		 * sequentially. - The remaining buffer is smaller than one vector
 		 * width (sizeof(Vector8)); SIMD operates on fixed-size chunks.
 		 */
-		if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
+		if (cstate->use_simd && !last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
 		{
 			Vector8		chunk;
 			Vector8		match = vector8_broadcast(0);
@@ -1415,6 +1437,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 			 */
 			if (c == '\r')
 			{
+				cstate->special_chars_encountered++;
 				IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			}
 
@@ -1446,6 +1469,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* Process \r */
 		if (c == '\r' && (!is_csv || !in_quote))
 		{
+			cstate->special_chars_encountered++;
 			/* Check for \r\n on first line, _and_ handle \r\n. */
 			if (cstate->eol_type == EOL_UNKNOWN ||
 				cstate->eol_type == EOL_CRNL)
@@ -1502,6 +1526,7 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* Process \n */
 		if (c == '\n' && (!is_csv || !in_quote))
 		{
+			cstate->special_chars_encountered++;
 			if (cstate->eol_type == EOL_CR || cstate->eol_type == EOL_CRNL)
 				ereport(ERROR,
 						(errcode(ERRCODE_BAD_COPY_FILE_FORMAT),
@@ -1524,6 +1549,8 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		{
 			char		c2;
 
+			cstate->special_chars_encountered++;
+
 			IF_NEED_REFILL_AND_NOT_EOF_CONTINUE(0);
 			IF_NEED_REFILL_AND_EOF_BREAK(0);
 
-- 
2.51.0

