From 0b1f786bf58c3d90e078d4afa83b7d43dda08491 Mon Sep 17 00:00:00 2001
From: Manni Wood <manni.wood@enterprisedb.com>
Date: Fri, 5 Dec 2025 18:30:00 -0600
Subject: [PATCH v4 1/2] Speed up COPY FROM text/CSV parsing using SIMD

Authors: Shinya Kato <shinya11.kato@gmail.com>,
Nazir Bilal Yavuz <byavuz81@gmail.com>,
Ayoub Kazar <ma_kazar@esi.dz>
Reviewers: Andrew Dunstan <andrew@dunslane.net>
Descussion:
https://www.postgresql.org/message-id/flat/CAOzEurSW8cNr6TPKsjrstnPfhf4QyQqB4tnPXGGe8N4e_v7Jig@mail.gmail.com
---
 src/backend/commands/copyfromparse.c | 73 ++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index a09e7fbace3..1edb525f072 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -71,7 +71,9 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/pg_bitutils.h"
 #include "port/pg_bswap.h"
+#include "port/simd.h"
 #include "utils/builtins.h"
 #include "utils/rel.h"
 
@@ -1255,6 +1257,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 	char		quotec = '\0';
 	char		escapec = '\0';
 
+#ifndef USE_NO_SIMD
+	Vector8		nl = vector8_broadcast('\n');
+	Vector8		cr = vector8_broadcast('\r');
+	Vector8		bs = vector8_broadcast('\\');
+	Vector8		quote = vector8_broadcast(0);
+	Vector8		escape = vector8_broadcast(0);
+#endif
+
 	if (is_csv)
 	{
 		quotec = cstate->opts.quote[0];
@@ -1262,6 +1272,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 		/* ignore special escape processing if it's the same as quotec */
 		if (quotec == escapec)
 			escapec = '\0';
+
+#ifndef USE_NO_SIMD
+		quote = vector8_broadcast(quotec);
+		if (quotec != escapec)
+			escape = vector8_broadcast(escapec);
+#endif
 	}
 
 	/*
@@ -1328,6 +1344,63 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
 			need_data = false;
 		}
 
+#ifndef USE_NO_SIMD
+
+		/*
+		 * Use SIMD instructions to efficiently scan the input buffer for
+		 * special characters (e.g., newline, carriage return, quote, and
+		 * escape). This is faster than byte-by-byte iteration, especially on
+		 * large buffers.
+		 *
+		 * We do not apply the SIMD fast path in either of the following
+		 * cases: - When the previously processed character was an escape
+		 * character (last_was_esc), since the next byte must be examined
+		 * sequentially. - The remaining buffer is smaller than one vector
+		 * width (sizeof(Vector8)); SIMD operates on fixed-size chunks.
+		 */
+		if (!last_was_esc && copy_buf_len - input_buf_ptr >= sizeof(Vector8))
+		{
+			Vector8		chunk;
+			Vector8		match = vector8_broadcast(0);
+			uint32		mask;
+
+			/* Load a chunk of data into a vector register */
+			vector8_load(&chunk, (const uint8 *) &copy_input_buf[input_buf_ptr]);
+
+			/* \n and \r are not special inside quotes */
+			if (!in_quote)
+				match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr));
+
+			if (is_csv)
+			{
+				match = vector8_or(match, vector8_eq(chunk, quote));
+				if (escapec != '\0')
+					match = vector8_or(match, vector8_eq(chunk, escape));
+			}
+			else
+				match = vector8_or(match, vector8_eq(chunk, bs));
+
+			/* Check if we found any special characters */
+			mask = vector8_highbit_mask(match);
+			if (mask != 0)
+			{
+				/*
+				 * Found a special character. Advance up to that point and let
+				 * the scalar code handle it.
+				 */
+				int			advance = pg_rightmost_one_pos32(mask);
+
+				input_buf_ptr += advance;
+			}
+			else
+			{
+				/* No special characters found, so skip the entire chunk */
+				input_buf_ptr += sizeof(Vector8);
+				continue;
+			}
+		}
+#endif
+
 		/* OK to fetch a character */
 		prev_raw_ptr = input_buf_ptr;
 		c = copy_input_buf[input_buf_ptr++];
-- 
2.52.0

