public inbox for [email protected]
help / color / mirror / Atom feed
From: Nazir Bilal Yavuz <[email protected]>
To: Nathan Bossart <[email protected]>
Cc: Manni Wood <[email protected]>
Cc: KAZAR Ayoub <[email protected]>
Cc: Neil Conway <[email protected]>
Cc: Andrew Dunstan <[email protected]>
Cc: Shinya Kato <[email protected]>
Cc: PostgreSQL-development <[email protected]>
Subject: Re: Speed up COPY FROM text/CSV parsing using SIMD
Date: Mon, 23 Feb 2026 12:10:44 +0300
Message-ID: <CAN55FZ3cBN_TncLVWyXAKm-KfewguN1AUjyRhoR6zL_QCxHh7A@mail.gmail.com> (raw)
In-Reply-To: <aZikzQP6WPJ5Rq2S@nathan>
References: <CAN55FZ3g6QaiC8G4GMjdJ24egvgc-HG_xpoOztxnM_wnQNn5aw@mail.gmail.com>
<aY-vJe_ENCB-fux9@nathan>
<CAN55FZ2OpqRxUUEvgPpHCk2HnY0xZSH1x09fgFGOUyXSv8HcEA@mail.gmail.com>
<aZYudtuBLVb36pZE@nathan>
<CAN55FZ0J5iz9wFJLHcK7yNQqPb10_4ROoZiDu1wBZWSGC_fATg@mail.gmail.com>
<CAKWEB6qY=mU62oAQFAVPCFWvwRuTPKBwxvM2aZ+J7p_9_MBmhQ@mail.gmail.com>
<CAN55FZ2RPMxquXE6TH7dQkhtoiBcOOOZq8EOXj5COHv3ecP_cw@mail.gmail.com>
<CA+K2Ru=fFTUVgEDr-fBed5aOMeDbH9vrOEhapXzHEpBeOxkucg@mail.gmail.com>
<CAKWEB6pq7C0Wv1wT9Y1_c_1fn-+cR8pb210Pj3w2FcEOmNGxbQ@mail.gmail.com>
<CAN55FZ2DT4-k06umn=7NYG+NoM6gnVJVQCCwRrr2qOraO+Jadw@mail.gmail.com>
<aZikzQP6WPJ5Rq2S@nathan>
Hi,
On Fri, 20 Feb 2026 at 21:15, Nathan Bossart <[email protected]> wrote:
>
> Yeah, the couple of small regressions seem close to (or below) the noise
> level, and IIUC yours were the only benchmarks that showed them, anyway.
> Plus, I think we'll need this change regardless as a prerequisite for the
> SIMD work.
>
> > Thank you both for the benchmarks. Results look good to me!
>
> Committed that part.
Thank you! Attaching the SIMD patch only.
--
Regards,
Nazir Bilal Yavuz
Microsoft
Attachments:
[text/x-patch] v10-0001-Speed-up-COPY-FROM-text-CSV-parsing-using-SIMD.patch (7.7K, 2-v10-0001-Speed-up-COPY-FROM-text-CSV-parsing-using-SIMD.patch)
download | inline diff:
From 9ef4e1376657b577cd4b4c42fb6a592ebd5fae24 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <[email protected]>
Date: Fri, 13 Feb 2026 13:28:55 +0300
Subject: [PATCH v10] Speed up COPY FROM text/CSV parsing using SIMD
This patch disables SIMD when SIMD encounters a special character which
is neither EOF nor EOL.
Author: Shinya Kato <[email protected]>
Author: Nazir Bilal Yavuz <[email protected]>
Reviewed-by: Kazar Ayoub <[email protected]>
Reviewed-by: Nathan Bossart <[email protected]>
Reviewed-by: Neil Conway <[email protected]>
Reviewed-by: Andrew Dunstan <[email protected]>
Reviewed-by: Manni Wood <[email protected]>
Reviewed-by: Mark Wong <[email protected]>
Discussion: https://postgr.es/m/CAOzEurSW8cNr6TPKsjrstnPfhf4QyQqB4tnPXGGe8N4e_v7Jig%40mail.gmail.com
---
src/backend/commands/copyfrom.c | 3 +
src/backend/commands/copyfromparse.c | 135 ++++++++++++++++++++++-
src/include/commands/copyfrom_internal.h | 3 +
3 files changed, 137 insertions(+), 4 deletions(-)
diff --git a/src/backend/commands/copyfrom.c b/src/backend/commands/copyfrom.c
index 2b7556b287c..3dd159f15b2 100644
--- a/src/backend/commands/copyfrom.c
+++ b/src/backend/commands/copyfrom.c
@@ -1717,6 +1717,9 @@ BeginCopyFrom(ParseState *pstate,
cstate->cur_attval = NULL;
cstate->relname_only = false;
+ /* Initialize SIMD */
+ cstate->simd_enabled = true;
+
/*
* Allocate buffers for the input pipeline.
*
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 6b00d49c50f..7bdf5681628 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -72,6 +72,7 @@
#include "miscadmin.h"
#include "pgstat.h"
#include "port/pg_bswap.h"
+#include "port/simd.h"
#include "utils/builtins.h"
#include "utils/rel.h"
@@ -142,7 +143,8 @@ static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0";
/* non-export function prototypes */
static bool CopyReadLine(CopyFromState cstate, bool is_csv);
static pg_attribute_always_inline bool CopyReadLineText(CopyFromState cstate,
- bool is_csv);
+ bool is_csv,
+ bool simd_enabled);
static int CopyReadAttributesText(CopyFromState cstate);
static int CopyReadAttributesCSV(CopyFromState cstate);
static Datum CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
@@ -1182,9 +1184,19 @@ CopyReadLine(CopyFromState cstate, bool is_csv)
* specialized code with fewer branches.
*/
if (is_csv)
- result = CopyReadLineText(cstate, true);
+ {
+ if (cstate->simd_enabled)
+ result = CopyReadLineText(cstate, true, true);
+ else
+ result = CopyReadLineText(cstate, true, false);
+ }
else
- result = CopyReadLineText(cstate, false);
+ {
+ if (cstate->simd_enabled)
+ result = CopyReadLineText(cstate, false, true);
+ else
+ result = CopyReadLineText(cstate, false, false);
+ }
if (result)
{
@@ -1252,7 +1264,7 @@ CopyReadLine(CopyFromState cstate, bool is_csv)
* CopyReadLineText - inner loop of CopyReadLine for text mode
*/
static pg_attribute_always_inline bool
-CopyReadLineText(CopyFromState cstate, bool is_csv)
+CopyReadLineText(CopyFromState cstate, bool is_csv, bool simd_enabled)
{
char *copy_input_buf;
int input_buf_ptr;
@@ -1267,6 +1279,14 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
char quotec = '\0';
char escapec = '\0';
+#ifndef USE_NO_SIMD
+ Vector8 nl = vector8_broadcast('\n');
+ Vector8 cr = vector8_broadcast('\r');
+ Vector8 bs = vector8_broadcast('\\');
+ Vector8 quote = vector8_broadcast(0);
+ Vector8 escape = vector8_broadcast(0);
+#endif
+
if (is_csv)
{
quotec = cstate->opts.quote[0];
@@ -1274,6 +1294,12 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
/* ignore special escape processing if it's the same as quotec */
if (quotec == escapec)
escapec = '\0';
+
+#ifndef USE_NO_SIMD
+ quote = vector8_broadcast(quotec);
+ if (quotec != escapec)
+ escape = vector8_broadcast(escapec);
+#endif
}
/*
@@ -1340,6 +1366,107 @@ CopyReadLineText(CopyFromState cstate, bool is_csv)
need_data = false;
}
+#ifndef USE_NO_SIMD
+
+ /*
+ * Use SIMD instructions to efficiently scan the input buffer for
+ * special characters (e.g., newline, carriage return, quote, and
+ * escape). This is faster than byte-by-byte iteration, especially on
+ * large buffers.
+ *
+ * We do not apply the SIMD fast path in either of the following
+ * cases:
+ * - When the previously processed character was an escape character
+ *   (last_was_esc), since the next byte must be examined sequentially.
+ * - When the remaining buffer is no larger than one vector width
+ *   (sizeof(Vector8)), since SIMD operates on fixed-size chunks.
+ *
+ * Note that SIMD may become slower when the input contains many
+ * special characters. To avoid this regression, we disable SIMD for
+ * the rest of the input once we encounter a special character which
+ * is neither EOF nor EOL.
+ */
+ if (simd_enabled && !last_was_esc && copy_buf_len - input_buf_ptr > sizeof(Vector8))
+ {
+ Vector8 chunk;
+ Vector8 match = vector8_broadcast(0);
+
+ /* Load a chunk of data into a vector register */
+ vector8_load(&chunk, (const uint8 *) &copy_input_buf[input_buf_ptr]);
+
+ if (is_csv)
+ {
+ /* \n and \r are not special inside quotes */
+ if (!in_quote)
+ match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr));
+
+ match = vector8_or(match, vector8_eq(chunk, quote));
+ if (escapec != '\0')
+ match = vector8_or(match, vector8_eq(chunk, escape));
+ }
+ else
+ {
+ match = vector8_or(vector8_eq(chunk, nl), vector8_eq(chunk, cr));
+ match = vector8_or(match, vector8_eq(chunk, bs));
+ }
+
+ /* Check if we found any special characters */
+ if (vector8_is_highbit_set(match))
+ {
+ /*
+ * Found a special character. Advance up to that point and let
+ * the scalar code handle it.
+ */
+ uint32 mask;
+ int advance;
+ char c1,
+ c2;
+ bool simd_hit_eol,
+ simd_hit_eof;
+
+ mask = vector8_highbit_mask(match);
+ advance = pg_rightmost_one_pos32(mask);
+
+ input_buf_ptr += advance;
+ c1 = copy_input_buf[input_buf_ptr];
+
+ /*
+ * Since we stopped within the chunk and ((copy_buf_len -
+ * input_buf_ptr) > sizeof(Vector8)) is true,
+ * copy_input_buf[input_buf_ptr + 1] is guaranteed to be
+ * readable.
+ */
+ c2 = copy_input_buf[input_buf_ptr + 1];
+
+ simd_hit_eof = (c1 == '\\' && c2 == '.' && !is_csv);
+ simd_hit_eol = (c1 == '\r' || c1 == '\n');
+
+ /*
+ * If (is_csv && in_quote), we shouldn't have picked up '\r'
+ * or '\n' in the first place.
+ */
+ Assert(!simd_hit_eol || !(is_csv && in_quote));
+
+ /*
+ * Do not disable SIMD when we hit EOL or EOF characters. In
+ * practice, it does not matter for EOF because parsing ends
+ * there, but we keep the behavior consistent.
+ */
+ if (!(simd_hit_eof || simd_hit_eol))
+ {
+ simd_enabled = false;
+ cstate->simd_enabled = false;
+ }
+ }
+ else
+ {
+ /* No special characters found, so skip the entire chunk */
+ input_buf_ptr += sizeof(Vector8);
+ continue;
+ }
+ }
+#endif
+
/* OK to fetch a character */
prev_raw_ptr = input_buf_ptr;
c = copy_input_buf[input_buf_ptr++];
diff --git a/src/include/commands/copyfrom_internal.h b/src/include/commands/copyfrom_internal.h
index 822ef33cf69..73ce777c52b 100644
--- a/src/include/commands/copyfrom_internal.h
+++ b/src/include/commands/copyfrom_internal.h
@@ -89,6 +89,9 @@ typedef struct CopyFromStateData
const char *cur_attval; /* current att value for error messages */
bool relname_only; /* don't output line number, att, etc. */
+ /* SIMD variables */
+ bool simd_enabled;
+
/*
* Working state
*/
--
2.47.3
view thread (59+ messages) latest in thread
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected], [email protected], [email protected], [email protected], [email protected], [email protected], [email protected]
Subject: Re: Speed up COPY FROM text/CSV parsing using SIMD
In-Reply-To: <CAN55FZ3cBN_TncLVWyXAKm-KfewguN1AUjyRhoR6zL_QCxHh7A@mail.gmail.com>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox