public inbox for [email protected]
help / color / mirror / Atom feedFrom: Matthew Sterrett <[email protected]>
To: [email protected]
Cc: Devulapalli, Raghuveer <[email protected]>
Cc: Shankaran, Akash <[email protected]>
Cc: Matthew Sterrett <[email protected]>
Subject: Proposal for enabling auto-vectorization for checksum calculations
Date: Fri, 4 Apr 2025 15:03:43 -0700
Message-ID: <CA+vA85_5GTu+HHniSbvvP+8k3=xZO=WE84NPwiKyxztqvpfZ3Q@mail.gmail.com> (raw)
Hello,
This patch enables more compiler autovectorization for the checksum
calculations.
This code is particularly well suited for autovectorization, so just
adding pg_attribute_target and some simple dynamic dispatch logic we
can get improved vectorization.
This gives about a 2x speedup in a synthetic benchmark for
pg_checksum, which is also included as a seperate patch file.
Additionally, another 2x performance increase in the synthetic
benchmark with AVX2 can be obtained if N_SUMS was changed to 64.
However, this would change the results of the checksum. This isn't
included in this patch, but I think it is worth considering for the
future
One additional factor, without explicitly passing some optimization
flag like -O2 the makefile build won't autovectorize any of the code.
However, the meson based build does this automatically.
Attachments:
[application/octet-stream] v1-0001-Enable-autovectorizing-pg_checksum_block.patch (10.5K, 2-v1-0001-Enable-autovectorizing-pg_checksum_block.patch)
download | inline diff:
From e485110192c208becad0153e51a39514cace1377 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <[email protected]>
Date: Fri, 7 Mar 2025 11:33:45 -0800
Subject: [PATCH v1 1/2] Enable autovectorizing pg_checksum_block
---
config/c-compiler.m4 | 31 ++++++
configure | 52 ++++++++++
configure.ac | 9 ++
meson.build | 28 ++++++
src/include/pg_config.h.in | 3 +
src/include/storage/checksum_impl.h | 150 +++++++++++++++++++++++-----
6 files changed, 250 insertions(+), 23 deletions(-)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 8534cc54c1..143a8a40dd 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -670,6 +670,37 @@ fi
undefine([Ac_cachevar])dnl
])# PGAC_XSAVE_INTRINSICS
+# PGAC_AVX2_SUPPORT
+# -----------------------------
+# Check if the compiler supports AVX2 in attribute((target))
+# and using AVX2 intrinsics in those functions
+#
+# If the intrinsics are supported, sets pgac_avx2_support.
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 support], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+ #include <stdint.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("avx2")))
+ #endif
+ static int avx2_test(void)
+ {
+ const char buf@<:@sizeof(__m256i)@:>@;
+ __m256i accum = _mm256_loadu_si256((const __m256i *) buf);
+ accum = _mm256_add_epi32(accum, accum);
+ int result = _mm256_extract_epi32(accum, 0);
+ return (int) result;
+ }],
+ [return avx2_test();])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+ pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
# PGAC_AVX512_POPCNT_INTRINSICS
# -----------------------------
# Check if the compiler supports the AVX-512 popcount instructions using the
diff --git a/configure b/configure
index ceeef9b091..87080b8844 100755
--- a/configure
+++ b/configure
@@ -17113,6 +17113,58 @@ $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
fi
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
+$as_echo_n "checking for AVX2 support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <immintrin.h>
+ #include <stdint.h>
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("avx2")))
+ #endif
+ static int avx2_test(void)
+ {
+ const char buf[sizeof(__m256i)];
+ __m256i accum = _mm256_loadu_si256((const __m256i *) buf);
+ accum = _mm256_add_epi32(accum, accum);
+ int result = _mm256_extract_epi32(accum, 0);
+ return (int) result;
+ }
+int
+main ()
+{
+return avx2_test();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_avx2_support=yes
+else
+ pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+ pgac_avx2_support=yes
+fi
+
+ if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ fi
+fi
+
# Check for AVX-512 popcount intrinsics
#
if test x"$host_cpu" = x"x86_64"; then
diff --git a/configure.ac b/configure.ac
index d713360f34..7effbf40e2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2012,6 +2012,15 @@ if test x"$pgac_xsave_intrinsics" = x"yes"; then
AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
fi
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+ PGAC_AVX2_SUPPORT()
+ if test x"$pgac_avx2_support" = x"yes"; then
+ AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+ fi
+fi
+
# Check for AVX-512 popcount intrinsics
#
if test x"$host_cpu" = x"x86_64"; then
diff --git a/meson.build b/meson.build
index 8e128f4982..4fd96fee1d 100644
--- a/meson.build
+++ b/meson.build
@@ -2159,6 +2159,34 @@ int main(void)
endif
+###############################################################
+# Check for the availability of AVX2 support
+###############################################################
+
+if host_cpu == 'x86_64'
+
+ prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx2")))
+#endif
+int main(void)
+{
+ const char buf[sizeof(__m256i)];
+ __m256i accum = _mm256_loadu_si256((const __m256i *) buf);
+ accum = _mm256_add_epi32(accum, accum);
+ int result = _mm256_extract_epi32(accum, 0);
+ return (int) result;
+}
+'''
+
+ if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+ cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+ endif
+
+endif
+
###############################################################
# Check for the availability of AVX-512 popcount intrinsics.
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 07b2f798ab..0fcd9f05a7 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -645,6 +645,9 @@
/* Define to 1 to build with assertion checks. (--enable-cassert) */
#undef USE_ASSERT_CHECKING
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index da87d61ba5..5ea1f698b5 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -101,12 +101,83 @@
*/
#include "storage/bufpage.h"
+#include "pg_config.h"
/* number of checksums to calculate in parallel */
#define N_SUMS 32
/* prime multiplier of FNV-1a hash */
#define FNV_PRIME 16777619
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#include <immintrin.h>
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+/*
+ * Does CPUID say there's support for XSAVE instructions?
+ */
+static inline bool
+xsave_available(void)
+{
+ unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+ __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+ __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+ return (exx[2] & (1 << 27)) != 0; /* osxsave */
+}
+
+/*
+ * Does XGETBV say the YMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that xsave_available() returns true
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
+static inline bool
+ymm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+ return (_xgetbv(0) & 0x06) == 0x06;
+#else
+ return false;
+#endif
+}
+
+/*
+ * Does CPUID say there's support for AVX-2
+ */
+static inline bool
+avx2_available(void)
+{
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+ unsigned int exx[4] = {0, 0, 0, 0};
+ if (!xsave_available() || !ymm_regs_available()) return false;
+
+#if defined(HAVE__GET_CPUID_COUNT)
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+ __cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+ return (exx[1] & (1 << 5)) != 0; /* avx2 */
+#else
+ return false;
+#endif
+}
+
/* Use a union so that this code is valid under strict aliasing */
typedef union
{
@@ -142,35 +213,68 @@ do { \
* Block checksum algorithm. The page must be adequately aligned
* (at least on 4-byte boundary).
*/
-static uint32
-pg_checksum_block(const PGChecksummablePage *page)
-{
- uint32 sums[N_SUMS];
- uint32 result = 0;
- uint32 i,
- j;
+
+#define PG_DECLARE_CHECKSUM_ISA(ISANAME) \
+static uint32 \
+pg_checksum_block_##ISANAME(const PGChecksummablePage *page);
+
+#define PG_DEFINE_CHECKSUM_ISA(ISANAME) \
+pg_attribute_target(#ISANAME) \
+static uint32 \
+pg_checksum_block_##ISANAME(const PGChecksummablePage *page) \
+{ \
+ uint32 sums[N_SUMS]; \
+ uint32 result = 0; \
+ uint32 i, \
+ j; \
+ \
+ /* ensure that the size is compatible with the algorithm */ \
+ Assert(sizeof(PGChecksummablePage) == BLCKSZ); \
+ \
+ /* initialize partial checksums to their corresponding offsets */ \
+ memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); \
+ \
+ /* main checksum calculation */ \
+ /* this is the main place that autovectorization occurs */ \
+ for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) \
+ for (j = 0; j < N_SUMS; j++) \
+ CHECKSUM_COMP(sums[j], page->data[i][j]); \
+ \
+ /* finally add in two rounds of zeroes for additional mixing */ \
+ for (i = 0; i < 2; i++) \
+ for (j = 0; j < N_SUMS; j++) \
+ CHECKSUM_COMP(sums[j], 0); \
+ \
+ /* xor fold partial checksums together */ \
+ for (i = 0; i < N_SUMS; i++) \
+ result ^= sums[i]; \
+ \
+ return result; \
+}
- /* ensure that the size is compatible with the algorithm */
- Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+/* Declarations are always defined to make dynamic dispatch code simpler */
- /* initialize partial checksums to their corresponding offsets */
- memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+PG_DECLARE_CHECKSUM_ISA(default);
+PG_DECLARE_CHECKSUM_ISA(avx2);
- /* main checksum calculation */
- for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
- for (j = 0; j < N_SUMS; j++)
- CHECKSUM_COMP(sums[j], page->data[i][j]);
+PG_DEFINE_CHECKSUM_ISA(default);
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+PG_DEFINE_CHECKSUM_ISA(avx2);
+#endif
- /* finally add in two rounds of zeroes for additional mixing */
- for (i = 0; i < 2; i++)
- for (j = 0; j < N_SUMS; j++)
- CHECKSUM_COMP(sums[j], 0);
+static uint32
+pg_checksum_block_dispatch(const PGChecksummablePage *page);
- /* xor fold partial checksums together */
- for (i = 0; i < N_SUMS; i++)
- result ^= sums[i];
+static uint32 (*pg_checksum_block)(const PGChecksummablePage *page) = pg_checksum_block_dispatch;
- return result;
+static uint32
+pg_checksum_block_dispatch(const PGChecksummablePage *page){
+ if (avx2_available()){
+ pg_checksum_block = pg_checksum_block_avx2;
+ }else{
+ pg_checksum_block = pg_checksum_block_default;
+ }
+ return pg_checksum_block(page);
}
/*
--
2.43.0
[application/octet-stream] v1-0002-Benchmark-code-for-postgres-checksums.patch (4.7K, 3-v1-0002-Benchmark-code-for-postgres-checksums.patch)
download | inline diff:
From 0013d718f95abb2ea7997e4dc374d80c5560ff2b Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <[email protected]>
Date: Fri, 7 Mar 2025 11:33:27 -0800
Subject: [PATCH v1 2/2] Benchmark code for postgres checksums
---
contrib/meson.build | 1 +
contrib/pg_checksum_bench/meson.build | 23 +++++++++++++
.../pg_checksum_bench--1.0.sql | 8 +++++
contrib/pg_checksum_bench/pg_checksum_bench.c | 34 +++++++++++++++++++
.../pg_checksum_bench.control | 4 +++
.../sql/pg_checksum_bench.sql | 17 ++++++++++
6 files changed, 87 insertions(+)
create mode 100644 contrib/pg_checksum_bench/meson.build
create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql
create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench.c
create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench.control
create mode 100644 contrib/pg_checksum_bench/sql/pg_checksum_bench.sql
diff --git a/contrib/meson.build b/contrib/meson.build
index 1ba73ebd67..d7cad22ac8 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -12,6 +12,7 @@ contrib_doc_args = {
'install_dir': contrib_doc_dir,
}
+subdir('pg_checksum_bench')
subdir('amcheck')
subdir('auth_delay')
subdir('auto_explain')
diff --git a/contrib/pg_checksum_bench/meson.build b/contrib/pg_checksum_bench/meson.build
new file mode 100644
index 0000000000..32ccd9efa0
--- /dev/null
+++ b/contrib/pg_checksum_bench/meson.build
@@ -0,0 +1,23 @@
+# Copyright (c) 2022-2025, PostgreSQL Global Development Group
+
+pg_checksum_bench_sources = files(
+ 'pg_checksum_bench.c',
+)
+
+if host_system == 'windows'
+ pg_checksum_bench_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+ '--NAME', 'pg_checksum_bench',
+ '--FILEDESC', 'pg_checksum_bench',])
+endif
+
+pg_checksum_bench = shared_module('pg_checksum_bench',
+ pg_checksum_bench_sources,
+ kwargs: contrib_mod_args,
+)
+contrib_targets += pg_checksum_bench
+
+install_data(
+ 'pg_checksum_bench--1.0.sql',
+ 'pg_checksum_bench.control',
+ kwargs: contrib_data_args,
+)
diff --git a/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql b/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql
new file mode 100644
index 0000000000..5f13cbe3c5
--- /dev/null
+++ b/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql
@@ -0,0 +1,8 @@
+/* contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+-- \echo Use "CREATE EXTENSION pg_checksum_bench" to load this file. \quit
+
+CREATE FUNCTION drive_pg_checksum(page_count int)
+ RETURNS pg_catalog.void
+ AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.c b/contrib/pg_checksum_bench/pg_checksum_bench.c
new file mode 100644
index 0000000000..f40f335ff5
--- /dev/null
+++ b/contrib/pg_checksum_bench/pg_checksum_bench.c
@@ -0,0 +1,34 @@
+#include "postgres.h"
+#include "fmgr.h"
+#include "storage/checksum_impl.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+PG_MODULE_MAGIC;
+
+#define REPEATS 1000000
+
+PG_FUNCTION_INFO_V1(drive_pg_checksum);
+Datum
+drive_pg_checksum(PG_FUNCTION_ARGS)
+{
+ int page_count = PG_GETARG_INT32(0);
+
+ PGChecksummablePage * pages = palloc(page_count * sizeof(PGChecksummablePage));
+ srand(0);
+ for (size_t i = 0; i < page_count * sizeof(PGChecksummablePage); i++){
+ char * byte_ptr = (char *) pages;
+ byte_ptr[i] = rand() % 256;
+ }
+
+ for (int i = 0; i < REPEATS; i++){
+ const PGChecksummablePage * test_page = pages + (i % page_count);
+ volatile uint32 result = pg_checksum_block(test_page);
+ (void) result;
+ }
+
+ pfree((void *) pages);
+
+ PG_RETURN_VOID();
+}
diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.control b/contrib/pg_checksum_bench/pg_checksum_bench.control
new file mode 100644
index 0000000000..4a4e2c9363
--- /dev/null
+++ b/contrib/pg_checksum_bench/pg_checksum_bench.control
@@ -0,0 +1,4 @@
+comment = 'pg_checksum benchmark'
+default_version = '1.0'
+module_pathname = '$libdir/pg_checksum_bench'
+relocatable = true
diff --git a/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql b/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql
new file mode 100644
index 0000000000..4b34769995
--- /dev/null
+++ b/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql
@@ -0,0 +1,17 @@
+CREATE EXTENSION pg_checksum_bench;
+
+SELECT drive_pg_checksum(-1);
+
+\timing on
+
+SELECT drive_pg_checksum(1);
+SELECT drive_pg_checksum(2);
+SELECT drive_pg_checksum(4);
+SELECT drive_pg_checksum(8);
+SELECT drive_pg_checksum(16);
+SELECT drive_pg_checksum(32);
+SELECT drive_pg_checksum(64);
+SELECT drive_pg_checksum(128);
+SELECT drive_pg_checksum(256);
+SELECT drive_pg_checksum(512);
+SELECT drive_pg_checksum(1024);
--
2.43.0
view thread (8+ messages) latest in thread
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected], [email protected], [email protected], [email protected]
Subject: Re: Proposal for enabling auto-vectorization for checksum calculations
In-Reply-To: <CA+vA85_5GTu+HHniSbvvP+8k3=xZO=WE84NPwiKyxztqvpfZ3Q@mail.gmail.com>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox