public inbox for [email protected]  
help / color / mirror / Atom feed
From: Matthew Sterrett <[email protected]>
To: Stepan Neretin <[email protected]>
Cc: [email protected]
Cc: Devulapalli, Raghuveer <[email protected]>
Cc: Shankaran, Akash <[email protected]>
Subject: Re: Proposal for enabling auto-vectorization for checksum calculations
Date: Mon, 19 May 2025 16:54:09 -0700
Message-ID: <CA+vA858b4J1k_34vRR+=UsM5U4JSVc4QkigHExk0RYH9nk3iRQ@mail.gmail.com> (raw)
In-Reply-To: <CA+Yyo5Qw=v2bcPSeyGgX3WuHeDXm7vxPcET1n+yUreUmO0rk_A@mail.gmail.com>
References: <CA+vA85_5GTu+HHniSbvvP+8k3=xZO=WE84NPwiKyxztqvpfZ3Q@mail.gmail.com>
	<CA+vA8587=RpZuL+EJ=XKM-8-xdPxz2mFexbAjaimN+OkXysXqw@mail.gmail.com>
	<CA+Yyo5RihTRUdUdanuNYhjQeXQY6412FWjzaxJAQR5MGX83=EQ@mail.gmail.com>
	<CA+Yyo5Qw=v2bcPSeyGgX3WuHeDXm7vxPcET1n+yUreUmO0rk_A@mail.gmail.com>

Hello! Thanks for helping me with this.
I'm still trying to figure out what is going on with the Bookworm test
failures. I'm pretty sure this patchset should resolve all the issues
with the macOS build, but unfortunately I don't think it will help with
the Linux failures.

On Sat, May 10, 2025 at 4:02 AM Stepan Neretin <[email protected]> wrote:
>
>
>
> On Sat, May 10, 2025 at 6:01 PM Stepan Neretin <[email protected]> wrote:
>>
>>
>>
>> On Thu, May 8, 2025 at 6:57 AM Matthew Sterrett <[email protected]> wrote:
>>>
>>> Hello! I'm still trying to figure out those CI failures, I just wanted
>>> to update things.
>>>
>>> From my testing, with this patch repeatedly disabling/enabling
>>> checksums is about 12.4% faster on an approximately 15 GB database.
>>>
>>> By the way, I'd love it if anyone could help me figure out how to
>>> replicate a CI failure in the Cirrus CI.
>>> I haven't been able to figure out how to test CI runs locally, does
>>> anyone know a good method to do that?
>>>
>>>
>>
>> Hi Matthew,
>>
>> Thanks for the patch!
>>
>> I ran some timing tests:
>>
>> (without avx2)
>>
>> Time: 4034.351 ms
>> SELECT drive_pg_checksum(512);
>>
>> (with avx2)
>>
>> Time: 3559.076 ms
>> SELECT drive_pg_checksum(512);
>>
>> Also attached two patches that should fix the CI issues.
>>
>> Best,
>>
>> Stepan Neretin
>>
>>
>>
>
> Oops, forgot to attach patches :)
>
> Best,
>
> Stepan Neretin
>
>


Attachments:

  [application/octet-stream] v3-0002-Fix-compilation-on-systems-where-immintrin.h-is-n.patch (1.1K, 2-v3-0002-Fix-compilation-on-systems-where-immintrin.h-is-n.patch)
  download | inline diff:
From 30d916546860d3d19f8d91a0ab4bc99f9c350aaa Mon Sep 17 00:00:00 2001
From: Stepan Neretin <[email protected]>
Date: Sat, 10 May 2025 16:37:13 +0700
Subject: [PATCH v3 2/5] Fix compilation on systems where <immintrin.h> is not
 available or inappropriate, such as older GCC versions or non-x86 platforms.

---
 src/include/storage/checksum_impl.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index 5ea1f698b57..042ee8af120 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -112,7 +112,9 @@
 #include <cpuid.h>
 #endif
 
+#ifdef HAVE_XSAVE_INTRINSICS
 #include <immintrin.h>
+#endif
 
 #if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
 #include <intrin.h>
@@ -253,10 +255,6 @@ pg_checksum_block_##ISANAME(const PGChecksummablePage *page) \
 }
 
 /* Declarations are always defined to make dynamic dispatch code simpler */
-
-PG_DECLARE_CHECKSUM_ISA(default);
-PG_DECLARE_CHECKSUM_ISA(avx2);
-
 PG_DEFINE_CHECKSUM_ISA(default);
 #ifdef USE_AVX2_WITH_RUNTIME_CHECK
 PG_DEFINE_CHECKSUM_ISA(avx2);
-- 
2.43.0



  [application/octet-stream] v3-0005-Use-dummy-function-to-avoid-linker-error-move-dec.patch (1.9K, 3-v3-0005-Use-dummy-function-to-avoid-linker-error-move-dec.patch)
  download | inline diff:
From f7b28c6378db9cc98371e89e830a2ae8a571e9a0 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <[email protected]>
Date: Mon, 19 May 2025 13:23:55 -0700
Subject: [PATCH v3 5/5] Use dummy function to avoid linker error, move
 declarations

---
 src/include/storage/checksum_impl.h | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index 042ee8af120..4070646e23e 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -163,7 +163,7 @@ ymm_regs_available(void)
 static inline bool
 avx2_available(void)
 {
-#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+#if defined (USE_AVX2_WITH_RUNTIME_CHECK) && defined(__x86_64__)
 	unsigned int exx[4] = {0, 0, 0, 0};
 	if (!xsave_available() || !ymm_regs_available()) return false;
 
@@ -220,7 +220,20 @@ do { \
 static uint32 \
 pg_checksum_block_##ISANAME(const PGChecksummablePage *page);
 
+#define PG_DEFINE_CHECKSUM_DUMMY(ISANAME) \
+static uint32 \
+pg_checksum_block_##ISANAME(const PGChecksummablePage *page); \
+pg_attribute_target(#ISANAME) \
+static uint32 \
+pg_checksum_block_##ISANAME(const PGChecksummablePage *page) \
+{ \
+	Assert(false); /* This function should never be called */ \
+	return pg_checksum_block_default(page); /* Just in case it somehow is */ \
+}
+
 #define PG_DEFINE_CHECKSUM_ISA(ISANAME) \
+static uint32 \
+pg_checksum_block_##ISANAME(const PGChecksummablePage *page); \
 pg_attribute_target(#ISANAME) \
 static uint32 \
 pg_checksum_block_##ISANAME(const PGChecksummablePage *page) \
@@ -254,10 +267,11 @@ pg_checksum_block_##ISANAME(const PGChecksummablePage *page) \
 	return result; \
 }
 
-/* Declarations are always defined to make dynamic dispatch code simpler */
 PG_DEFINE_CHECKSUM_ISA(default);
 #ifdef USE_AVX2_WITH_RUNTIME_CHECK
 PG_DEFINE_CHECKSUM_ISA(avx2);
+#else
+PG_DEFINE_CHECKSUM_DUMMY(avx2);
 #endif
 
 static uint32
-- 
2.43.0



  [application/octet-stream] v3-0004-fix-bench-compiling.patch (692B, 4-v3-0004-fix-bench-compiling.patch)
  download | inline diff:
From 78873aec488678accbed1c4d1fc7ce15d12df303 Mon Sep 17 00:00:00 2001
From: Stepan Neretin <[email protected]>
Date: Sat, 10 May 2025 17:57:47 +0700
Subject: [PATCH v3 4/5] fix bench compiling

---
 contrib/pg_checksum_bench/pg_checksum_bench.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.c b/contrib/pg_checksum_bench/pg_checksum_bench.c
index f40f335ff59..0296dcfd259 100644
--- a/contrib/pg_checksum_bench/pg_checksum_bench.c
+++ b/contrib/pg_checksum_bench/pg_checksum_bench.c
@@ -1,5 +1,6 @@
 #include "postgres.h"
 #include "fmgr.h"
+#include "storage/checksum.h"
 #include "storage/checksum_impl.h"
 
 #include <stdio.h>
-- 
2.43.0



  [application/octet-stream] v3-0003-Benchmark-code-for-postgres-checksums.patch (4.7K, 5-v3-0003-Benchmark-code-for-postgres-checksums.patch)
  download | inline diff:
From cd6dc517b897faf3d960fafe8ea6b2cf9bbc2e05 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <[email protected]>
Date: Fri, 7 Mar 2025 11:33:27 -0800
Subject: [PATCH v3 3/5] Benchmark code for postgres checksums

---
 contrib/meson.build                           |  1 +
 contrib/pg_checksum_bench/meson.build         | 23 +++++++++++++
 .../pg_checksum_bench--1.0.sql                |  8 +++++
 contrib/pg_checksum_bench/pg_checksum_bench.c | 34 +++++++++++++++++++
 .../pg_checksum_bench.control                 |  4 +++
 .../sql/pg_checksum_bench.sql                 | 17 ++++++++++
 6 files changed, 87 insertions(+)
 create mode 100644 contrib/pg_checksum_bench/meson.build
 create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql
 create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench.c
 create mode 100644 contrib/pg_checksum_bench/pg_checksum_bench.control
 create mode 100644 contrib/pg_checksum_bench/sql/pg_checksum_bench.sql

diff --git a/contrib/meson.build b/contrib/meson.build
index ed30ee7d639..fe5149aadff 100644
--- a/contrib/meson.build
+++ b/contrib/meson.build
@@ -12,6 +12,7 @@ contrib_doc_args = {
   'install_dir': contrib_doc_dir,
 }
 
+subdir('pg_checksum_bench')
 subdir('amcheck')
 subdir('auth_delay')
 subdir('auto_explain')
diff --git a/contrib/pg_checksum_bench/meson.build b/contrib/pg_checksum_bench/meson.build
new file mode 100644
index 00000000000..32ccd9efa0f
--- /dev/null
+++ b/contrib/pg_checksum_bench/meson.build
@@ -0,0 +1,23 @@
+# Copyright (c) 2022-2025, PostgreSQL Global Development Group
+
+pg_checksum_bench_sources = files(
+  'pg_checksum_bench.c',
+)
+
+if host_system == 'windows'
+  pg_checksum_bench_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'pg_checksum_bench',
+    '--FILEDESC', 'pg_checksum_bench',])
+endif
+
+pg_checksum_bench = shared_module('pg_checksum_bench',
+  pg_checksum_bench_sources,
+  kwargs: contrib_mod_args,
+)
+contrib_targets += pg_checksum_bench
+
+install_data(
+  'pg_checksum_bench--1.0.sql',
+  'pg_checksum_bench.control',
+  kwargs: contrib_data_args,
+)
diff --git a/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql b/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql
new file mode 100644
index 00000000000..5f13cbe3c5e
--- /dev/null
+++ b/contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql
@@ -0,0 +1,8 @@
+/* contrib/pg_checksum_bench/pg_checksum_bench--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+-- \echo Use "CREATE EXTENSION pg_checksum_bench" to load this file. \quit
+
+CREATE FUNCTION drive_pg_checksum(page_count int)
+	RETURNS pg_catalog.void
+	AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.c b/contrib/pg_checksum_bench/pg_checksum_bench.c
new file mode 100644
index 00000000000..f40f335ff59
--- /dev/null
+++ b/contrib/pg_checksum_bench/pg_checksum_bench.c
@@ -0,0 +1,34 @@
+#include "postgres.h"
+#include "fmgr.h"
+#include "storage/checksum_impl.h"
+
+#include <stdio.h>
+#include <assert.h>
+
+PG_MODULE_MAGIC;
+
+#define REPEATS 1000000
+
+PG_FUNCTION_INFO_V1(drive_pg_checksum);
+Datum
+drive_pg_checksum(PG_FUNCTION_ARGS)
+{
+	int page_count = PG_GETARG_INT32(0);
+
+	PGChecksummablePage * pages = palloc(page_count * sizeof(PGChecksummablePage));
+	srand(0);
+	for (size_t i = 0; i < page_count * sizeof(PGChecksummablePage); i++){
+		char * byte_ptr = (char *) pages;
+		byte_ptr[i] = rand() % 256;
+	}
+
+	for (int i = 0; i < REPEATS; i++){
+		const PGChecksummablePage * test_page = pages + (i % page_count);
+		volatile uint32 result = pg_checksum_block(test_page);
+		(void) result;
+	}
+
+	pfree((void *) pages);
+
+	PG_RETURN_VOID();
+}
diff --git a/contrib/pg_checksum_bench/pg_checksum_bench.control b/contrib/pg_checksum_bench/pg_checksum_bench.control
new file mode 100644
index 00000000000..4a4e2c9363c
--- /dev/null
+++ b/contrib/pg_checksum_bench/pg_checksum_bench.control
@@ -0,0 +1,4 @@
+comment = 'pg_checksum benchmark'
+default_version = '1.0'
+module_pathname = '$libdir/pg_checksum_bench'
+relocatable = true
diff --git a/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql b/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql
new file mode 100644
index 00000000000..4b347699953
--- /dev/null
+++ b/contrib/pg_checksum_bench/sql/pg_checksum_bench.sql
@@ -0,0 +1,17 @@
+CREATE EXTENSION pg_checksum_bench;
+
+SELECT drive_pg_checksum(-1);
+
+\timing on
+
+SELECT drive_pg_checksum(1);
+SELECT drive_pg_checksum(2);
+SELECT drive_pg_checksum(4);
+SELECT drive_pg_checksum(8);
+SELECT drive_pg_checksum(16);
+SELECT drive_pg_checksum(32);
+SELECT drive_pg_checksum(64);
+SELECT drive_pg_checksum(128);
+SELECT drive_pg_checksum(256);
+SELECT drive_pg_checksum(512);
+SELECT drive_pg_checksum(1024);
-- 
2.43.0



  [application/octet-stream] v3-0001-Enable-autovectorizing-pg_checksum_block.patch (10.5K, 6-v3-0001-Enable-autovectorizing-pg_checksum_block.patch)
  download | inline diff:
From 58122ebe1bbebc70da90c7ec3e77f3c3d524aa85 Mon Sep 17 00:00:00 2001
From: Matthew Sterrett <[email protected]>
Date: Fri, 7 Mar 2025 11:33:45 -0800
Subject: [PATCH v3 1/5] Enable autovectorizing pg_checksum_block

---
 config/c-compiler.m4                |  31 ++++++
 configure                           |  52 ++++++++++
 configure.ac                        |   9 ++
 meson.build                         |  28 ++++++
 src/include/pg_config.h.in          |   3 +
 src/include/storage/checksum_impl.h | 150 +++++++++++++++++++++++-----
 6 files changed, 250 insertions(+), 23 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5f3e1d1faf9..4d5bceafe9e 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -710,6 +710,37 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_XSAVE_INTRINSICS
 
+# PGAC_AVX2_SUPPORT
+# -----------------------------
+# Check if the compiler supports AVX2 in attribute((target))
+# and using AVX2 intrinsics in those functions
+#
+# If the intrinsics are supported, sets pgac_avx2_support.
+AC_DEFUN([PGAC_AVX2_SUPPORT],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx2_support])])dnl
+AC_CACHE_CHECK([for AVX2 support], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>
+    #include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    #endif
+    static int avx2_test(void)
+    {
+      const char buf@<:@sizeof(__m256i)@:>@;
+      __m256i accum = _mm256_loadu_si256((const __m256i *) buf);
+	  accum = _mm256_add_epi32(accum, accum);
+      int result = _mm256_extract_epi32(accum, 0);
+      return (int) result;
+    }],
+  [return avx2_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_AVX2_SUPPORT
+
 # PGAC_AVX512_POPCNT_INTRINSICS
 # -----------------------------
 # Check if the compiler supports the AVX-512 popcount instructions using the
diff --git a/configure b/configure
index 4f15347cc95..0b328dc22db 100755
--- a/configure
+++ b/configure
@@ -17724,6 +17724,58 @@ $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
 
 fi
 
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
+$as_echo_n "checking for AVX2 support... " >&6; }
+if ${pgac_cv_avx2_support+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <immintrin.h>
+    #include <stdint.h>
+    #if defined(__has_attribute) && __has_attribute (target)
+    __attribute__((target("avx2")))
+    #endif
+    static int avx2_test(void)
+    {
+      const char buf[sizeof(__m256i)];
+      __m256i accum = _mm256_loadu_si256((const __m256i *) buf);
+	  accum = _mm256_add_epi32(accum, accum);
+      int result = _mm256_extract_epi32(accum, 0);
+      return (int) result;
+    }
+int
+main ()
+{
+return avx2_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_avx2_support=yes
+else
+  pgac_cv_avx2_support=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx2_support" >&5
+$as_echo "$pgac_cv_avx2_support" >&6; }
+if test x"$pgac_cv_avx2_support" = x"yes"; then
+  pgac_avx2_support=yes
+fi
+
+  if test x"$pgac_avx2_support" = x"yes"; then
+
+$as_echo "#define USE_AVX2_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+  fi
+fi
+
 # Check for AVX-512 popcount intrinsics
 #
 if test x"$host_cpu" = x"x86_64"; then
diff --git a/configure.ac b/configure.ac
index 4b8335dc613..b980429282f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2089,6 +2089,15 @@ if test x"$pgac_xsave_intrinsics" = x"yes"; then
   AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
 fi
 
+# Check for AVX2 target and intrinsic support
+#
+if test x"$host_cpu" = x"x86_64"; then
+  PGAC_AVX2_SUPPORT()
+  if test x"$pgac_avx2_support" = x"yes"; then
+    AC_DEFINE(USE_AVX2_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX2 instructions with a runtime check.])
+  fi
+fi
+
 # Check for AVX-512 popcount intrinsics
 #
 if test x"$host_cpu" = x"x86_64"; then
diff --git a/meson.build b/meson.build
index d142e3e408b..e0ab1f9cf2f 100644
--- a/meson.build
+++ b/meson.build
@@ -2301,6 +2301,34 @@ int main(void)
 
 endif
 
+###############################################################
+# Check for the availability of AVX2 support
+###############################################################
+
+if host_cpu == 'x86_64'
+
+  prog = '''
+#include <immintrin.h>
+#include <stdint.h>
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("avx2")))
+#endif
+int main(void)
+{
+  const char buf[sizeof(__m256i)];
+  __m256i accum = _mm256_loadu_si256((const __m256i *) buf);
+  accum = _mm256_add_epi32(accum, accum);
+  int result = _mm256_extract_epi32(accum, 0);
+  return (int) result;
+}
+'''
+
+  if cc.links(prog, name: 'AVX2 support', args: test_c_args)
+    cdata.set('USE_AVX2_WITH_RUNTIME_CHECK', 1)
+  endif
+
+endif
+
 
 ###############################################################
 # Check for the availability of AVX-512 popcount intrinsics.
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 726a7c1be1f..34fa398ee8c 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -672,6 +672,9 @@
 /* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
 #undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
 
+/* Define to 1 to use AVX2 instructions with a runtime check. */
+#undef USE_AVX2_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
 #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index da87d61ba52..5ea1f698b57 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -101,12 +101,83 @@
  */
 
 #include "storage/bufpage.h"
+#include "pg_config.h"
 
 /* number of checksums to calculate in parallel */
 #define N_SUMS 32
 /* prime multiplier of FNV-1a hash */
 #define FNV_PRIME 16777619
 
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#include <immintrin.h>
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+/*
+ * Does CPUID say there's support for XSAVE instructions?
+ */
+static inline bool
+xsave_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+	return (exx[2] & (1 << 27)) != 0;	/* osxsave */
+}
+
+/*
+ * Does XGETBV say the YMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that xsave_available() returns true
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
+static inline bool
+ymm_regs_available(void)
+{
+#ifdef HAVE_XSAVE_INTRINSICS
+	return (_xgetbv(0) & 0x06) == 0x06;
+#else
+	return false;
+#endif
+}
+
+/*
+ * Does CPUID say there's support for AVX-2
+ */
+static inline bool
+avx2_available(void)
+{
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+	unsigned int exx[4] = {0, 0, 0, 0};
+	if (!xsave_available() || !ymm_regs_available()) return false;
+
+#if defined(HAVE__GET_CPUID_COUNT)
+	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+	__cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+	return (exx[1] & (1 << 5)) != 0; /* avx2 */
+#else
+	return false;
+#endif
+}
+
 /* Use a union so that this code is valid under strict aliasing */
 typedef union
 {
@@ -142,35 +213,68 @@ do { \
  * Block checksum algorithm.  The page must be adequately aligned
  * (at least on 4-byte boundary).
  */
-static uint32
-pg_checksum_block(const PGChecksummablePage *page)
-{
-	uint32		sums[N_SUMS];
-	uint32		result = 0;
-	uint32		i,
-				j;
+ 
+#define PG_DECLARE_CHECKSUM_ISA(ISANAME) \
+static uint32 \
+pg_checksum_block_##ISANAME(const PGChecksummablePage *page);
+
+#define PG_DEFINE_CHECKSUM_ISA(ISANAME) \
+pg_attribute_target(#ISANAME) \
+static uint32 \
+pg_checksum_block_##ISANAME(const PGChecksummablePage *page) \
+{ \
+	uint32		sums[N_SUMS]; \
+	uint32		result = 0; \
+	uint32		i, \
+				j; \
+ \
+	/* ensure that the size is compatible with the algorithm */ \
+	Assert(sizeof(PGChecksummablePage) == BLCKSZ); \
+ \
+	/* initialize partial checksums to their corresponding offsets */ \
+	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets)); \
+ \
+	/* main checksum calculation */ \
+	/* this is the main place that autovectorization occurs */ \
+	for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++) \
+		for (j = 0; j < N_SUMS; j++) \
+			CHECKSUM_COMP(sums[j], page->data[i][j]); \
+ \
+	/* finally add in two rounds of zeroes for additional mixing */ \
+	for (i = 0; i < 2; i++) \
+		for (j = 0; j < N_SUMS; j++) \
+			CHECKSUM_COMP(sums[j], 0); \
+ \
+	/* xor fold partial checksums together */ \
+	for (i = 0; i < N_SUMS; i++) \
+		result ^= sums[i]; \
+ \
+	return result; \
+}
 
-	/* ensure that the size is compatible with the algorithm */
-	Assert(sizeof(PGChecksummablePage) == BLCKSZ);
+/* Declarations are always defined to make dynamic dispatch code simpler */
 
-	/* initialize partial checksums to their corresponding offsets */
-	memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets));
+PG_DECLARE_CHECKSUM_ISA(default);
+PG_DECLARE_CHECKSUM_ISA(avx2);
 
-	/* main checksum calculation */
-	for (i = 0; i < (uint32) (BLCKSZ / (sizeof(uint32) * N_SUMS)); i++)
-		for (j = 0; j < N_SUMS; j++)
-			CHECKSUM_COMP(sums[j], page->data[i][j]);
+PG_DEFINE_CHECKSUM_ISA(default);
+#ifdef USE_AVX2_WITH_RUNTIME_CHECK
+PG_DEFINE_CHECKSUM_ISA(avx2);
+#endif
 
-	/* finally add in two rounds of zeroes for additional mixing */
-	for (i = 0; i < 2; i++)
-		for (j = 0; j < N_SUMS; j++)
-			CHECKSUM_COMP(sums[j], 0);
+static uint32
+pg_checksum_block_dispatch(const PGChecksummablePage *page);
 
-	/* xor fold partial checksums together */
-	for (i = 0; i < N_SUMS; i++)
-		result ^= sums[i];
+static uint32 (*pg_checksum_block)(const PGChecksummablePage *page) = pg_checksum_block_dispatch;
 
-	return result;
+static uint32
+pg_checksum_block_dispatch(const PGChecksummablePage *page){
+	if (avx2_available()){
+		pg_checksum_block = pg_checksum_block_avx2;
+	}else{
+		pg_checksum_block = pg_checksum_block_default;
+	}
+	return pg_checksum_block(page);
 }
 
 /*
-- 
2.43.0



view thread (36+ messages)  latest in thread

reply

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Reply to all the recipients using the --to and --cc options:
  reply via email

  To: [email protected]
  Cc: [email protected], [email protected], [email protected], [email protected], [email protected]
  Subject: Re: Proposal for enabling auto-vectorization for checksum calculations
  In-Reply-To: <CA+vA858b4J1k_34vRR+=UsM5U4JSVc4QkigHExk0RYH9nk3iRQ@mail.gmail.com>

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox