public inbox for [email protected]help / color / mirror / Atom feed
Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. 6+ messages / 2 participants [nested] [flat]
* Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. @ 2024-08-22 18:00 Jeff Davis <[email protected]> 0 siblings, 2 replies; 6+ messages in thread From: Jeff Davis @ 2024-08-22 18:00 UTC (permalink / raw) To: pgsql-hackers Like ICU, allow -1 length to mean that the input string is NUL- terminated for pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix(). This simplifies the API and code a bit. Along with some other refactoring in this area, we are getting close to the point where the collation provider can just be a table of methods, which means we can add an extension hook to provide a different method table. That still requires more work, I'm just mentioning it here for context. Regards, Jeff Davis Attachments: [text/x-patch] v1-0001-Allow-length-1-for-NUL-terminated-input-to-pg_str.patch (12.7K, 2-v1-0001-Allow-length-1-for-NUL-terminated-input-to-pg_str.patch) download | inline diff: From 6f0c0a9e05039cd295c6c090b3d98d381244b35c Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Wed, 21 Aug 2024 10:59:28 -0700 Subject: [PATCH v1] Allow length=-1 for NUL-terminated input to pg_strncoll(), etc. Like ICU, allow a length of -1 to be specified for NUL-terminated arguments to pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix(). Simplifies the code and comments. --- src/backend/utils/adt/pg_locale.c | 186 ++++++++++-------------------- 1 file changed, 64 insertions(+), 122 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 48b7e16d81b..26b0f4577f0 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1809,6 +1809,8 @@ get_collation_actual_version(char collprovider, const char *collcollate) * * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and * invoke wcscoll_l(). + * + * An input string length of -1 means that it's NUL-terminated. 
*/ #ifdef WIN32 static int @@ -1819,8 +1821,8 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, char *buf = sbuf; char *a1p, *a2p; - int a1len = len1 * 2 + 2; - int a2len = len2 * 2 + 2; + int a1len; + int a2len; int r; int result; @@ -1830,6 +1832,14 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, Assert(false); #endif + if (len1 == -1) + len1 = strlen(arg1); + if (len2 == -1) + len2 = strlen(arg2); + + a1len = len1 * 2 + 2; + a2len = len2 * 2 + 2; + if (a1len + a2len > TEXTBUFLEN) buf = palloc(a1len + a2len); @@ -1876,40 +1886,10 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, } #endif /* WIN32 */ -/* - * pg_strcoll_libc - * - * Call strcoll_l() or wcscoll_l() as appropriate for the given locale, - * platform, and database encoding. If the locale is NULL, use the database - * collation. - * - * Arguments must be encoded in the database encoding and nul-terminated. - */ -static int -pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale) -{ - int result; - - Assert(locale->provider == COLLPROVIDER_LIBC); -#ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8) - { - size_t len1 = strlen(arg1); - size_t len2 = strlen(arg2); - - result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); - } - else -#endif /* WIN32 */ - result = strcoll_l(arg1, arg2, locale->info.lt); - - return result; -} - /* * pg_strncoll_libc * - * Nul-terminate the arguments and call pg_strcoll_libc(). + * An input string length of -1 means that it's NUL-terminated. */ static int pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, @@ -1917,10 +1897,10 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, { char sbuf[TEXTBUFLEN]; char *buf = sbuf; - size_t bufsize1 = len1 + 1; - size_t bufsize2 = len2 + 1; - char *arg1n; - char *arg2n; + size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1; + size_t bufsize2 = (len2 == -1) ? 
0 : len2 + 1; + const char *arg1n; + const char *arg2n; int result; Assert(locale->provider == COLLPROVIDER_LIBC); @@ -1934,16 +1914,32 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); - arg1n = buf; - arg2n = buf + bufsize1; + /* nul-terminate arguments if necessary */ + if (len1 == -1) + { + arg1n = arg1; + } + else + { + char *buf1 = buf; + memcpy(buf1, arg1, len1); + buf1[len1] = '\0'; + arg1n = buf1; + } - /* nul-terminate arguments */ - memcpy(arg1n, arg1, len1); - arg1n[len1] = '\0'; - memcpy(arg2n, arg2, len2); - arg2n[len2] = '\0'; + if (len2 == -1) + { + arg2n = arg2; + } + else + { + char *buf2 = buf + bufsize1; + memcpy(buf2, arg2, len2); + buf2[len2] = '\0'; + arg2n = buf2; + } - result = pg_strcoll_libc(arg1n, arg2n, locale); + result = strcoll_l(arg1n, arg2n, locale->info.lt); if (buf != sbuf) pfree(buf); @@ -2015,8 +2011,6 @@ pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1, * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given * database encoding. An argument length of -1 means the string is * NUL-terminated. - * - * Arguments must be encoded in the database encoding. */ static int pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2, @@ -2054,15 +2048,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2, /* * pg_strcoll * - * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as - * appropriate for the given locale, platform, and database encoding. If the - * locale is not specified, use the database collation. - * - * Arguments must be encoded in the database encoding and nul-terminated. - * - * The caller is responsible for breaking ties if the collation is - * deterministic; this maintains consistency with pg_strxfrm(), which cannot - * easily account for deterministic collations. + * Like pg_strncoll for NUL-terminated input strings. 
*/ int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) @@ -2070,7 +2056,7 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) int result; if (locale->provider == COLLPROVIDER_LIBC) - result = pg_strcoll_libc(arg1, arg2, locale); + result = pg_strncoll_libc(arg1, -1, arg2, -1, locale); #ifdef USE_ICU else if (locale->provider == COLLPROVIDER_ICU) result = pg_strncoll_icu(arg1, -1, arg2, -1, locale); @@ -2089,11 +2075,8 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) * appropriate for the given locale, platform, and database encoding. If the * locale is not specified, use the database collation. * - * Arguments must be encoded in the database encoding. - * - * This function may need to nul-terminate the arguments for libc functions; - * so if the caller already has nul-terminated strings, it should call - * pg_strcoll() instead. + * The input strings must be encoded in the database encoding. If an input + * string is NUL-terminated, its length may be specified as -1. 
* * The caller is responsible for breaking ties if the collation is * deterministic; this maintains consistency with pg_strnxfrm(), which cannot @@ -2119,14 +2102,6 @@ pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, } -static size_t -pg_strxfrm_libc(char *dest, const char *src, size_t destsize, - pg_locale_t locale) -{ - Assert(locale->provider == COLLPROVIDER_LIBC); - return strxfrm_l(dest, src, destsize, locale->info.lt); -} - static size_t pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, pg_locale_t locale) @@ -2138,14 +2113,17 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, Assert(locale->provider == COLLPROVIDER_LIBC); + if (srclen == -1) + return strxfrm_l(dest, src, destsize, locale->info.lt); + if (bufsize > TEXTBUFLEN) buf = palloc(bufsize); - /* nul-terminate arguments */ + /* nul-terminate argument */ memcpy(buf, src, srclen); buf[srclen] = '\0'; - result = pg_strxfrm_libc(dest, buf, destsize, locale); + result = strxfrm_l(dest, buf, destsize, locale->info.lt); if (buf != sbuf) pfree(buf); @@ -2326,20 +2304,7 @@ pg_strxfrm_enabled(pg_locale_t locale) /* * pg_strxfrm * - * Transforms 'src' to a nul-terminated string stored in 'dest' such that - * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on - * untransformed strings. - * - * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest' - * may be NULL. - * - * Not all providers support pg_strxfrm() safely. The caller should check - * pg_strxfrm_enabled() first, otherwise this function may return wrong - * results or an error. - * - * Returns the number of bytes needed (or more) to store the transformed - * string, excluding the terminating nul byte. If the value returned is - * 'destsize' or greater, the resulting contents of 'dest' are undefined. + * Like pg_strnxfrm for a NUL-terminated input string. 
*/ size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) @@ -2347,7 +2312,7 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) size_t result = 0; /* keep compiler quiet */ if (locale->provider == COLLPROVIDER_LIBC) - result = pg_strxfrm_libc(dest, src, destsize, locale); + result = pg_strnxfrm_libc(dest, src, -1, destsize, locale); #ifdef USE_ICU else if (locale->provider == COLLPROVIDER_ICU) result = pg_strnxfrm_icu(dest, src, -1, destsize, locale); @@ -2366,8 +2331,9 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on * untransformed strings. * - * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may - * be NULL. + * The input string must be encoded in the database encoding. If the input + * string is NUL-terminated, its length may be specified as -1. If 'destsize' + * is zero, 'dest' may be NULL. * * Not all providers support pg_strnxfrm() safely. The caller should check * pg_strxfrm_enabled() first, otherwise this function may return wrong @@ -2376,10 +2342,6 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) * Returns the number of bytes needed (or more) to store the transformed * string, excluding the terminating nul byte. If the value returned is * 'destsize' or greater, the resulting contents of 'dest' are undefined. - * - * This function may need to nul-terminate the argument for libc functions; - * so if the caller already has a nul-terminated string, it should call - * pg_strxfrm() instead. */ size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen, @@ -2421,44 +2383,24 @@ pg_strxfrm_prefix_enabled(pg_locale_t locale) /* * pg_strxfrm_prefix * - * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary - * memcmp() on the byte sequence is equivalent to pg_strcoll() on - * untransformed strings. 
The result is not nul-terminated. - * - * The provided 'src' must be nul-terminated. - * - * Not all providers support pg_strxfrm_prefix() safely. The caller should - * check pg_strxfrm_prefix_enabled() first, otherwise this function may return - * wrong results or an error. - * - * If destsize is not large enough to hold the resulting byte sequence, stores - * only the first destsize bytes in 'dest'. Returns the number of bytes - * actually copied to 'dest'. + * Like pg_strnxfrm_prefix for a NUL-terminated input string. */ size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - -#ifdef USE_ICU - if (locale->provider == COLLPROVIDER_ICU) - result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale); - else -#endif - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return pg_strnxfrm_prefix(dest, destsize, src, -1, locale); } /* * pg_strnxfrm_prefix * * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary - * memcmp() on the byte sequence is equivalent to pg_strcoll() on + * memcmp() on the byte sequence is equivalent to pg_strncoll() on * untransformed strings. The result is not nul-terminated. * - * The provided 'src' must be nul-terminated. + * The input string must be encoded in the database encoding. If the input + * string is NUL-terminated, its length may be specified as -1. * * Not all providers support pg_strnxfrm_prefix() safely. The caller should * check pg_strxfrm_prefix_enabled() first, otherwise this function may return @@ -2467,10 +2409,6 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, * If destsize is not large enough to hold the resulting byte sequence, stores * only the first destsize bytes in 'dest'. Returns the number of bytes * actually copied to 'dest'. 
- * - * This function may need to nul-terminate the argument for libc functions; - * so if the caller already has a nul-terminated string, it should call - * pg_strxfrm_prefix() instead. */ size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, @@ -2661,6 +2599,8 @@ init_icu_converter(void) /* * Find length, in UChars, of given string if converted to UChar string. + * + * A length of -1 indicates that the input string is NUL-terminated. */ static size_t uchar_length(UConverter *converter, const char *str, int32_t len) @@ -2678,6 +2618,8 @@ uchar_length(UConverter *converter, const char *str, int32_t len) /* * Convert the given source string into a UChar string, stored in dest, and * return the length (in UChars). + * + * A srclen of -1 indicates that the input string is NUL-terminated. */ static int32_t uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, -- 2.34.1 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. @ 2024-09-21 00:28 Jeff Davis <[email protected]> parent: Jeff Davis <[email protected]> 1 sibling, 0 replies; 6+ messages in thread From: Jeff Davis @ 2024-09-21 00:28 UTC (permalink / raw) To: pgsql-hackers On Thu, 2024-08-22 at 11:00 -0700, Jeff Davis wrote: > Like ICU, allow -1 length to mean that the input string is NUL- > terminated for pg_strncoll(), pg_strnxfrm(), and > pg_strnxfrm_prefix(). To better illustrate the direction I'm going, I roughly implemented some patches that implement collation using a table of methods rather than lots of branching based on the provider. This more cleanly separates the API for a provider, which will enable us to use a hook to create a custom provider with arbitrary methods that may have nothing to do with ICU or libc. Or, we could go so far as to implement a "CREATE LOCALE PROVIDER" that would provide the methods using a handler function, and "datlocprovider" would be an OID rather than a char. From a practical perspective, I expect that extensions would use this to lock down the version of a particular provider rather than implement a completely arbitrary one. But the API is good for either case, and offers quite a bit of code cleanup. There are quite a few loose ends, of course: * There is still a lot of branching on the provider for DDL and catalog access. I'm not sure if we will ever eliminate all of this, or if we would even want to. * I haven't done anything with get_collation_actual_version(). Perhaps that should be a method, too, but it requires some extra thought if we want this to be useful for "multilib" (having multiple versions of a provider library at once). * I didn't add methods for formatting.c yet. * initdb -- should it offer a way to preload a library and then use that for the provider? * I need to allow an arbitrary per-provider context, rather than the current union designed for the existing providers. 
Again, the patches are rough and there's a lot of code churn. I'd like some feedback on whether people generally like the direction this is going. If so I will clean up the patch series into smaller, more reviewable chunks. Regards, Jeff Davis Attachments: [text/x-patch] v4-0007-Use-method-table-for-collation.patch (97.3K, 2-v4-0007-Use-method-table-for-collation.patch) download | inline diff: From c9ace91726c2889fe96dec28fd9f3c655e13afd7 Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Thu, 19 Sep 2024 11:12:41 -0700 Subject: [PATCH v4 7/7] Use method table for collation. --- src/backend/regex/regc_pg_locale.c | 376 ++----- src/backend/utils/adt/Makefile | 2 + src/backend/utils/adt/meson.build | 2 + src/backend/utils/adt/pg_locale.c | 1338 +++--------------------- src/backend/utils/adt/pg_locale_icu.c | 873 ++++++++++++++++ src/backend/utils/adt/pg_locale_libc.c | 604 +++++++++++ src/include/utils/pg_locale.h | 44 +- 7 files changed, 1727 insertions(+), 1512 deletions(-) create mode 100644 src/backend/utils/adt/pg_locale_icu.c create mode 100644 src/backend/utils/adt/pg_locale_libc.c diff --git a/src/backend/regex/regc_pg_locale.c b/src/backend/regex/regc_pg_locale.c index b75784b6ce5..f7cd3f1787c 100644 --- a/src/backend/regex/regc_pg_locale.c +++ b/src/backend/regex/regc_pg_locale.c @@ -63,32 +63,17 @@ * NB: the coding here assumes pg_wchar is an unsigned type. 
*/ -typedef enum -{ - PG_REGEX_STRATEGY_C, /* C locale (encoding independent) */ - PG_REGEX_STRATEGY_BUILTIN, /* built-in Unicode semantics */ - PG_REGEX_STRATEGY_LIBC_WIDE, /* Use locale_t <wctype.h> functions */ - PG_REGEX_STRATEGY_LIBC_1BYTE, /* Use locale_t <ctype.h> functions */ - PG_REGEX_STRATEGY_ICU, /* Use ICU uchar.h functions */ -} PG_Locale_Strategy; - -static PG_Locale_Strategy pg_regex_strategy; static pg_locale_t pg_regex_locale; static Oid pg_regex_collation; +static struct pg_locale_struct dummy_c_locale = { + .collate_is_c = true, + .ctype_is_c = true, +}; + /* * Hard-wired character properties for C locale */ -#define PG_ISDIGIT 0x01 -#define PG_ISALPHA 0x02 -#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA) -#define PG_ISUPPER 0x04 -#define PG_ISLOWER 0x08 -#define PG_ISGRAPH 0x10 -#define PG_ISPRINT 0x20 -#define PG_ISPUNCT 0x40 -#define PG_ISSPACE 0x80 - static const unsigned char pg_char_properties[128] = { /* NUL */ 0, /* ^A */ 0, @@ -232,7 +217,6 @@ void pg_set_regex_collation(Oid collation) { pg_locale_t locale = 0; - PG_Locale_Strategy strategy; if (!OidIsValid(collation)) { @@ -253,8 +237,8 @@ pg_set_regex_collation(Oid collation) * catalog access is available, so we can't call * pg_newlocale_from_collation(). 
*/ - strategy = PG_REGEX_STRATEGY_C; collation = C_COLLATION_OID; + locale = &dummy_c_locale; } else { @@ -271,32 +255,11 @@ pg_set_regex_collation(Oid collation) * C/POSIX collations use this path regardless of database * encoding */ - strategy = PG_REGEX_STRATEGY_C; - locale = 0; + locale = &dummy_c_locale; collation = C_COLLATION_OID; } - else if (locale->provider == COLLPROVIDER_BUILTIN) - { - Assert(GetDatabaseEncoding() == PG_UTF8); - strategy = PG_REGEX_STRATEGY_BUILTIN; - } -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - { - strategy = PG_REGEX_STRATEGY_ICU; - } -#endif - else - { - Assert(locale->provider == COLLPROVIDER_LIBC); - if (GetDatabaseEncoding() == PG_UTF8) - strategy = PG_REGEX_STRATEGY_LIBC_WIDE; - else - strategy = PG_REGEX_STRATEGY_LIBC_1BYTE; - } } - pg_regex_strategy = strategy; pg_regex_locale = locale; pg_regex_collation = collation; } @@ -304,82 +267,31 @@ pg_set_regex_collation(Oid collation) static int pg_wc_isdigit(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISDIGIT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isdigit(c, true); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswdigit_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isdigit_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isdigit(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISDIGIT)); + else + return char_props(c, PG_ISDIGIT, pg_regex_locale) != 0; } static int pg_wc_isalpha(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & 
PG_ISALPHA)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalpha(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalpha_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalpha_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalpha(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALPHA)); + else + return char_props(c, PG_ISALPHA, pg_regex_locale) != 0; } static int pg_wc_isalnum(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISALNUM)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isalnum(c, true); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswalnum_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isalnum_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isalnum(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISALNUM)); + else + return char_props(c, PG_ISDIGIT|PG_ISALPHA, pg_regex_locale) != 0; } static int @@ -394,219 +306,87 @@ pg_wc_isword(pg_wchar c) static int pg_wc_isupper(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISUPPER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isupper(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return 
iswupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isupper_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isupper(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISUPPER)); + else + return char_props(c, PG_ISUPPER, pg_regex_locale) != 0; } static int pg_wc_islower(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISLOWER)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_islower(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - islower_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_islower(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISLOWER)); + else + return char_props(c, PG_ISLOWER, pg_regex_locale) != 0; } static int pg_wc_isgraph(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISGRAPH)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isgraph(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswgraph_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isgraph_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - 
return u_isgraph(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISGRAPH)); + else + return char_props(c, PG_ISGRAPH, pg_regex_locale) != 0; } static int pg_wc_isprint(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPRINT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isprint(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswprint_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isprint_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isprint(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPRINT)); + else + return char_props(c, PG_ISPRINT, pg_regex_locale) != 0; } static int pg_wc_ispunct(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISPUNCT)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_ispunct(c, true); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswpunct_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - ispunct_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_ispunct(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISPUNCT)); + else + return char_props(c, PG_ISPUNCT, 
pg_regex_locale) != 0; } static int pg_wc_isspace(pg_wchar c) { - switch (pg_regex_strategy) - { - case PG_REGEX_STRATEGY_C: - return (c <= (pg_wchar) 127 && - (pg_char_properties[c] & PG_ISSPACE)); - case PG_REGEX_STRATEGY_BUILTIN: - return pg_u_isspace(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return iswspace_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - return (c <= (pg_wchar) UCHAR_MAX && - isspace_l((unsigned char) c, pg_regex_locale->info.lt)); - break; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_isspace(c); -#endif - break; - } - return 0; /* can't get here, but keep compiler quiet */ + if (pg_regex_locale->ctype_is_c) + return (c <= (pg_wchar) 127 && + (pg_char_properties[c] & PG_ISSPACE)); + else + return char_props(c, PG_ISSPACE, pg_regex_locale) != 0; } static pg_wchar pg_wc_toupper(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return pg_ascii_toupper((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_uppercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towupper_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - if (c <= (pg_wchar) UCHAR_MAX) - return toupper_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_toupper(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_toupper((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_toupper(c, pg_regex_locale); } static pg_wchar pg_wc_tolower(pg_wchar c) { - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: - if (c <= (pg_wchar) 127) - return 
pg_ascii_tolower((unsigned char) c); - return c; - case PG_REGEX_STRATEGY_BUILTIN: - return unicode_lowercase_simple(c); - case PG_REGEX_STRATEGY_LIBC_WIDE: - if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF) - return towlower_l((wint_t) c, pg_regex_locale->info.lt); - /* FALL THRU */ - case PG_REGEX_STRATEGY_LIBC_1BYTE: - if (c <= (pg_wchar) UCHAR_MAX) - return tolower_l((unsigned char) c, pg_regex_locale->info.lt); - return c; - case PG_REGEX_STRATEGY_ICU: -#ifdef USE_ICU - return u_tolower(c); -#endif - break; + if (c <= (pg_wchar) 127) + return pg_ascii_tolower((unsigned char) c); + return c; } - return 0; /* can't get here, but keep compiler quiet */ + else + return pg_regex_locale->ctype->wc_tolower(c, pg_regex_locale); } @@ -732,37 +512,27 @@ pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode) * would always be true for production values of MAX_SIMPLE_CHR, but it's * useful to allow it to be small for testing purposes.) */ - switch (pg_regex_strategy) + if (pg_regex_locale->ctype_is_c) { - case PG_REGEX_STRATEGY_C: #if MAX_SIMPLE_CHR >= 127 max_chr = (pg_wchar) 127; pcc->cv.cclasscode = -1; #else max_chr = (pg_wchar) MAX_SIMPLE_CHR; #endif - break; - case PG_REGEX_STRATEGY_BUILTIN: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_WIDE: - max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - case PG_REGEX_STRATEGY_LIBC_1BYTE: + } + else + { #if MAX_SIMPLE_CHR >= UCHAR_MAX + if (pg_regex_locale->provider == COLLPROVIDER_LIBC && + GetDatabaseEncoding() != PG_UTF8) + { max_chr = (pg_wchar) UCHAR_MAX; pcc->cv.cclasscode = -1; -#else - max_chr = (pg_wchar) MAX_SIMPLE_CHR; + } + else #endif - break; - case PG_REGEX_STRATEGY_ICU: max_chr = (pg_wchar) MAX_SIMPLE_CHR; - break; - default: - Assert(false); - max_chr = 0; /* can't get here, but keep compiler quiet */ - break; } /* diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index edb09d4e356..85e5eaf32eb 100644 --- a/src/backend/utils/adt/Makefile +++ 
b/src/backend/utils/adt/Makefile @@ -79,6 +79,8 @@ OBJS = \ orderedsetaggs.o \ partitionfuncs.o \ pg_locale.o \ + pg_locale_icu.o \ + pg_locale_libc.o \ pg_lsn.o \ pg_upgrade_support.o \ pgstatfuncs.o \ diff --git a/src/backend/utils/adt/meson.build b/src/backend/utils/adt/meson.build index 8c6fc80c373..f73f294b8f5 100644 --- a/src/backend/utils/adt/meson.build +++ b/src/backend/utils/adt/meson.build @@ -66,6 +66,8 @@ backend_sources += files( 'orderedsetaggs.c', 'partitionfuncs.c', 'pg_locale.c', + 'pg_locale_icu.c', + 'pg_locale_libc.c', 'pg_lsn.c', 'pg_upgrade_support.c', 'pgstatfuncs.c', diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index cfba55a6e31..1802b7a1589 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -58,6 +58,8 @@ #include "catalog/pg_collation.h" #include "catalog/pg_database.h" #include "common/hashfn.h" +#include "common/unicode_case.h" +#include "common/unicode_category.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "utils/builtins.h" @@ -87,12 +89,6 @@ #define PGLOCALE_SUPPORT_ERROR(provider) \ elog(ERROR, "unsupported collprovider for %s: %c", __func__, provider) -/* - * This should be large enough that most strings will fit, but small enough - * that we feel comfortable putting it on the stack - */ -#define TEXTBUFLEN 1024 - #define MAX_L10N_DATA 80 @@ -119,7 +115,21 @@ char *localized_full_months[12 + 1]; /* is the databases's LC_CTYPE the C locale? 
*/ bool database_ctype_is_c = false; -static struct pg_locale_struct default_locale; +#ifdef USE_ICU +extern pg_locale_t icu_dat_create_locale(HeapTuple dattuple); +extern pg_locale_t icu_coll_create_locale(MemoryContext context, + ResourceOwner resowner, + HeapTuple colltuple); +extern UCollator *pg_ucol_open(const char *loc_str); +#endif + + +extern pg_locale_t libc_dat_create_locale(HeapTuple dattuple); +extern pg_locale_t libc_coll_create_locale(MemoryContext context, + ResourceOwner resowner, + HeapTuple colltuple); + +static pg_locale_t default_locale = NULL; /* indicates whether locale information cache is valid */ static bool CurrentLocaleConvValid = false; @@ -170,51 +180,48 @@ static pg_locale_t last_collation_cache_locale = NULL; static char *IsoLocaleName(const char *); #endif -#ifdef USE_ICU -/* - * Converter object for converting between ICU's UChar strings and C strings - * in database encoding. Since the database encoding doesn't change, we only - * need one of these per session. 
- */ -static UConverter *icu_converter = NULL; - -static UCollator *pg_ucol_open(const char *loc_str); -static void init_icu_converter(void); -static size_t uchar_length(UConverter *converter, - const char *str, int32_t len); -static int32_t uchar_convert(UConverter *converter, - UChar *dest, int32_t destlen, - const char *src, int32_t srclen); -static void icu_set_collation_attributes(UCollator *collator, const char *loc, - UErrorCode *status); - -static void ResourceOwnerRememberUCollator(ResourceOwner owner, - UCollator *collator); -static void ResOwnerReleaseUCollator(Datum val); - -static const ResourceOwnerDesc UCollatorResourceKind = -{ - .name = "UCollator reference", - .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, - .release_priority = RELEASE_PRIO_LAST, - .ReleaseResource = ResOwnerReleaseUCollator, - .DebugPrint = NULL /* the default message is fine */ -}; -#endif +static int +char_props_builtin(pg_wchar wc, int mask, pg_locale_t locale) +{ + int result = 0; + + if ((mask & PG_ISDIGIT) && pg_u_isdigit(wc, true)) + result |= PG_ISDIGIT; + if ((mask & PG_ISALPHA) && pg_u_isalpha(wc)) + result |= PG_ISALPHA; + if ((mask & PG_ISUPPER) && pg_u_isupper(wc)) + result |= PG_ISUPPER; + if ((mask & PG_ISLOWER) && pg_u_islower(wc)) + result |= PG_ISLOWER; + if ((mask & PG_ISGRAPH) && pg_u_isgraph(wc)) + result |= PG_ISGRAPH; + if ((mask & PG_ISPRINT) && pg_u_isprint(wc)) + result |= PG_ISPRINT; + if ((mask & PG_ISPUNCT) && pg_u_ispunct(wc, true)) + result |= PG_ISPUNCT; + if ((mask & PG_ISSPACE) && pg_u_isspace(wc)) + result |= PG_ISSPACE; + + return result; +} -static void ResourceOwnerRememberLocaleT(ResourceOwner owner, - locale_t locale); -static void ResOwnerReleaseLocaleT(Datum val); +static pg_wchar +toupper_builtin(pg_wchar wc, pg_locale_t locale) +{ + return unicode_uppercase_simple(wc); +} -static const ResourceOwnerDesc LocaleTResourceKind = +static pg_wchar +tolower_builtin(pg_wchar wc, pg_locale_t locale) { - .name = "locale_t reference", - 
.release_phase = RESOURCE_RELEASE_AFTER_LOCKS, - .release_priority = RELEASE_PRIO_LAST, - .ReleaseResource = ResOwnerReleaseLocaleT, - .DebugPrint = NULL /* the default message is fine */ -}; + return unicode_lowercase_simple(wc); +} +struct ctype_methods builtin_ctype_methods = { + .char_props = char_props_builtin, + .wc_toupper = toupper_builtin, + .wc_tolower = tolower_builtin, +}; /* * POSIX doesn't define _l-variants of these functions, but several systems @@ -1262,206 +1269,6 @@ IsoLocaleName(const char *winlocname) #endif /* WIN32 && LC_MESSAGES */ -/* simple subroutine for reporting errors from newlocale() */ -static void -report_newlocale_failure(const char *localename) -{ - int save_errno; - - /* - * Windows doesn't provide any useful error indication from - * _create_locale(), and BSD-derived platforms don't seem to feel they - * need to set errno either (even though POSIX is pretty clear that - * newlocale should do so). So, if errno hasn't been set, assume ENOENT - * is what to report. - */ - if (errno == 0) - errno = ENOENT; - - /* - * ENOENT means "no such locale", not "no such file", so clarify that - * errno with an errdetail message. - */ - save_errno = errno; /* auxiliary funcs might change errno */ - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not create locale \"%s\": %m", - localename), - (save_errno == ENOENT ? - errdetail("The operating system could not find any locale data for the locale name \"%s\".", - localename) : 0))); -} - -static void -ResourceOwnerRememberLocaleT(ResourceOwner owner, locale_t locale) -{ - ResourceOwnerRemember(owner, PointerGetDatum(locale), - &LocaleTResourceKind); -} - -static void -ResOwnerReleaseLocaleT(Datum val) -{ - locale_t locale = (locale_t) DatumGetPointer(val); - freelocale(locale); -} - -/* - * Create a locale_t with the given collation and ctype. - * - * The "C" and "POSIX" locales are not actually handled by libc, so return - * NULL. 
- * - * Ensure that no path leaks a locale_t. - */ -static locale_t -make_libc_collator(const char *collate, const char *ctype) -{ - locale_t loc = 0; - - if (strcmp(collate, ctype) == 0) - { - if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0) - { - /* Normal case where they're the same */ - errno = 0; -#ifndef WIN32 - loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate, - NULL); -#else - loc = _create_locale(LC_ALL, collate); -#endif - if (!loc) - report_newlocale_failure(collate); - } - } - else - { -#ifndef WIN32 - /* We need two newlocale() steps */ - locale_t loc1 = 0; - - if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0) - { - errno = 0; - loc1 = newlocale(LC_COLLATE_MASK, collate, NULL); - if (!loc1) - report_newlocale_failure(collate); - } - - if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0) - { - errno = 0; - loc = newlocale(LC_CTYPE_MASK, ctype, loc1); - if (!loc) - { - if (loc1) - freelocale(loc1); - report_newlocale_failure(ctype); - } - } - else - loc = loc1; -#else - - /* - * XXX The _create_locale() API doesn't appear to support this. Could - * perhaps be worked around by changing pg_locale_t to contain two - * separate fields. - */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("collations with different collate and ctype values are not supported on this platform"))); -#endif - } - - return loc; -} - -/* - * Create a UCollator with the given locale string and rules. - * - * Ensure that no path leaks a UCollator. 
- */ -#ifdef USE_ICU -static void -ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator) -{ - ResourceOwnerRemember(owner, PointerGetDatum(collator), - &UCollatorResourceKind); -} - -static void -ResOwnerReleaseUCollator(Datum val) -{ - UCollator *collator = (UCollator *) DatumGetPointer(val); - ucol_close(collator); -} - -static UCollator * -make_icu_collator(const char *iculocstr, const char *icurules) -{ - if (!icurules) - { - /* simple case without rules */ - return pg_ucol_open(iculocstr); - } - else - { - UCollator *collator_std_rules; - UCollator *collator_all_rules; - const UChar *std_rules; - UChar *my_rules; - UChar *all_rules; - int32_t length; - int32_t total; - UErrorCode status; - - /* - * If rules are specified, we extract the rules of the standard - * collation, add our own rules, and make a new collator with the - * combined rules. - */ - icu_to_uchar(&my_rules, icurules, strlen(icurules)); - - collator_std_rules = pg_ucol_open(iculocstr); - - std_rules = ucol_getRules(collator_std_rules, &length); - - total = u_strlen(std_rules) + u_strlen(my_rules) + 1; - - /* avoid leaking collator on OOM */ - all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM); - if (!all_rules) - { - ucol_close(collator_std_rules); - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - u_strcpy(all_rules, std_rules); - u_strcat(all_rules, my_rules); - - ucol_close(collator_std_rules); - - status = U_ZERO_ERROR; - collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules), - UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, - NULL, &status); - if (U_FAILURE(status)) - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s", - iculocstr, icurules, u_errorName(status)))); - } - - return collator_all_rules; - } -} -#endif /* not USE_ICU */ - /* * Initialize default_locale with database locale settings. 
 */ @@ -1471,6 +1278,7 @@ init_database_collation(void) HeapTuple tup; Form_pg_database dbform; Datum datum; + pg_locale_t result = NULL; /* Fetch our pg_database row normally, via syscache */ tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); @@ -1487,70 +1295,38 @@ init_database_collation(void) builtin_validate_locale(dbform->encoding, datlocale); - default_locale.collate_is_c = true; - default_locale.ctype_is_c = (strcmp(datlocale, "C") == 0); - - default_locale.info.builtin.locale = MemoryContextStrdup( + result = MemoryContextAllocZero(TopMemoryContext, + sizeof(struct pg_locale_struct)); + result->info.builtin.locale = MemoryContextStrdup( TopMemoryContext, datlocale); - } - else if (dbform->datlocprovider == COLLPROVIDER_ICU) - { -#ifdef USE_ICU - char *datlocale; - char *icurules; - bool isnull; - - datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); - datlocale = TextDatumGetCString(datum); - - default_locale.collate_is_c = false; - default_locale.ctype_is_c = false; + result->collate_is_c = true; + result->ctype_is_c = (strcmp(datlocale, "C") == 0); - datum = SysCacheGetAttr(DATABASEOID, tup, Anum_pg_database_daticurules, &isnull); - if (!isnull) - icurules = TextDatumGetCString(datum); - else - icurules = NULL; + if (!result->ctype_is_c) + result->ctype = &builtin_ctype_methods; - default_locale.info.icu.locale = MemoryContextStrdup(TopMemoryContext, datlocale); - default_locale.info.icu.ucol = make_icu_collator(datlocale, icurules); -#else /* not USE_ICU */ - /* could get here if a collation was created by a build with ICU */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ICU is not supported in this build"))); -#endif /* not USE_ICU */ } +#ifdef USE_ICU + else if (dbform->datlocprovider == COLLPROVIDER_ICU) + result = icu_dat_create_locale(tup); +#endif /* USE_ICU */ + else if (dbform->datlocprovider == COLLPROVIDER_LIBC) + result = libc_dat_create_locale(tup); else - { - const char 
*datcollate; - const char *datctype; - - Assert(dbform->datlocprovider == COLLPROVIDER_LIBC); - - datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datcollate); - datcollate = TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datctype); - datctype = TextDatumGetCString(datum); - - default_locale.collate_is_c = (strcmp(datcollate, "C") == 0) || - (strcmp(datcollate, "POSIX") == 0); - default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) || - (strcmp(datctype, "POSIX") == 0); + PGLOCALE_SUPPORT_ERROR(dbform->datlocprovider); - default_locale.info.lt = make_libc_collator(datcollate, datctype); - } - - default_locale.provider = dbform->datlocprovider; + result->provider = dbform->datlocprovider; /* * Default locale is currently always deterministic. Nondeterministic * locales currently don't support pattern matching, which would break a * lot of things if applied globally. */ - default_locale.deterministic = true; + result->deterministic = true; ReleaseSysCache(tup); + + default_locale = result; } /* @@ -1558,12 +1334,12 @@ init_database_collation(void) * allocating memory. 
*/ static pg_locale_t -create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid) +create_pg_locale(MemoryContext context, ResourceOwner resowner, Oid collid) { /* We haven't computed this yet in this session, so do it */ HeapTuple tp; Form_pg_collation collform; - pg_locale_t result; + pg_locale_t result = NULL; Datum datum; bool isnull; @@ -1631,65 +1407,19 @@ create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid) result->deterministic = collform->collisdeterministic; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); + if (!result->ctype_is_c) + result->ctype = &builtin_ctype_methods; result->info.builtin.locale = MemoryContextStrdup(context, locstr); } - else if (collform->collprovider == COLLPROVIDER_LIBC) - { - const char *collcollate; - const char *collctype; - locale_t locale; - - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); - collcollate = TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); - collctype = TextDatumGetCString(datum); - - ResourceOwnerEnlarge(owner); - locale = make_libc_collator(collcollate, collctype); - if (locale) - ResourceOwnerRememberLocaleT(owner, locale); - - result = MemoryContextAllocZero(context, - sizeof(struct pg_locale_struct)); - - result->provider = collform->collprovider; - result->deterministic = collform->collisdeterministic; - result->collate_is_c = (strcmp(collcollate, "C") == 0) || - (strcmp(collcollate, "POSIX") == 0); - result->ctype_is_c = (strcmp(collctype, "C") == 0) || - (strcmp(collctype, "POSIX") == 0); - result->info.lt = locale; - } +#ifdef USE_ICU else if (collform->collprovider == COLLPROVIDER_ICU) - { - const char *iculocstr; - const char *icurules; - UCollator *collator; - - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); - iculocstr = TextDatumGetCString(datum); - - datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, 
&isnull); - if (!isnull) - icurules = TextDatumGetCString(datum); - else - icurules = NULL; - - ResourceOwnerEnlarge(owner); - collator = make_icu_collator(iculocstr, icurules); - ResourceOwnerRememberUCollator(owner, collator); - - result = MemoryContextAllocZero(context, - sizeof(struct pg_locale_struct)); - - result->provider = collform->collprovider; - result->deterministic = collform->collisdeterministic; - result->collate_is_c = false; - result->ctype_is_c = false; - result->info.icu.locale = MemoryContextStrdup(context, iculocstr); - result->info.icu.ucol = collator; - } + result = icu_coll_create_locale(context, resowner, tp); +#endif + else if (collform->collprovider == COLLPROVIDER_LIBC) + result = libc_coll_create_locale(context, resowner, tp); + else + PGLOCALE_SUPPORT_ERROR(collform->collprovider); ReleaseSysCache(tp); @@ -1735,7 +1465,7 @@ pg_newlocale_from_collation(Oid collid) bool found; if (collid == DEFAULT_COLLATION_OID) - return &default_locale; + return default_locale; if (!OidIsValid(collid)) elog(ERROR, "cache lookup failed for collation %u", collid); @@ -1886,483 +1616,48 @@ get_collation_actual_version(char collprovider, const char *collcollate) } /* - * strncoll_libc_win32_utf8 - * - * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and - * invoke wcscoll_l(). + * pg_strcoll * - * An input string length of -1 means that it's NUL-terminated. + * Like pg_strncoll for NUL-terminated input strings. 
*/ -#ifdef WIN32 -static int -strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, - ssize_t len2, pg_locale_t locale) +int +pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) { - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - char *a1p, - *a2p; - int a1len; - int a2len; - int r; - int result; - - Assert(locale->provider == COLLPROVIDER_LIBC); - Assert(GetDatabaseEncoding() == PG_UTF8); -#ifndef WIN32 - Assert(false); -#endif - - if (len1 == -1) - len1 = strlen(arg1); - if (len2 == -1) - len2 = strlen(arg2); - - a1len = len1 * 2 + 2; - a2len = len2 * 2 + 2; - - if (a1len + a2len > TEXTBUFLEN) - buf = palloc(a1len + a2len); - - a1p = buf; - a2p = buf + a1len; - - /* API does not work for zero-length input */ - if (len1 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, - (LPWSTR) a1p, a1len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a1p)[r] = 0; - - if (len2 == 0) - r = 0; - else - { - r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, - (LPWSTR) a2p, a2len / 2); - if (!r) - ereport(ERROR, - (errmsg("could not convert string to UTF-16: error code %lu", - GetLastError()))); - } - ((LPWSTR) a2p)[r] = 0; - - errno = 0; - result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); - if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */ - ereport(ERROR, - (errmsg("could not compare Unicode strings: %m"))); - - if (buf != sbuf) - pfree(buf); - - return result; + return locale->collate->strncoll(arg1, -1, arg2, -1, locale); } -#endif /* WIN32 */ /* - * strncoll_libc + * pg_strncoll + * + * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as + * appropriate for the given locale, platform, and database encoding. If the + * locale is not specified, use the database collation. + * + * The input strings must be encoded in the database encoding. 
If an input + * string is NUL-terminated, its length may be specified as -1. * - * An input string length of -1 means that it's NUL-terminated. + * The caller is responsible for breaking ties if the collation is + * deterministic; this maintains consistency with pg_strnxfrm(), which cannot + * easily account for deterministic collations. */ -static int -strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, - pg_locale_t locale) +int +pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, + pg_locale_t locale) { - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1; - size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1; - const char *arg1n; - const char *arg2n; - int result; - - Assert(locale->provider == COLLPROVIDER_LIBC); - -#ifdef WIN32 - /* check for this case before doing the work for nul-termination */ - if (GetDatabaseEncoding() == PG_UTF8) - return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); -#endif /* WIN32 */ - - if (bufsize1 + bufsize2 > TEXTBUFLEN) - buf = palloc(bufsize1 + bufsize2); - - /* nul-terminate arguments if necessary */ - if (len1 == -1) - { - arg1n = arg1; - } - else - { - char *buf1 = buf; - memcpy(buf1, arg1, len1); - buf1[len1] = '\0'; - arg1n = buf1; - } - - if (len2 == -1) - { - arg2n = arg2; - } - else - { - char *buf2 = buf + bufsize1; - memcpy(buf2, arg2, len2); - buf2[len2] = '\0'; - arg2n = buf2; - } - - result = strcoll_l(arg1n, arg2n, locale->info.lt); - - if (buf != sbuf) - pfree(buf); - - return result; + return locale->collate->strncoll(arg1, len1, arg2, len2, locale); } -#ifdef USE_ICU - /* - * strncoll_icu_no_utf8 + * Return true if the collation provider supports pg_strxfrm() and + * pg_strnxfrm(); otherwise false. * - * Convert the arguments from the database encoding to UChar strings, then - * call ucol_strcoll(). An argument length of -1 means that the string is - * NUL-terminated. 
- * - * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(), - * caller should call that instead. - */ -static int -strncoll_icu_no_utf8(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, pg_locale_t locale) -{ - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - int32_t ulen1; - int32_t ulen2; - size_t bufsize1; - size_t bufsize2; - UChar *uchar1, - *uchar2; - int result; - - Assert(locale->provider == COLLPROVIDER_ICU); -#ifdef HAVE_UCOL_STRCOLLUTF8 - Assert(GetDatabaseEncoding() != PG_UTF8); -#endif - - init_icu_converter(); - - ulen1 = uchar_length(icu_converter, arg1, len1); - ulen2 = uchar_length(icu_converter, arg2, len2); - - bufsize1 = (ulen1 + 1) * sizeof(UChar); - bufsize2 = (ulen2 + 1) * sizeof(UChar); - - if (bufsize1 + bufsize2 > TEXTBUFLEN) - buf = palloc(bufsize1 + bufsize2); - - uchar1 = (UChar *) buf; - uchar2 = (UChar *) (buf + bufsize1); - - ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); - ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); - - result = ucol_strcoll(locale->info.icu.ucol, - uchar1, ulen1, - uchar2, ulen2); - - if (buf != sbuf) - pfree(buf); - - return result; -} - -/* - * strncoll_icu - * - * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given - * database encoding. An argument length of -1 means the string is - * NUL-terminated. 
- */ -static int -strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, - pg_locale_t locale) -{ - int result; - - Assert(locale->provider == COLLPROVIDER_ICU); - -#ifdef HAVE_UCOL_STRCOLLUTF8 - if (GetDatabaseEncoding() == PG_UTF8) - { - UErrorCode status; - - status = U_ZERO_ERROR; - result = ucol_strcollUTF8(locale->info.icu.ucol, - arg1, len1, - arg2, len2, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("collation failed: %s", u_errorName(status)))); - } - else -#endif - { - result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); - } - - return result; -} - -#endif /* USE_ICU */ - -/* - * pg_strcoll - * - * Like pg_strncoll for NUL-terminated input strings. - */ -int -pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) -{ - int result; - - if (locale->provider == COLLPROVIDER_LIBC) - result = strncoll_libc(arg1, -1, arg2, -1, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strncoll_icu(arg1, -1, arg2, -1, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; -} - -/* - * pg_strncoll - * - * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as - * appropriate for the given locale, platform, and database encoding. If the - * locale is not specified, use the database collation. - * - * The input strings must be encoded in the database encoding. If an input - * string is NUL-terminated, its length may be specified as -1. - * - * The caller is responsible for breaking ties if the collation is - * deterministic; this maintains consistency with pg_strnxfrm(), which cannot - * easily account for deterministic collations. 
- */ -int -pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, - pg_locale_t locale) -{ - int result; - - if (locale->provider == COLLPROVIDER_LIBC) - result = strncoll_libc(arg1, len1, arg2, len2, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strncoll_icu(arg1, len1, arg2, len2, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; -} - - -static size_t -strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, - pg_locale_t locale) -{ - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - size_t bufsize = srclen + 1; - size_t result; - - Assert(locale->provider == COLLPROVIDER_LIBC); - - if (srclen == -1) - return strxfrm_l(dest, src, destsize, locale->info.lt); - - if (bufsize > TEXTBUFLEN) - buf = palloc(bufsize); - - /* nul-terminate argument */ - memcpy(buf, src, srclen); - buf[srclen] = '\0'; - - result = strxfrm_l(dest, buf, destsize, locale->info.lt); - - if (buf != sbuf) - pfree(buf); - - /* if dest is defined, it should be nul-terminated */ - Assert(result >= destsize || dest[result] == '\0'); - - return result; -} - -#ifdef USE_ICU - -/* 'srclen' of -1 means the strings are NUL-terminated */ -static size_t -strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, - pg_locale_t locale) -{ - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - UChar *uchar; - int32_t ulen; - size_t uchar_bsize; - Size result_bsize; - - Assert(locale->provider == COLLPROVIDER_ICU); - - init_icu_converter(); - - ulen = uchar_length(icu_converter, src, srclen); - - uchar_bsize = (ulen + 1) * sizeof(UChar); - - if (uchar_bsize > TEXTBUFLEN) - buf = palloc(uchar_bsize); - - uchar = (UChar *) buf; - - ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); - - result_bsize = ucol_getSortKey(locale->info.icu.ucol, - uchar, ulen, - (uint8_t *) dest, destsize); - - /* - * ucol_getSortKey() counts the nul-terminator in the result 
length, but - * this function should not. - */ - Assert(result_bsize > 0); - result_bsize--; - - if (buf != sbuf) - pfree(buf); - - /* if dest is defined, it should be nul-terminated */ - Assert(result_bsize >= destsize || dest[result_bsize] == '\0'); - - return result_bsize; -} - -/* 'srclen' of -1 means the strings are NUL-terminated */ -static size_t -strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) -{ - char sbuf[TEXTBUFLEN]; - char *buf = sbuf; - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; - int32_t ulen = -1; - UChar *uchar = NULL; - size_t uchar_bsize; - Size result_bsize; - - Assert(locale->provider == COLLPROVIDER_ICU); - Assert(GetDatabaseEncoding() != PG_UTF8); - - init_icu_converter(); - - ulen = uchar_length(icu_converter, src, srclen); - - uchar_bsize = (ulen + 1) * sizeof(UChar); - - if (uchar_bsize > TEXTBUFLEN) - buf = palloc(uchar_bsize); - - uchar = (UChar *) buf; - - ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); - - uiter_setString(&iter, uchar, ulen); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, - &iter, - state, - (uint8_t *) dest, - destsize, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - - return result_bsize; -} - -/* 'srclen' of -1 means the strings are NUL-terminated */ -static size_t -strnxfrm_prefix_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) -{ - size_t result; - - Assert(locale->provider == COLLPROVIDER_ICU); - - if (GetDatabaseEncoding() == PG_UTF8) - { - UCharIterator iter; - uint32_t state[2]; - UErrorCode status; - - uiter_setUTF8(&iter, src, srclen); - state[0] = state[1] = 0; /* won't need that again */ - status = U_ZERO_ERROR; - result = ucol_nextSortKeyPart(locale->info.icu.ucol, - &iter, - state, - (uint8_t 
*) dest, - destsize, - &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("sort key generation failed: %s", - u_errorName(status)))); - } - else - result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen, - locale); - - return result; -} - -#endif - -/* - * Return true if the collation provider supports pg_strxfrm() and - * pg_strnxfrm(); otherwise false. - * - * Unfortunately, it seems that strxfrm() for non-C collations is broken on - * many common platforms; testing of multiple versions of glibc reveals that, - * for many locales, strcoll() and strxfrm() do not return consistent - * results. While no other libc other than Cygwin has so far been shown to - * have a problem, we take the conservative course of action for right now and - * disable this categorically. (Users who are certain this isn't a problem on - * their system can define TRUST_STRXFRM.) + * Unfortunately, it seems that strxfrm() for non-C collations is broken on + * many common platforms; testing of multiple versions of glibc reveals that, + * for many locales, strcoll() and strxfrm() do not return consistent + * results. While no other libc other than Cygwin has so far been shown to + * have a problem, we take the conservative course of action for right now and + * disable this categorically. (Users who are certain this isn't a problem on + * their system can define TRUST_STRXFRM.) * * No similar problem is known for the ICU provider. 
*/ @@ -2392,19 +1687,7 @@ pg_strxfrm_enabled(pg_locale_t locale) size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - - if (locale->provider == COLLPROVIDER_LIBC) - result = strnxfrm_libc(dest, destsize, src, -1, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strnxfrm_icu(dest, destsize, src, -1, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strnxfrm(dest, destsize, src, -1, locale); } /* @@ -2430,19 +1713,7 @@ size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - - if (locale->provider == COLLPROVIDER_LIBC) - result = strnxfrm_libc(dest, src, srclen, destsize, locale); -#ifdef USE_ICU - else if (locale->provider == COLLPROVIDER_ICU) - result = strnxfrm_icu(dest, src, srclen, destsize, locale); -#endif - else - /* shouldn't happen */ - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strnxfrm(dest, destsize, src, srclen, locale); } /* @@ -2472,7 +1743,7 @@ size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - return pg_strnxfrm_prefix(dest, destsize, src, -1, locale); + return locale->collate->strnxfrm_prefix(dest, destsize, src, -1, locale); } /* @@ -2497,16 +1768,9 @@ size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - -#ifdef USE_ICU - if (locale->provider == COLLPROVIDER_ICU) - result = strnxfrm_prefix_icu(dest, src, -1, destsize, locale); - else -#endif - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return locale->collate->strnxfrm_prefix(dest, destsize, + src, srclen, + locale); } /* @@ -2561,356 +1825,6 @@ builtin_validate_locale(int encoding, const char 
*locale) return canonical_name; } - -#ifdef USE_ICU - -/* - * Wrapper around ucol_open() to handle API differences for older ICU - * versions. - * - * Ensure that no path leaks a UCollator. - */ -static UCollator * -pg_ucol_open(const char *loc_str) -{ - UCollator *collator; - UErrorCode status; - const char *orig_str = loc_str; - char *fixed_str = NULL; - - /* - * Must never open default collator, because it depends on the environment - * and may change at any time. Should not happen, but check here to catch - * bugs that might be hard to catch otherwise. - * - * NB: the default collator is not the same as the collator for the root - * locale. The root locale may be specified as the empty string, "und", or - * "root". The default collator is opened by passing NULL to ucol_open(). - */ - if (loc_str == NULL) - elog(ERROR, "opening default collator is not supported"); - - /* - * In ICU versions 54 and earlier, "und" is not a recognized spelling of - * the root locale. If the first component of the locale is "und", replace - * with "root" before opening. 
- */ - if (U_ICU_VERSION_MAJOR_NUM < 55) - { - char lang[ULOC_LANG_CAPACITY]; - - status = U_ZERO_ERROR; - uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status); - if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) - { - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not get language from locale \"%s\": %s", - loc_str, u_errorName(status)))); - } - - if (strcmp(lang, "und") == 0) - { - const char *remainder = loc_str + strlen("und"); - - fixed_str = palloc(strlen("root") + strlen(remainder) + 1); - strcpy(fixed_str, "root"); - strcat(fixed_str, remainder); - - loc_str = fixed_str; - } - } - - status = U_ZERO_ERROR; - collator = ucol_open(loc_str, &status); - if (U_FAILURE(status)) - ereport(ERROR, - /* use original string for error report */ - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not open collator for locale \"%s\": %s", - orig_str, u_errorName(status)))); - - if (U_ICU_VERSION_MAJOR_NUM < 54) - { - status = U_ZERO_ERROR; - icu_set_collation_attributes(collator, loc_str, &status); - - /* - * Pretend the error came from ucol_open(), for consistent error - * message across ICU versions. 
- */ - if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) - { - ucol_close(collator); - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("could not open collator for locale \"%s\": %s", - orig_str, u_errorName(status)))); - } - } - - if (fixed_str != NULL) - pfree(fixed_str); - - return collator; -} - -static void -init_icu_converter(void) -{ - const char *icu_encoding_name; - UErrorCode status; - UConverter *conv; - - if (icu_converter) - return; /* already done */ - - icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding()); - if (!icu_encoding_name) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("encoding \"%s\" not supported by ICU", - pg_encoding_to_char(GetDatabaseEncoding())))); - - status = U_ZERO_ERROR; - conv = ucnv_open(icu_encoding_name, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("could not open ICU converter for encoding \"%s\": %s", - icu_encoding_name, u_errorName(status)))); - - icu_converter = conv; -} - -/* - * Find length, in UChars, of given string if converted to UChar string. - * - * A length of -1 indicates that the input string is NUL-terminated. - */ -static size_t -uchar_length(UConverter *converter, const char *str, int32_t len) -{ - UErrorCode status = U_ZERO_ERROR; - int32_t ulen; - - ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status); - if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); - return ulen; -} - -/* - * Convert the given source string into a UChar string, stored in dest, and - * return the length (in UChars). - * - * A srclen of -1 indicates that the input string is NUL-terminated. 
- */ -static int32_t -uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, - const char *src, int32_t srclen) -{ - UErrorCode status = U_ZERO_ERROR; - int32_t ulen; - - status = U_ZERO_ERROR; - ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status); - if (U_FAILURE(status)) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); - return ulen; -} - -/* - * Convert a string in the database encoding into a string of UChars. - * - * The source string at buff is of length nbytes - * (it needn't be nul-terminated) - * - * *buff_uchar receives a pointer to the palloc'd result string, and - * the function's result is the number of UChars generated. - * - * The result string is nul-terminated, though most callers rely on the - * result length instead. - */ -int32_t -icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) -{ - int32_t len_uchar; - - init_icu_converter(); - - len_uchar = uchar_length(icu_converter, buff, nbytes); - - *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar)); - len_uchar = uchar_convert(icu_converter, - *buff_uchar, len_uchar + 1, buff, nbytes); - - return len_uchar; -} - -/* - * Convert a string of UChars into the database encoding. - * - * The source string at buff_uchar is of length len_uchar - * (it needn't be nul-terminated) - * - * *result receives a pointer to the palloc'd result string, and the - * function's result is the number of bytes generated (not counting nul). - * - * The result string is nul-terminated. 
- */ -int32_t -icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) -{ - UErrorCode status; - int32_t len_result; - - init_icu_converter(); - - status = U_ZERO_ERROR; - len_result = ucnv_fromUChars(icu_converter, NULL, 0, - buff_uchar, len_uchar, &status); - if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_fromUChars", - u_errorName(status)))); - - *result = palloc(len_result + 1); - - status = U_ZERO_ERROR; - len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1, - buff_uchar, len_uchar, &status); - if (U_FAILURE(status) || - status == U_STRING_NOT_TERMINATED_WARNING) - ereport(ERROR, - (errmsg("%s failed: %s", "ucnv_fromUChars", - u_errorName(status)))); - - return len_result; -} - -/* - * Parse collation attributes from the given locale string and apply them to - * the open collator. - * - * First, the locale string is canonicalized to an ICU format locale ID such - * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies - * the key-value arguments. - * - * Starting with ICU version 54, the attributes are processed automatically by - * ucol_open(), so this is only necessary for emulating this behavior on older - * versions. - */ -pg_attribute_unused() -static void -icu_set_collation_attributes(UCollator *collator, const char *loc, - UErrorCode *status) -{ - int32_t len; - char *icu_locale_id; - char *lower_str; - char *str; - char *token; - - /* - * The input locale may be a BCP 47 language tag, e.g. - * "und-u-kc-ks-level1", which expresses the same attributes in a - * different form. It will be converted to the equivalent ICU format - * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by - * uloc_canonicalize(). 
- */ - *status = U_ZERO_ERROR; - len = uloc_canonicalize(loc, NULL, 0, status); - icu_locale_id = palloc(len + 1); - *status = U_ZERO_ERROR; - len = uloc_canonicalize(loc, icu_locale_id, len + 1, status); - if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING) - return; - - lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id)); - - pfree(icu_locale_id); - - str = strchr(lower_str, '@'); - if (!str) - return; - str++; - - while ((token = strsep(&str, ";"))) - { - char *e = strchr(token, '='); - - if (e) - { - char *name; - char *value; - UColAttribute uattr; - UColAttributeValue uvalue; - - *status = U_ZERO_ERROR; - - *e = '\0'; - name = token; - value = e + 1; - - /* - * See attribute name and value lists in ICU i18n/coll.cpp - */ - if (strcmp(name, "colstrength") == 0) - uattr = UCOL_STRENGTH; - else if (strcmp(name, "colbackwards") == 0) - uattr = UCOL_FRENCH_COLLATION; - else if (strcmp(name, "colcaselevel") == 0) - uattr = UCOL_CASE_LEVEL; - else if (strcmp(name, "colcasefirst") == 0) - uattr = UCOL_CASE_FIRST; - else if (strcmp(name, "colalternate") == 0) - uattr = UCOL_ALTERNATE_HANDLING; - else if (strcmp(name, "colnormalization") == 0) - uattr = UCOL_NORMALIZATION_MODE; - else if (strcmp(name, "colnumeric") == 0) - uattr = UCOL_NUMERIC_COLLATION; - else - /* ignore if unknown */ - continue; - - if (strcmp(value, "primary") == 0) - uvalue = UCOL_PRIMARY; - else if (strcmp(value, "secondary") == 0) - uvalue = UCOL_SECONDARY; - else if (strcmp(value, "tertiary") == 0) - uvalue = UCOL_TERTIARY; - else if (strcmp(value, "quaternary") == 0) - uvalue = UCOL_QUATERNARY; - else if (strcmp(value, "identical") == 0) - uvalue = UCOL_IDENTICAL; - else if (strcmp(value, "no") == 0) - uvalue = UCOL_OFF; - else if (strcmp(value, "yes") == 0) - uvalue = UCOL_ON; - else if (strcmp(value, "shifted") == 0) - uvalue = UCOL_SHIFTED; - else if (strcmp(value, "non-ignorable") == 0) - uvalue = UCOL_NON_IGNORABLE; - else if (strcmp(value, "lower") == 0) - 
uvalue = UCOL_LOWER_FIRST; - else if (strcmp(value, "upper") == 0) - uvalue = UCOL_UPPER_FIRST; - else - { - *status = U_ILLEGAL_ARGUMENT_ERROR; - break; - } - - ucol_setAttribute(collator, uattr, uvalue, status); - } - } - - pfree(lower_str); -} -#endif - /* * Return the BCP47 language tag representation of the requested locale. * @@ -3049,6 +1963,16 @@ icu_validate_locale(const char *loc_str) #endif /* not USE_ICU */ } +/* + * Dispatch to the locale's ctype char_props method. + * TODO: add caching? + */ +int +char_props(pg_wchar wc, int mask, pg_locale_t locale) +{ + return locale->ctype->char_props(wc, mask, locale); +} + /* * These functions convert from/to libc's wchar_t, *not* pg_wchar_t. * Therefore we keep them here rather than with the mbutils code. diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c new file mode 100644 index 00000000000..a9e8b4b642b --- /dev/null +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -0,0 +1,873 @@ +/*----------------------------------------------------------------------- + * + * PostgreSQL locale utilities for ICU + * + * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group + * + * src/backend/utils/adt/pg_locale_icu.c + * + *----------------------------------------------------------------------- + */ + + +#include "postgres.h" + +#ifdef USE_ICU + +#include <unicode/ucnv.h> +#include <unicode/ustring.h> + +#include "access/htup_details.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_database.h" +#include "utils/builtins.h" +#include "utils/formatting.h" +#include "utils/memutils.h" +#include "utils/pg_locale.h" +#include "utils/resowner.h" +#include "utils/syscache.h" + +/* + * This should be large enough that most strings will fit, but small enough + * that we feel comfortable putting it on the stack + */ +#define TEXTBUFLEN 1024 + +extern pg_locale_t icu_dat_create_locale(HeapTuple dattuple); +extern pg_locale_t icu_coll_create_locale(MemoryContext context, + ResourceOwner resowner, + HeapTuple colltuple); 
+extern UCollator *pg_ucol_open(const char *loc_str); + + +static UCollator * make_icu_collator(const char *iculocstr, + const char *icurules); + +static int strncoll_icu(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); +static size_t strnxfrm_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +static size_t strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + +static void ResourceOwnerRememberUCollator(ResourceOwner owner, + UCollator *collator); +static void ResOwnerReleaseUCollator(Datum val); + +static void init_icu_converter(void); +static size_t uchar_length(UConverter *converter, + const char *str, int32_t len); +static int32_t uchar_convert(UConverter *converter, + UChar *dest, int32_t destlen, + const char *src, int32_t srclen); +static void icu_set_collation_attributes(UCollator *collator, const char *loc, + UErrorCode *status); + +/* + * Converter object for converting between ICU's UChar strings and C strings + * in database encoding. Since the database encoding doesn't change, we only + * need one of these per session. 
+ */ +static UConverter *icu_converter = NULL; + +static const ResourceOwnerDesc UCollatorResourceKind = +{ + .name = "UCollator reference", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_LAST, + .ReleaseResource = ResOwnerReleaseUCollator, + .DebugPrint = NULL /* the default message is fine */ +}; + + +static int +char_props_icu(pg_wchar wc, int mask, pg_locale_t locale) +{ + int result = 0; + + if ((mask & PG_ISDIGIT) && u_isdigit(wc)) + result |= PG_ISDIGIT; + if ((mask & PG_ISALPHA) && u_isalpha(wc)) + result |= PG_ISALPHA; + if ((mask & PG_ISUPPER) && u_isupper(wc)) + result |= PG_ISUPPER; + if ((mask & PG_ISLOWER) && u_islower(wc)) + result |= PG_ISLOWER; + if ((mask & PG_ISGRAPH) && u_isgraph(wc)) + result |= PG_ISGRAPH; + if ((mask & PG_ISPRINT) && u_isprint(wc)) + result |= PG_ISPRINT; + if ((mask & PG_ISPUNCT) && u_ispunct(wc)) + result |= PG_ISPUNCT; + if ((mask & PG_ISSPACE) && u_isspace(wc)) + result |= PG_ISSPACE; + + return result; +} + +static pg_wchar +toupper_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_toupper(wc); +} + +static pg_wchar +tolower_icu(pg_wchar wc, pg_locale_t locale) +{ + return u_tolower(wc); +} + +struct collate_methods icu_collate_methods = { + .strncoll = strncoll_icu, + .strnxfrm = strnxfrm_icu, + .strnxfrm_prefix = strnxfrm_prefix_icu, +}; + +struct ctype_methods icu_ctype_methods = { + .char_props = char_props_icu, + .wc_toupper = toupper_icu, + .wc_tolower = tolower_icu, +}; + +static void +ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator) +{ + ResourceOwnerRemember(owner, PointerGetDatum(collator), + &UCollatorResourceKind); +} + +static void +ResOwnerReleaseUCollator(Datum val) +{ + UCollator *collator = (UCollator *) DatumGetPointer(val); + ucol_close(collator); +} + +pg_locale_t +icu_dat_create_locale(HeapTuple dattuple) +{ + Form_pg_database dbform; + Datum datum; + char *datlocale; + char *icurules; + bool isnull; + pg_locale_t result; + + dbform = 
(Form_pg_database) GETSTRUCT(dattuple); + + datum = SysCacheGetAttrNotNull(DATABASEOID, dattuple, Anum_pg_database_datlocale); + datlocale = TextDatumGetCString(datum); + + datum = SysCacheGetAttr(DATABASEOID, dattuple, Anum_pg_database_daticurules, &isnull); + if (!isnull) + icurules = TextDatumGetCString(datum); + else + icurules = NULL; + + result = MemoryContextAllocZero(TopMemoryContext, + sizeof(struct pg_locale_struct)); + + result->info.icu.locale = MemoryContextStrdup(TopMemoryContext, + datlocale); + result->provider = dbform->datlocprovider; + result->deterministic = true; + result->collate_is_c = false; + result->ctype_is_c = false; + result->collate = &icu_collate_methods; + result->ctype = &icu_ctype_methods; + result->info.icu.ucol = make_icu_collator(datlocale, icurules); + + return result; +} + +pg_locale_t +icu_coll_create_locale(MemoryContext context, ResourceOwner resowner, + HeapTuple colltuple) +{ + Form_pg_collation collform; + Datum datum; + bool isnull; + const char *iculocstr; + const char *icurules; + UCollator *collator; + pg_locale_t result; + + collform = (Form_pg_collation) GETSTRUCT(colltuple); + + Assert(collform->collprovider == COLLPROVIDER_ICU); + + datum = SysCacheGetAttrNotNull(COLLOID, colltuple, Anum_pg_collation_colllocale); + iculocstr = TextDatumGetCString(datum); + + datum = SysCacheGetAttr(COLLOID, colltuple, Anum_pg_collation_collicurules, &isnull); + if (!isnull) + icurules = TextDatumGetCString(datum); + else + icurules = NULL; + + ResourceOwnerEnlarge(resowner); + collator = make_icu_collator(iculocstr, icurules); + ResourceOwnerRememberUCollator(resowner, collator); + + result = MemoryContextAllocZero(context, + sizeof(struct pg_locale_struct)); + + result->info.icu.locale = MemoryContextStrdup(context, iculocstr); + result->provider = collform->collprovider; + result->deterministic = collform->collisdeterministic; + result->collate_is_c = false; + result->ctype_is_c = false; + result->collate = 
&icu_collate_methods; + result->ctype = &icu_ctype_methods; + result->info.icu.ucol = collator; + + return result; +} + +/* + * Create a UCollator with the given locale string and rules. + * + * Ensure that no path leaks a UCollator. + */ +static UCollator * +make_icu_collator(const char *iculocstr, const char *icurules) +{ + if (!icurules) + { + /* simple case without rules */ + return pg_ucol_open(iculocstr); + } + else + { + UCollator *collator_std_rules; + UCollator *collator_all_rules; + const UChar *std_rules; + UChar *my_rules; + UChar *all_rules; + int32_t length; + int32_t total; + UErrorCode status; + + /* + * If rules are specified, we extract the rules of the standard + * collation, add our own rules, and make a new collator with the + * combined rules. + */ + icu_to_uchar(&my_rules, icurules, strlen(icurules)); + + collator_std_rules = pg_ucol_open(iculocstr); + + std_rules = ucol_getRules(collator_std_rules, &length); + + total = u_strlen(std_rules) + u_strlen(my_rules) + 1; + + /* avoid leaking collator on OOM */ + all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM); + if (!all_rules) + { + ucol_close(collator_std_rules); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + u_strcpy(all_rules, std_rules); + u_strcat(all_rules, my_rules); + + ucol_close(collator_std_rules); + + status = U_ZERO_ERROR; + collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules), + UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, + NULL, &status); + if (U_FAILURE(status)) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s", + iculocstr, icurules, u_errorName(status)))); + } + + return collator_all_rules; + } +} + + +/* + * strncoll_icu_no_utf8 + * + * Convert the arguments from the database encoding to UChar strings, then + * call ucol_strcoll(). An argument length of -1 means that the string is + * NUL-terminated. 
+ * + * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(), + * caller should call that instead. + */ +static int +strncoll_icu_no_utf8(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + int32_t ulen1; + int32_t ulen2; + size_t bufsize1; + size_t bufsize2; + UChar *uchar1, + *uchar2; + int result; + + Assert(locale->provider == COLLPROVIDER_ICU); +#ifdef HAVE_UCOL_STRCOLLUTF8 + Assert(GetDatabaseEncoding() != PG_UTF8); +#endif + + init_icu_converter(); + + ulen1 = uchar_length(icu_converter, arg1, len1); + ulen2 = uchar_length(icu_converter, arg2, len2); + + bufsize1 = (ulen1 + 1) * sizeof(UChar); + bufsize2 = (ulen2 + 1) * sizeof(UChar); + + if (bufsize1 + bufsize2 > TEXTBUFLEN) + buf = palloc(bufsize1 + bufsize2); + + uchar1 = (UChar *) buf; + uchar2 = (UChar *) (buf + bufsize1); + + ulen1 = uchar_convert(icu_converter, uchar1, ulen1 + 1, arg1, len1); + ulen2 = uchar_convert(icu_converter, uchar2, ulen2 + 1, arg2, len2); + + result = ucol_strcoll(locale->info.icu.ucol, + uchar1, ulen1, + uchar2, ulen2); + + if (buf != sbuf) + pfree(buf); + + return result; +} + +/* + * strncoll_icu + * + * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given + * database encoding. An argument length of -1 means the string is + * NUL-terminated. 
+ */ +static int +strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, + pg_locale_t locale) +{ + int result; + + Assert(locale->provider == COLLPROVIDER_ICU); + +#ifdef HAVE_UCOL_STRCOLLUTF8 + if (GetDatabaseEncoding() == PG_UTF8) + { + UErrorCode status; + + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(locale->info.icu.ucol, + arg1, len1, + arg2, len2, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); + } + else +#endif + { + result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); + } + + return result; +} + + +/* 'srclen' of -1 means the source string is NUL-terminated */ +static size_t +strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + UChar *uchar; + int32_t ulen; + size_t uchar_bsize; + Size result_bsize; + + Assert(locale->provider == COLLPROVIDER_ICU); + + init_icu_converter(); + + ulen = uchar_length(icu_converter, src, srclen); + + uchar_bsize = (ulen + 1) * sizeof(UChar); + + if (uchar_bsize > TEXTBUFLEN) + buf = palloc(uchar_bsize); + + uchar = (UChar *) buf; + + ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); + + result_bsize = ucol_getSortKey(locale->info.icu.ucol, + uchar, ulen, + (uint8_t *) dest, destsize); + + /* + * ucol_getSortKey() counts the nul-terminator in the result length, but + * this function should not. 
+ */ + Assert(result_bsize > 0); + result_bsize--; + + if (buf != sbuf) + pfree(buf); + + /* if dest is defined, it should be nul-terminated */ + Assert(result_bsize >= destsize || dest[result_bsize] == '\0'); + + return result_bsize; +} + +/* 'srclen' of -1 means the source string is NUL-terminated */ +static size_t +strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; + int32_t ulen = -1; + UChar *uchar = NULL; + size_t uchar_bsize; + Size result_bsize; + + Assert(locale->provider == COLLPROVIDER_ICU); + Assert(GetDatabaseEncoding() != PG_UTF8); + + init_icu_converter(); + + ulen = uchar_length(icu_converter, src, srclen); + + uchar_bsize = (ulen + 1) * sizeof(UChar); + + if (uchar_bsize > TEXTBUFLEN) + buf = palloc(uchar_bsize); + + uchar = (UChar *) buf; + + ulen = uchar_convert(icu_converter, uchar, ulen + 1, src, srclen); + + uiter_setString(&iter, uchar, ulen); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + result_bsize = ucol_nextSortKeyPart(locale->info.icu.ucol, + &iter, + state, + (uint8_t *) dest, + destsize, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", + u_errorName(status)))); + + return result_bsize; +} + +/* 'srclen' of -1 means the source string is NUL-terminated */ +static size_t +strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) +{ + size_t result; + + Assert(locale->provider == COLLPROVIDER_ICU); + + if (GetDatabaseEncoding() == PG_UTF8) + { + UCharIterator iter; + uint32_t state[2]; + UErrorCode status; + + uiter_setUTF8(&iter, src, srclen); + state[0] = state[1] = 0; /* won't need that again */ + status = U_ZERO_ERROR; + result = ucol_nextSortKeyPart(locale->info.icu.ucol, + &iter, + state, + (uint8_t *) dest, + destsize, + &status); + if 
(U_FAILURE(status)) + ereport(ERROR, + (errmsg("sort key generation failed: %s", + u_errorName(status)))); + } + else + result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen, + locale); + + return result; +} + +/* + * Wrapper around ucol_open() to handle API differences for older ICU + * versions. + * + * Ensure that no path leaks a UCollator. + */ +UCollator * +pg_ucol_open(const char *loc_str) +{ + UCollator *collator; + UErrorCode status; + const char *orig_str = loc_str; + char *fixed_str = NULL; + + /* + * Must never open default collator, because it depends on the environment + * and may change at any time. Should not happen, but check here to catch + * bugs that might be hard to catch otherwise. + * + * NB: the default collator is not the same as the collator for the root + * locale. The root locale may be specified as the empty string, "und", or + * "root". The default collator is opened by passing NULL to ucol_open(). + */ + if (loc_str == NULL) + elog(ERROR, "opening default collator is not supported"); + + /* + * In ICU versions 54 and earlier, "und" is not a recognized spelling of + * the root locale. If the first component of the locale is "und", replace + * with "root" before opening. 
+ */ + if (U_ICU_VERSION_MAJOR_NUM < 55) + { + char lang[ULOC_LANG_CAPACITY]; + + status = U_ZERO_ERROR; + uloc_getLanguage(loc_str, lang, ULOC_LANG_CAPACITY, &status); + if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not get language from locale \"%s\": %s", + loc_str, u_errorName(status)))); + } + + if (strcmp(lang, "und") == 0) + { + const char *remainder = loc_str + strlen("und"); + + fixed_str = palloc(strlen("root") + strlen(remainder) + 1); + strcpy(fixed_str, "root"); + strcat(fixed_str, remainder); + + loc_str = fixed_str; + } + } + + status = U_ZERO_ERROR; + collator = ucol_open(loc_str, &status); + if (U_FAILURE(status)) + ereport(ERROR, + /* use original string for error report */ + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not open collator for locale \"%s\": %s", + orig_str, u_errorName(status)))); + + if (U_ICU_VERSION_MAJOR_NUM < 54) + { + status = U_ZERO_ERROR; + icu_set_collation_attributes(collator, loc_str, &status); + + /* + * Pretend the error came from ucol_open(), for consistent error + * message across ICU versions. 
+ */ + if (U_FAILURE(status) || status == U_STRING_NOT_TERMINATED_WARNING) + { + ucol_close(collator); + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not open collator for locale \"%s\": %s", + orig_str, u_errorName(status)))); + } + } + + if (fixed_str != NULL) + pfree(fixed_str); + + return collator; +} + +static void +init_icu_converter(void) +{ + const char *icu_encoding_name; + UErrorCode status; + UConverter *conv; + + if (icu_converter) + return; /* already done */ + + icu_encoding_name = get_encoding_name_for_icu(GetDatabaseEncoding()); + if (!icu_encoding_name) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("encoding \"%s\" not supported by ICU", + pg_encoding_to_char(GetDatabaseEncoding())))); + + status = U_ZERO_ERROR; + conv = ucnv_open(icu_encoding_name, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("could not open ICU converter for encoding \"%s\": %s", + icu_encoding_name, u_errorName(status)))); + + icu_converter = conv; +} + +/* + * Find length, in UChars, of given string if converted to UChar string. + * + * A length of -1 indicates that the input string is NUL-terminated. + */ +static size_t +uchar_length(UConverter *converter, const char *str, int32_t len) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t ulen; + + ulen = ucnv_toUChars(converter, NULL, 0, str, len, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); + return ulen; +} + +/* + * Convert the given source string into a UChar string, stored in dest, and + * return the length (in UChars). + * + * A srclen of -1 indicates that the input string is NUL-terminated. 
+ */ +static int32_t +uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, + const char *src, int32_t srclen) +{ + UErrorCode status = U_ZERO_ERROR; + int32_t ulen; + + status = U_ZERO_ERROR; + ulen = ucnv_toUChars(converter, dest, destlen, src, srclen, &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_toUChars", u_errorName(status)))); + return ulen; +} + +/* + * Convert a string in the database encoding into a string of UChars. + * + * The source string at buff is of length nbytes + * (it needn't be nul-terminated) + * + * *buff_uchar receives a pointer to the palloc'd result string, and + * the function's result is the number of UChars generated. + * + * The result string is nul-terminated, though most callers rely on the + * result length instead. + */ +int32_t +icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) +{ + int32_t len_uchar; + + init_icu_converter(); + + len_uchar = uchar_length(icu_converter, buff, nbytes); + + *buff_uchar = palloc((len_uchar + 1) * sizeof(**buff_uchar)); + len_uchar = uchar_convert(icu_converter, + *buff_uchar, len_uchar + 1, buff, nbytes); + + return len_uchar; +} + +/* + * Convert a string of UChars into the database encoding. + * + * The source string at buff_uchar is of length len_uchar + * (it needn't be nul-terminated) + * + * *result receives a pointer to the palloc'd result string, and the + * function's result is the number of bytes generated (not counting nul). + * + * The result string is nul-terminated. 
+ */ +int32_t +icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar) +{ + UErrorCode status; + int32_t len_result; + + init_icu_converter(); + + status = U_ZERO_ERROR; + len_result = ucnv_fromUChars(icu_converter, NULL, 0, + buff_uchar, len_uchar, &status); + if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_fromUChars", + u_errorName(status)))); + + *result = palloc(len_result + 1); + + status = U_ZERO_ERROR; + len_result = ucnv_fromUChars(icu_converter, *result, len_result + 1, + buff_uchar, len_uchar, &status); + if (U_FAILURE(status) || + status == U_STRING_NOT_TERMINATED_WARNING) + ereport(ERROR, + (errmsg("%s failed: %s", "ucnv_fromUChars", + u_errorName(status)))); + + return len_result; +} + +/* + * Parse collation attributes from the given locale string and apply them to + * the open collator. + * + * First, the locale string is canonicalized to an ICU format locale ID such + * as "und@colStrength=primary;colCaseLevel=yes". Then, it parses and applies + * the key-value arguments. + * + * Starting with ICU version 54, the attributes are processed automatically by + * ucol_open(), so this is only necessary for emulating this behavior on older + * versions. + */ +pg_attribute_unused() +static void +icu_set_collation_attributes(UCollator *collator, const char *loc, + UErrorCode *status) +{ + int32_t len; + char *icu_locale_id; + char *lower_str; + char *str; + char *token; + + /* + * The input locale may be a BCP 47 language tag, e.g. + * "und-u-kc-ks-level1", which expresses the same attributes in a + * different form. It will be converted to the equivalent ICU format + * locale ID, e.g. "und@colcaselevel=yes;colstrength=primary", by + * uloc_canonicalize(). 
+ */ + *status = U_ZERO_ERROR; + len = uloc_canonicalize(loc, NULL, 0, status); + icu_locale_id = palloc(len + 1); + *status = U_ZERO_ERROR; + len = uloc_canonicalize(loc, icu_locale_id, len + 1, status); + if (U_FAILURE(*status) || *status == U_STRING_NOT_TERMINATED_WARNING) + return; + + lower_str = asc_tolower(icu_locale_id, strlen(icu_locale_id)); + + pfree(icu_locale_id); + + str = strchr(lower_str, '@'); + if (!str) + return; + str++; + + while ((token = strsep(&str, ";"))) + { + char *e = strchr(token, '='); + + if (e) + { + char *name; + char *value; + UColAttribute uattr; + UColAttributeValue uvalue; + + *status = U_ZERO_ERROR; + + *e = '\0'; + name = token; + value = e + 1; + + /* + * See attribute name and value lists in ICU i18n/coll.cpp + */ + if (strcmp(name, "colstrength") == 0) + uattr = UCOL_STRENGTH; + else if (strcmp(name, "colbackwards") == 0) + uattr = UCOL_FRENCH_COLLATION; + else if (strcmp(name, "colcaselevel") == 0) + uattr = UCOL_CASE_LEVEL; + else if (strcmp(name, "colcasefirst") == 0) + uattr = UCOL_CASE_FIRST; + else if (strcmp(name, "colalternate") == 0) + uattr = UCOL_ALTERNATE_HANDLING; + else if (strcmp(name, "colnormalization") == 0) + uattr = UCOL_NORMALIZATION_MODE; + else if (strcmp(name, "colnumeric") == 0) + uattr = UCOL_NUMERIC_COLLATION; + else + /* ignore if unknown */ + continue; + + if (strcmp(value, "primary") == 0) + uvalue = UCOL_PRIMARY; + else if (strcmp(value, "secondary") == 0) + uvalue = UCOL_SECONDARY; + else if (strcmp(value, "tertiary") == 0) + uvalue = UCOL_TERTIARY; + else if (strcmp(value, "quaternary") == 0) + uvalue = UCOL_QUATERNARY; + else if (strcmp(value, "identical") == 0) + uvalue = UCOL_IDENTICAL; + else if (strcmp(value, "no") == 0) + uvalue = UCOL_OFF; + else if (strcmp(value, "yes") == 0) + uvalue = UCOL_ON; + else if (strcmp(value, "shifted") == 0) + uvalue = UCOL_SHIFTED; + else if (strcmp(value, "non-ignorable") == 0) + uvalue = UCOL_NON_IGNORABLE; + else if (strcmp(value, "lower") == 0) + 
uvalue = UCOL_LOWER_FIRST; + else if (strcmp(value, "upper") == 0) + uvalue = UCOL_UPPER_FIRST; + else + { + *status = U_ILLEGAL_ARGUMENT_ERROR; + break; + } + + ucol_setAttribute(collator, uattr, uvalue, status); + } + } + + pfree(lower_str); +} + +#endif /* USE_ICU */ diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c new file mode 100644 index 00000000000..6eb8b80fdf9 --- /dev/null +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -0,0 +1,604 @@ +/*----------------------------------------------------------------------- + * + * PostgreSQL locale utilities for libc + * + * Portions Copyright (c) 2002-2024, PostgreSQL Global Development Group + * + * src/backend/utils/adt/pg_locale_libc.c + * + *----------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <limits.h> +#include <wctype.h> + +#include "access/htup_details.h" +#include "catalog/pg_collation.h" +#include "catalog/pg_database.h" +#include "utils/builtins.h" +#include "utils/memutils.h" +#include "utils/pg_locale.h" +#include "utils/resowner.h" +#include "utils/syscache.h" + +/* + * This should be large enough that most strings will fit, but small enough + * that we feel comfortable putting it on the stack + */ +#define TEXTBUFLEN 1024 + +extern pg_locale_t libc_dat_create_locale(HeapTuple dattuple); +extern pg_locale_t libc_coll_create_locale(MemoryContext context, + ResourceOwner resowner, + HeapTuple colltuple); + +static int strncoll_libc(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); +static size_t strnxfrm_libc(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + +static int char_props_libc(pg_wchar wc, int mask, pg_locale_t locale); +static pg_wchar toupper_libc(pg_wchar wc, pg_locale_t locale); +static pg_wchar tolower_libc(pg_wchar wc, pg_locale_t locale); + +static void ResourceOwnerRememberLocaleT(ResourceOwner resowner, + 
locale_t locale); +static void ResOwnerReleaseLocaleT(Datum val); + +static locale_t make_libc_collator(const char *collate, const char *ctype); + +static void report_newlocale_failure(const char *localename); + +static const ResourceOwnerDesc LocaleTResourceKind = +{ + .name = "locale_t reference", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_LAST, + .ReleaseResource = ResOwnerReleaseLocaleT, + .DebugPrint = NULL /* the default message is fine */ +}; + +struct collate_methods libc_collate_methods = { + .strncoll = strncoll_libc, + .strnxfrm = strnxfrm_libc, + .strnxfrm_prefix = NULL, +}; + +struct ctype_methods libc_ctype_methods = { + .char_props = char_props_libc, + .wc_toupper = toupper_libc, + .wc_tolower = tolower_libc, +}; + +pg_locale_t +libc_dat_create_locale(HeapTuple dattuple) +{ + Form_pg_database dbform; + Datum datum; + const char *datcollate; + const char *datctype; + pg_locale_t result; + + dbform = (Form_pg_database) GETSTRUCT(dattuple); + + Assert(dbform->datlocprovider == COLLPROVIDER_LIBC); + + datum = SysCacheGetAttrNotNull(DATABASEOID, dattuple, Anum_pg_database_datcollate); + datcollate = TextDatumGetCString(datum); + datum = SysCacheGetAttrNotNull(DATABASEOID, dattuple, Anum_pg_database_datctype); + datctype = TextDatumGetCString(datum); + + result = MemoryContextAllocZero(TopMemoryContext, + sizeof(struct pg_locale_struct)); + + result->provider = dbform->datlocprovider; + result->deterministic = true; + result->collate_is_c = (strcmp(datcollate, "C") == 0) || + (strcmp(datcollate, "POSIX") == 0); + result->ctype_is_c = (strcmp(datctype, "C") == 0) || + (strcmp(datctype, "POSIX") == 0); + + if (!result->collate_is_c) + result->collate = &libc_collate_methods; + if (!result->ctype_is_c) + result->ctype = &libc_ctype_methods; + result->info.lt = make_libc_collator(datcollate, datctype); + + return result; +} + +pg_locale_t +libc_coll_create_locale(MemoryContext context, ResourceOwner resowner, + 
HeapTuple colltuple) +{ + Form_pg_collation collform; + Datum datum; + const char *collcollate; + const char *collctype; + locale_t locale; + pg_locale_t result; + + collform = (Form_pg_collation) GETSTRUCT(colltuple); + + datum = SysCacheGetAttrNotNull(COLLOID, colltuple, Anum_pg_collation_collcollate); + collcollate = TextDatumGetCString(datum); + datum = SysCacheGetAttrNotNull(COLLOID, colltuple, Anum_pg_collation_collctype); + collctype = TextDatumGetCString(datum); + + ResourceOwnerEnlarge(resowner); + locale = make_libc_collator(collcollate, collctype); + if (locale) + ResourceOwnerRememberLocaleT(resowner, locale); + + result = MemoryContextAllocZero(context, + sizeof(struct pg_locale_struct)); + + result->provider = collform->collprovider; + result->deterministic = collform->collisdeterministic; + result->collate_is_c = (strcmp(collcollate, "C") == 0) || + (strcmp(collcollate, "POSIX") == 0); + result->ctype_is_c = (strcmp(collctype, "C") == 0) || + (strcmp(collctype, "POSIX") == 0); + if (!result->collate_is_c) + result->collate = &libc_collate_methods; + if (!result->ctype_is_c) + result->ctype = &libc_ctype_methods; + result->info.lt = locale; + + return result; +} + +static void +ResourceOwnerRememberLocaleT(ResourceOwner resowner, locale_t locale) +{ + ResourceOwnerRemember(resowner, PointerGetDatum(locale), + &LocaleTResourceKind); +} + +static void +ResOwnerReleaseLocaleT(Datum val) +{ + locale_t locale = (locale_t) DatumGetPointer(val); +#ifndef WIN32 + freelocale(locale); +#else + _free_locale(locale); +#endif +} + +static int +char_props_libc(pg_wchar wc, int mask, pg_locale_t locale) +{ + int result = 0; + + Assert(!locale->ctype_is_c); + + if (mask & PG_ISDIGIT) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswdigit_l((wint_t) wc, locale->info.lt)) + result |= PG_ISDIGIT; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + isdigit_l((unsigned char) wc, locale->info.lt)) + result |= 
PG_ISDIGIT; + } + } + if (mask & PG_ISALPHA) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswalpha_l((wint_t) wc, locale->info.lt)) + result |= PG_ISALPHA; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + isalpha_l((unsigned char) wc, locale->info.lt)) + result |= PG_ISALPHA; + } + } + if (mask & PG_ISUPPER) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswupper_l((wint_t) wc, locale->info.lt)) + result |= PG_ISUPPER; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + isupper_l((unsigned char) wc, locale->info.lt)) + result |= PG_ISUPPER; + } + } + if (mask & PG_ISLOWER) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswlower_l((wint_t) wc, locale->info.lt)) + result |= PG_ISLOWER; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + islower_l((unsigned char) wc, locale->info.lt)) + result |= PG_ISLOWER; + } + } + if (mask & PG_ISGRAPH) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswgraph_l((wint_t) wc, locale->info.lt)) + result |= PG_ISGRAPH; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + isgraph_l((unsigned char) wc, locale->info.lt)) + result |= PG_ISGRAPH; + } + } + if (mask & PG_ISPRINT) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswprint_l((wint_t) wc, locale->info.lt)) + result |= PG_ISPRINT; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + isprint_l((unsigned char) wc, locale->info.lt)) + result |= PG_ISPRINT; + } + } + if (mask & PG_ISPUNCT) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswpunct_l((wint_t) wc, locale->info.lt)) + result |= PG_ISPUNCT; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + ispunct_l((unsigned char) wc, locale->info.lt)) + result |= 
PG_ISPUNCT; + } + } + if (mask & PG_ISSPACE) + { + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + { + if (iswspace_l((wint_t) wc, locale->info.lt)) + result |= PG_ISSPACE; + } + else + { + if (wc <= (pg_wchar) UCHAR_MAX && + isspace_l((unsigned char) wc, locale->info.lt)) + result |= PG_ISSPACE; + } + } + + return result; +} + +static pg_wchar +toupper_libc(pg_wchar wc, pg_locale_t locale) +{ + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + return towupper_l((wint_t) wc, locale->info.lt); + else if (wc <= (pg_wchar) UCHAR_MAX) + return toupper_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +static pg_wchar +tolower_libc(pg_wchar wc, pg_locale_t locale) +{ + if (GetDatabaseEncoding() == PG_UTF8 && + (sizeof(wchar_t) >= 4 || wc <= (pg_wchar) 0xFFFF)) + return towlower_l((wint_t) wc, locale->info.lt); + else if (wc <= (pg_wchar) UCHAR_MAX) + return tolower_l((unsigned char) wc, locale->info.lt); + else + return wc; +} + +/* + * Create a locale_t with the given collation and ctype. + * + * The "C" and "POSIX" locales are not actually handled by libc, so return + * NULL. + * + * Ensure that no path leaks a locale_t. 
+ */ +static locale_t +make_libc_collator(const char *collate, const char *ctype) +{ + locale_t loc = 0; + + if (strcmp(collate, ctype) == 0) + { + if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0) + { + /* Normal case where they're the same */ + errno = 0; +#ifndef WIN32 + loc = newlocale(LC_COLLATE_MASK | LC_CTYPE_MASK, collate, + NULL); +#else + loc = _create_locale(LC_ALL, collate); +#endif + if (!loc) + report_newlocale_failure(collate); + } + } + else + { +#ifndef WIN32 + /* We need two newlocale() steps */ + locale_t loc1 = 0; + + if (strcmp(collate, "C") != 0 && strcmp(collate, "POSIX") != 0) + { + errno = 0; + loc1 = newlocale(LC_COLLATE_MASK, collate, NULL); + if (!loc1) + report_newlocale_failure(collate); + } + + if (strcmp(ctype, "C") != 0 && strcmp(ctype, "POSIX") != 0) + { + errno = 0; + loc = newlocale(LC_CTYPE_MASK, ctype, loc1); + if (!loc) + { + if (loc1) + freelocale(loc1); + report_newlocale_failure(ctype); + } + } + else + loc = loc1; +#else + + /* + * XXX The _create_locale() API doesn't appear to support this. Could + * perhaps be worked around by changing pg_locale_t to contain two + * separate fields. + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("collations with different collate and ctype values are not supported on this platform"))); +#endif + } + + return loc; +} + +/* + * strncoll_libc_win32_utf8 + * + * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and + * invoke wcscoll_l(). + * + * An input string length of -1 means that it's NUL-terminated. 
+ */ +#ifdef WIN32 +static int +strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, + ssize_t len2, pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + char *a1p, + *a2p; + int a1len; + int a2len; + int r; + int result; + + Assert(locale->provider == COLLPROVIDER_LIBC); + Assert(GetDatabaseEncoding() == PG_UTF8); +#ifndef WIN32 + Assert(false); +#endif + + if (len1 == -1) + len1 = strlen(arg1); + if (len2 == -1) + len2 = strlen(arg2); + + a1len = len1 * 2 + 2; + a2len = len2 * 2 + 2; + + if (a1len + a2len > TEXTBUFLEN) + buf = palloc(a1len + a2len); + + a1p = buf; + a2p = buf + a1len; + + /* API does not work for zero-length input */ + if (len1 == 0) + r = 0; + else + { + r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1, + (LPWSTR) a1p, a1len / 2); + if (!r) + ereport(ERROR, + (errmsg("could not convert string to UTF-16: error code %lu", + GetLastError()))); + } + ((LPWSTR) a1p)[r] = 0; + + if (len2 == 0) + r = 0; + else + { + r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2, + (LPWSTR) a2p, a2len / 2); + if (!r) + ereport(ERROR, + (errmsg("could not convert string to UTF-16: error code %lu", + GetLastError()))); + } + ((LPWSTR) a2p)[r] = 0; + + errno = 0; + result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, locale->info.lt); + if (result == 2147483647) /* _NLSCMPERROR; missing from mingw headers */ + ereport(ERROR, + (errmsg("could not compare Unicode strings: %m"))); + + if (buf != sbuf) + pfree(buf); + + return result; +} +#endif /* WIN32 */ + +/* + * strncoll_libc + * + * An input string length of -1 means that it's NUL-terminated. + */ +static int +strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1; + size_t bufsize2 = (len2 == -1) ? 
0 : len2 + 1; + const char *arg1n; + const char *arg2n; + int result; + + Assert(locale->provider == COLLPROVIDER_LIBC); + +#ifdef WIN32 + /* check for this case before doing the work for nul-termination */ + if (GetDatabaseEncoding() == PG_UTF8) + return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); +#endif /* WIN32 */ + + if (bufsize1 + bufsize2 > TEXTBUFLEN) + buf = palloc(bufsize1 + bufsize2); + + /* nul-terminate arguments if necessary */ + if (len1 == -1) + { + arg1n = arg1; + } + else + { + char *buf1 = buf; + memcpy(buf1, arg1, len1); + buf1[len1] = '\0'; + arg1n = buf1; + } + + if (len2 == -1) + { + arg2n = arg2; + } + else + { + char *buf2 = buf + bufsize1; + memcpy(buf2, arg2, len2); + buf2[len2] = '\0'; + arg2n = buf2; + } + + result = strcoll_l(arg1n, arg2n, locale->info.lt); + + if (buf != sbuf) + pfree(buf); + + return result; +} + +static size_t +strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) +{ + char sbuf[TEXTBUFLEN]; + char *buf = sbuf; + size_t bufsize = srclen + 1; + size_t result; + + Assert(locale->provider == COLLPROVIDER_LIBC); + + if (srclen == -1) + return strxfrm_l(dest, src, destsize, locale->info.lt); + + if (bufsize > TEXTBUFLEN) + buf = palloc(bufsize); + + /* nul-terminate argument */ + memcpy(buf, src, srclen); + buf[srclen] = '\0'; + + result = strxfrm_l(dest, buf, destsize, locale->info.lt); + + if (buf != sbuf) + pfree(buf); + + /* if dest is defined, it should be nul-terminated */ + Assert(result >= destsize || dest[result] == '\0'); + + return result; +} + +/* simple subroutine for reporting errors from newlocale() */ +static void +report_newlocale_failure(const char *localename) +{ + int save_errno; + + /* + * Windows doesn't provide any useful error indication from + * _create_locale(), and BSD-derived platforms don't seem to feel they + * need to set errno either (even though POSIX is pretty clear that + * newlocale should do so). 
So, if errno hasn't been set, assume ENOENT + * is what to report. + */ + if (errno == 0) + errno = ENOENT; + + /* + * ENOENT means "no such locale", not "no such file", so clarify that + * errno with an errdetail message. + */ + save_errno = errno; /* auxiliary funcs might change errno */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not create locale \"%s\": %m", + localename), + (save_errno == ENOENT ? + errdetail("The operating system could not find any locale data for the locale name \"%s\".", + localename) : 0))); +} + diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 3b443df8014..95ba7940b95 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -12,6 +12,7 @@ #ifndef _PG_LOCALE_ #define _PG_LOCALE_ +#include "mb/pg_wchar.h" #if defined(LOCALE_T_IN_XLOCALE) || defined(WCSTOMBS_L_IN_XLOCALE) #include <xlocale.h> #endif @@ -19,6 +20,19 @@ #include <unicode/ucol.h> #endif +/* + * Character properties for regular expressions. + */ +#define PG_ISDIGIT 0x01 +#define PG_ISALPHA 0x02 +#define PG_ISALNUM (PG_ISDIGIT | PG_ISALPHA) +#define PG_ISUPPER 0x04 +#define PG_ISLOWER 0x08 +#define PG_ISGRAPH 0x10 +#define PG_ISPRINT 0x20 +#define PG_ISPUNCT 0x40 +#define PG_ISSPACE 0x80 + #ifdef USE_ICU /* * ucol_strcollUTF8() was introduced in ICU 50, but it is buggy before ICU 53. 
@@ -62,6 +76,28 @@ extern struct lconv *PGLC_localeconv(void); extern void cache_locale_time(void); +struct pg_locale_struct; +typedef struct pg_locale_struct *pg_locale_t; + +struct collate_methods +{ + int (*strncoll)(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale); + size_t (*strnxfrm)(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); + size_t (*strnxfrm_prefix)(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale); +}; + +struct ctype_methods +{ + int (*char_props)(pg_wchar wc, int mask, pg_locale_t locale); + pg_wchar (*wc_toupper)(pg_wchar wc, pg_locale_t locale); + pg_wchar (*wc_tolower)(pg_wchar wc, pg_locale_t locale); +}; /* * We use a discriminated union to hold either a locale_t or an ICU collator. @@ -85,6 +121,10 @@ struct pg_locale_struct bool deterministic; bool collate_is_c; bool ctype_is_c; + + struct collate_methods *collate; + struct ctype_methods *ctype; + union { struct @@ -102,8 +142,6 @@ struct pg_locale_struct } info; }; -typedef struct pg_locale_struct *pg_locale_t; - extern void init_database_collation(void); extern pg_locale_t pg_newlocale_from_collation(Oid collid); @@ -132,6 +170,8 @@ extern int32_t icu_to_uchar(UChar **buff_uchar, const char *buff, size_t nbytes) extern int32_t icu_from_uchar(char **result, const UChar *buff_uchar, int32_t len_uchar); #endif +extern int char_props(pg_wchar wc, int mask, pg_locale_t locale); + /* These functions convert from/to libc's wchar_t, *not* pg_wchar_t */ extern size_t wchar2char(char *to, const wchar_t *from, size_t tolen, pg_locale_t locale); -- 2.34.1 [text/x-patch] v4-0006-Allow-length-1-for-NUL-terminated-input-to-pg_str.patch (19.5K, 3-v4-0006-Allow-length-1-for-NUL-terminated-input-to-pg_str.patch) download | inline diff: From 56e4fbb3ccd2927e4cc92b4201632361e2b16abb Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Wed, 21 Aug 2024 10:59:28 -0700 Subject: [PATCH 
v4 6/7] Allow length=-1 for NUL-terminated input to pg_strncoll(), etc. Like ICU, allow a length of -1 to be specified for NUL-terminated arguments to pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix(). Simplifies the code and comments. --- src/backend/utils/adt/pg_locale.c | 256 ++++++++++++------------------ src/include/utils/pg_locale.h | 8 +- 2 files changed, 104 insertions(+), 160 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index c89ac3b9e01..cfba55a6e31 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1886,22 +1886,24 @@ get_collation_actual_version(char collprovider, const char *collcollate) } /* - * pg_strncoll_libc_win32_utf8 + * strncoll_libc_win32_utf8 * * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and * invoke wcscoll_l(). + * + * An input string length of -1 means that it's NUL-terminated. */ #ifdef WIN32 static int -pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, - size_t len2, pg_locale_t locale) +strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, + ssize_t len2, pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; char *a1p, *a2p; - int a1len = len1 * 2 + 2; - int a2len = len2 * 2 + 2; + int a1len; + int a2len; int r; int result; @@ -1911,6 +1913,14 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, Assert(false); #endif + if (len1 == -1) + len1 = strlen(arg1); + if (len2 == -1) + len2 = strlen(arg2); + + a1len = len1 * 2 + 2; + a2len = len2 * 2 + 2; + if (a1len + a2len > TEXTBUFLEN) buf = palloc(a1len + a2len); @@ -1958,50 +1968,20 @@ pg_strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, #endif /* WIN32 */ /* - * pg_strcoll_libc + * strncoll_libc * - * Call strcoll_l() or wcscoll_l() as appropriate for the given locale, - * platform, and database encoding. If the locale is NULL, use the database - * collation. 
- * - * Arguments must be encoded in the database encoding and nul-terminated. + * An input string length of -1 means that it's NUL-terminated. */ static int -pg_strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale) -{ - int result; - - Assert(locale->provider == COLLPROVIDER_LIBC); -#ifdef WIN32 - if (GetDatabaseEncoding() == PG_UTF8) - { - size_t len1 = strlen(arg1); - size_t len2 = strlen(arg2); - - result = pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); - } - else -#endif /* WIN32 */ - result = strcoll_l(arg1, arg2, locale->info.lt); - - return result; -} - -/* - * pg_strncoll_libc - * - * Nul-terminate the arguments and call pg_strcoll_libc(). - */ -static int -pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, - pg_locale_t locale) +strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; - size_t bufsize1 = len1 + 1; - size_t bufsize2 = len2 + 1; - char *arg1n; - char *arg2n; + size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1; + size_t bufsize2 = (len2 == -1) ? 
0 : len2 + 1; + const char *arg1n; + const char *arg2n; int result; Assert(locale->provider == COLLPROVIDER_LIBC); @@ -2009,22 +1989,38 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, #ifdef WIN32 /* check for this case before doing the work for nul-termination */ if (GetDatabaseEncoding() == PG_UTF8) - return pg_strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); + return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); #endif /* WIN32 */ if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); - arg1n = buf; - arg2n = buf + bufsize1; + /* nul-terminate arguments if necessary */ + if (len1 == -1) + { + arg1n = arg1; + } + else + { + char *buf1 = buf; + memcpy(buf1, arg1, len1); + buf1[len1] = '\0'; + arg1n = buf1; + } - /* nul-terminate arguments */ - memcpy(arg1n, arg1, len1); - arg1n[len1] = '\0'; - memcpy(arg2n, arg2, len2); - arg2n[len2] = '\0'; + if (len2 == -1) + { + arg2n = arg2; + } + else + { + char *buf2 = buf + bufsize1; + memcpy(buf2, arg2, len2); + buf2[len2] = '\0'; + arg2n = buf2; + } - result = pg_strcoll_libc(arg1n, arg2n, locale); + result = strcoll_l(arg1n, arg2n, locale->info.lt); if (buf != sbuf) pfree(buf); @@ -2035,7 +2031,7 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, #ifdef USE_ICU /* - * pg_strncoll_icu_no_utf8 + * strncoll_icu_no_utf8 * * Convert the arguments from the database encoding to UChar strings, then * call ucol_strcoll(). An argument length of -1 means that the string is @@ -2045,8 +2041,8 @@ pg_strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, * caller should call that instead. 
*/ static int -pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1, - const char *arg2, int32_t len2, pg_locale_t locale) +strncoll_icu_no_utf8(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -2091,17 +2087,15 @@ pg_strncoll_icu_no_utf8(const char *arg1, int32_t len1, } /* - * pg_strncoll_icu + * strncoll_icu * * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given * database encoding. An argument length of -1 means the string is * NUL-terminated. - * - * Arguments must be encoded in the database encoding. */ static int -pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2, - pg_locale_t locale) +strncoll_icu(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, + pg_locale_t locale) { int result; @@ -2124,7 +2118,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2, else #endif { - result = pg_strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); + result = strncoll_icu_no_utf8(arg1, len1, arg2, len2, locale); } return result; @@ -2135,15 +2129,7 @@ pg_strncoll_icu(const char *arg1, int32_t len1, const char *arg2, int32_t len2, /* * pg_strcoll * - * Call ucol_strcollUTF8(), ucol_strcoll(), strcoll_l() or wcscoll_l() as - * appropriate for the given locale, platform, and database encoding. If the - * locale is not specified, use the database collation. - * - * Arguments must be encoded in the database encoding and nul-terminated. - * - * The caller is responsible for breaking ties if the collation is - * deterministic; this maintains consistency with pg_strxfrm(), which cannot - * easily account for deterministic collations. + * Like pg_strncoll for NUL-terminated input strings. 
*/ int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) @@ -2151,10 +2137,10 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) int result; if (locale->provider == COLLPROVIDER_LIBC) - result = pg_strcoll_libc(arg1, arg2, locale); + result = strncoll_libc(arg1, -1, arg2, -1, locale); #ifdef USE_ICU else if (locale->provider == COLLPROVIDER_ICU) - result = pg_strncoll_icu(arg1, -1, arg2, -1, locale); + result = strncoll_icu(arg1, -1, arg2, -1, locale); #endif else /* shouldn't happen */ @@ -2170,27 +2156,24 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) * appropriate for the given locale, platform, and database encoding. If the * locale is not specified, use the database collation. * - * Arguments must be encoded in the database encoding. - * - * This function may need to nul-terminate the arguments for libc functions; - * so if the caller already has nul-terminated strings, it should call - * pg_strcoll() instead. + * The input strings must be encoded in the database encoding. If an input + * string is NUL-terminated, its length may be specified as -1. * * The caller is responsible for breaking ties if the collation is * deterministic; this maintains consistency with pg_strnxfrm(), which cannot * easily account for deterministic collations. 
*/ int -pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, +pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, pg_locale_t locale) { int result; if (locale->provider == COLLPROVIDER_LIBC) - result = pg_strncoll_libc(arg1, len1, arg2, len2, locale); + result = strncoll_libc(arg1, len1, arg2, len2, locale); #ifdef USE_ICU else if (locale->provider == COLLPROVIDER_ICU) - result = pg_strncoll_icu(arg1, len1, arg2, len2, locale); + result = strncoll_icu(arg1, len1, arg2, len2, locale); #endif else /* shouldn't happen */ @@ -2201,16 +2184,8 @@ pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, static size_t -pg_strxfrm_libc(char *dest, const char *src, size_t destsize, - pg_locale_t locale) -{ - Assert(locale->provider == COLLPROVIDER_LIBC); - return strxfrm_l(dest, src, destsize, locale->info.lt); -} - -static size_t -pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, - pg_locale_t locale) +strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -2219,14 +2194,17 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, Assert(locale->provider == COLLPROVIDER_LIBC); + if (srclen == -1) + return strxfrm_l(dest, src, destsize, locale->info.lt); + if (bufsize > TEXTBUFLEN) buf = palloc(bufsize); - /* nul-terminate arguments */ + /* nul-terminate argument */ memcpy(buf, src, srclen); buf[srclen] = '\0'; - result = pg_strxfrm_libc(dest, buf, destsize, locale); + result = strxfrm_l(dest, buf, destsize, locale->info.lt); if (buf != sbuf) pfree(buf); @@ -2241,8 +2219,8 @@ pg_strnxfrm_libc(char *dest, const char *src, size_t srclen, size_t destsize, /* 'srclen' of -1 means the strings are NUL-terminated */ static size_t -pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize, - pg_locale_t locale) +strnxfrm_icu(char *dest, size_t destsize, const char 
*src, ssize_t srclen, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -2288,8 +2266,9 @@ pg_strnxfrm_icu(char *dest, const char *src, int32_t srclen, int32_t destsize, /* 'srclen' of -1 means the strings are NUL-terminated */ static size_t -pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen, - int32_t destsize, pg_locale_t locale) +strnxfrm_prefix_icu_no_utf8(char *dest,size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -2336,8 +2315,9 @@ pg_strnxfrm_prefix_icu_no_utf8(char *dest, const char *src, int32_t srclen, /* 'srclen' of -1 means the strings are NUL-terminated */ static size_t -pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen, - int32_t destsize, pg_locale_t locale) +strnxfrm_prefix_icu(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) { size_t result; @@ -2364,8 +2344,8 @@ pg_strnxfrm_prefix_icu(char *dest, const char *src, int32_t srclen, u_errorName(status)))); } else - result = pg_strnxfrm_prefix_icu_no_utf8(dest, src, srclen, destsize, - locale); + result = strnxfrm_prefix_icu_no_utf8(dest, destsize, src, srclen, + locale); return result; } @@ -2407,20 +2387,7 @@ pg_strxfrm_enabled(pg_locale_t locale) /* * pg_strxfrm * - * Transforms 'src' to a nul-terminated string stored in 'dest' such that - * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on - * untransformed strings. - * - * The provided 'src' must be nul-terminated. If 'destsize' is zero, 'dest' - * may be NULL. - * - * Not all providers support pg_strxfrm() safely. The caller should check - * pg_strxfrm_enabled() first, otherwise this function may return wrong - * results or an error. - * - * Returns the number of bytes needed (or more) to store the transformed - * string, excluding the terminating nul byte. If the value returned is - * 'destsize' or greater, the resulting contents of 'dest' are undefined. 
+ * Like pg_strnxfrm for a NUL-terminated input string. */ size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) @@ -2428,10 +2395,10 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) size_t result = 0; /* keep compiler quiet */ if (locale->provider == COLLPROVIDER_LIBC) - result = pg_strxfrm_libc(dest, src, destsize, locale); + result = strnxfrm_libc(dest, destsize, src, -1, locale); #ifdef USE_ICU else if (locale->provider == COLLPROVIDER_ICU) - result = pg_strnxfrm_icu(dest, src, -1, destsize, locale); + result = strnxfrm_icu(dest, destsize, src, -1, locale); #endif else /* shouldn't happen */ @@ -2447,8 +2414,9 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on * untransformed strings. * - * 'src' does not need to be nul-terminated. If 'destsize' is zero, 'dest' may - * be NULL. + * The input string must be encoded in the database encoding. If the input + * string is NUL-terminated, its length may be specified as -1. If 'destsize' + * is zero, 'dest' may be NULL. * * Not all providers support pg_strnxfrm() safely. The caller should check * pg_strxfrm_enabled() first, otherwise this function may return wrong @@ -2457,22 +2425,18 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) * Returns the number of bytes needed (or more) to store the transformed * string, excluding the terminating nul byte. If the value returned is * 'destsize' or greater, the resulting contents of 'dest' are undefined. - * - * This function may need to nul-terminate the argument for libc functions; - * so if the caller already has a nul-terminated string, it should call - * pg_strxfrm() instead. 
*/ size_t -pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen, +pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen, pg_locale_t locale) { size_t result = 0; /* keep compiler quiet */ if (locale->provider == COLLPROVIDER_LIBC) - result = pg_strnxfrm_libc(dest, src, srclen, destsize, locale); + result = strnxfrm_libc(dest, destsize, src, srclen, locale); #ifdef USE_ICU else if (locale->provider == COLLPROVIDER_ICU) - result = pg_strnxfrm_icu(dest, src, srclen, destsize, locale); + result = strnxfrm_icu(dest, destsize, src, srclen, locale); #endif else /* shouldn't happen */ @@ -2502,44 +2466,24 @@ pg_strxfrm_prefix_enabled(pg_locale_t locale) /* * pg_strxfrm_prefix * - * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary - * memcmp() on the byte sequence is equivalent to pg_strcoll() on - * untransformed strings. The result is not nul-terminated. - * - * The provided 'src' must be nul-terminated. - * - * Not all providers support pg_strxfrm_prefix() safely. The caller should - * check pg_strxfrm_prefix_enabled() first, otherwise this function may return - * wrong results or an error. - * - * If destsize is not large enough to hold the resulting byte sequence, stores - * only the first destsize bytes in 'dest'. Returns the number of bytes - * actually copied to 'dest'. 
*/ size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - size_t result = 0; /* keep compiler quiet */ - -#ifdef USE_ICU - if (locale->provider == COLLPROVIDER_ICU) - result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale); - else -#endif - PGLOCALE_SUPPORT_ERROR(locale->provider); - - return result; + return pg_strnxfrm_prefix(dest, destsize, src, -1, locale); } /* * pg_strnxfrm_prefix * * Transforms 'src' to a byte sequence stored in 'dest' such that ordinary - * memcmp() on the byte sequence is equivalent to pg_strcoll() on + * memcmp() on the byte sequence is equivalent to pg_strncoll() on * untransformed strings. The result is not nul-terminated. * - * The provided 'src' must be nul-terminated. + * The input string must be encoded in the database encoding. If the input + * string is NUL-terminated, its length may be specified as -1. * * Not all providers support pg_strnxfrm_prefix() safely. The caller should * check pg_strxfrm_prefix_enabled() first, otherwise this function may return @@ -2548,20 +2492,16 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, * If destsize is not large enough to hold the resulting byte sequence, stores * only the first destsize bytes in 'dest'. Returns the number of bytes * actually copied to 'dest'. - * - * This function may need to nul-terminate the argument for libc functions; - * so if the caller already has a nul-terminated string, it should call - * pg_strxfrm_prefix() instead. 
*/ size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, - size_t srclen, pg_locale_t locale) + ssize_t srclen, pg_locale_t locale) { size_t result = 0; /* keep compiler quiet */ #ifdef USE_ICU if (locale->provider == COLLPROVIDER_ICU) - result = pg_strnxfrm_prefix_icu(dest, src, -1, destsize, locale); + result = strnxfrm_prefix_icu(dest, destsize, src, srclen, locale); else #endif PGLOCALE_SUPPORT_ERROR(locale->provider); @@ -2744,6 +2684,8 @@ init_icu_converter(void) /* * Find length, in UChars, of given string if converted to UChar string. + * + * A length of -1 indicates that the input string is NUL-terminated. */ static size_t uchar_length(UConverter *converter, const char *str, int32_t len) @@ -2761,6 +2703,8 @@ uchar_length(UConverter *converter, const char *str, int32_t len) /* * Convert the given source string into a UChar string, stored in dest, and * return the length (in UChars). + * + * A srclen of -1 indicates that the input string is NUL-terminated. */ static int32_t uchar_convert(UConverter *converter, UChar *dest, int32_t destlen, diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index c2d95411e0a..3b443df8014 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -109,18 +109,18 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); -extern int pg_strncoll(const char *arg1, size_t len1, - const char *arg2, size_t len2, pg_locale_t locale); +extern int pg_strncoll(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, pg_locale_t locale); extern bool pg_strxfrm_enabled(pg_locale_t locale); extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, - size_t srclen, pg_locale_t locale); + ssize_t 
srclen, pg_locale_t locale); extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale); extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, - size_t srclen, pg_locale_t locale); + ssize_t srclen, pg_locale_t locale); extern int builtin_locale_encoding(const char *locale); extern const char *builtin_validate_locale(int encoding, const char *locale); -- 2.34.1 [text/x-patch] v4-0005-invalidation.patch (2.7K, 4-v4-0005-invalidation.patch) download | inline diff: From 2f51247615a36dc257b700c2832f3d4aa32fce64 Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Wed, 18 Sep 2024 17:49:57 -0700 Subject: [PATCH v4 5/7] invalidation --- src/backend/utils/adt/pg_locale.c | 41 +++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 9d1d71f1561..c89ac3b9e01 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -63,6 +63,7 @@ #include "utils/builtins.h" #include "utils/formatting.h" #include "utils/guc_hooks.h" +#include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_locale.h" @@ -1695,6 +1696,34 @@ create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid) return result; } +static void +CollationCacheInvalidate(Datum arg, int cacheid, uint32 hashvalue) +{ + last_collation_cache_oid = InvalidOid; + + if (CollationCache == NULL) + return; + + ResourceOwnerRelease(CollationCacheOwner, + RESOURCE_RELEASE_BEFORE_LOCKS, + false, true); + ResourceOwnerRelease(CollationCacheOwner, + RESOURCE_RELEASE_LOCKS, + false, true); + ResourceOwnerRelease(CollationCacheOwner, + RESOURCE_RELEASE_AFTER_LOCKS, + false, true); + ResourceOwnerDelete(CollationCacheOwner); + CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache"); + + 
MemoryContextReset(CollationCacheContext); + + /* free all memory and reset hash table */ + CollationCache = collation_cache_create(CollationCacheContext, + 16, NULL); +} + + /* * Create or retrieve a pg_locale_t for the given collation OID. Results are * cached for the lifetime of the backend. @@ -1714,14 +1743,7 @@ pg_newlocale_from_collation(Oid collid) if (last_collation_cache_oid == collid) return last_collation_cache_locale; - /* - * Cache mechanism for collation information. - * - * Note that we currently lack any way to flush the cache. Since we don't - * support ALTER COLLATION, this is OK. The worst case is that someone - * drops a collation, and a useless cache entry hangs around in existing - * backends. - */ + /* cache mechanism for collation information */ if (CollationCache == NULL) { CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache"); @@ -1730,6 +1752,9 @@ pg_newlocale_from_collation(Oid collid) ALLOCSET_DEFAULT_SIZES); CollationCache = collation_cache_create(CollationCacheContext, 16, NULL); + CacheRegisterSyscacheCallback(COLLOID, + CollationCacheInvalidate, + (Datum) 0); } cache_entry = collation_cache_insert(CollationCache, collid, &found); -- 2.34.1 [text/x-patch] v4-0004-resource-owners.patch (5.1K, 5-v4-0004-resource-owners.patch) download | inline diff: From 5ae3b1be6489617a1639141749c31d2f4419a676 Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Wed, 18 Sep 2024 16:55:42 -0700 Subject: [PATCH v4 4/7] resource owners --- src/backend/utils/adt/pg_locale.c | 74 ++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index d3d9c3920e6..9d1d71f1561 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -66,6 +66,7 @@ #include "utils/lsyscache.h" #include "utils/memutils.h" #include "utils/pg_locale.h" +#include "utils/resowner.h" #include "utils/syscache.h" #ifdef 
USE_ICU @@ -148,6 +149,12 @@ typedef struct #define SH_DEFINE #include "lib/simplehash.h" +/* + * Collator objects (UCollator for ICU or locale_t for libc) are allocated in + * an external library, so track them using a resource owner. + */ +static ResourceOwner CollationCacheOwner = NULL; + static MemoryContext CollationCacheContext = NULL; static collation_cache_hash *CollationCache = NULL; @@ -179,8 +186,35 @@ static int32_t uchar_convert(UConverter *converter, const char *src, int32_t srclen); static void icu_set_collation_attributes(UCollator *collator, const char *loc, UErrorCode *status); + +static void ResourceOwnerRememberUCollator(ResourceOwner owner, + UCollator *collator); +static void ResOwnerReleaseUCollator(Datum val); + +static const ResourceOwnerDesc UCollatorResourceKind = +{ + .name = "UCollator reference", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_LAST, + .ReleaseResource = ResOwnerReleaseUCollator, + .DebugPrint = NULL /* the default message is fine */ +}; #endif +static void ResourceOwnerRememberLocaleT(ResourceOwner owner, + locale_t locale); +static void ResOwnerReleaseLocaleT(Datum val); + +static const ResourceOwnerDesc LocaleTResourceKind = +{ + .name = "locale_t reference", + .release_phase = RESOURCE_RELEASE_AFTER_LOCKS, + .release_priority = RELEASE_PRIO_LAST, + .ReleaseResource = ResOwnerReleaseLocaleT, + .DebugPrint = NULL /* the default message is fine */ +}; + + /* * POSIX doesn't define _l-variants of these functions, but several systems * have them. We provide our own replacements here. 
@@ -1257,6 +1291,20 @@ report_newlocale_failure(const char *localename) localename) : 0))); } +static void +ResourceOwnerRememberLocaleT(ResourceOwner owner, locale_t locale) +{ + ResourceOwnerRemember(owner, PointerGetDatum(locale), + &LocaleTResourceKind); +} + +static void +ResOwnerReleaseLocaleT(Datum val) +{ + locale_t locale = (locale_t) DatumGetPointer(val); + freelocale(locale); +} + /* * Create a locale_t with the given collation and ctype. * @@ -1335,6 +1383,20 @@ make_libc_collator(const char *collate, const char *ctype) * Ensure that no path leaks a UCollator. */ #ifdef USE_ICU +static void +ResourceOwnerRememberUCollator(ResourceOwner owner, UCollator *collator) +{ + ResourceOwnerRemember(owner, PointerGetDatum(collator), + &UCollatorResourceKind); +} + +static void +ResOwnerReleaseUCollator(Datum val) +{ + UCollator *collator = (UCollator *) DatumGetPointer(val); + ucol_close(collator); +} + static UCollator * make_icu_collator(const char *iculocstr, const char *icurules) { @@ -1495,7 +1557,7 @@ init_database_collation(void) * allocating memory. 
*/ static pg_locale_t -create_pg_locale(MemoryContext context, Oid collid) +create_pg_locale(MemoryContext context, ResourceOwner owner, Oid collid) { /* We haven't computed this yet in this session, so do it */ HeapTuple tp; @@ -1582,7 +1644,10 @@ create_pg_locale(MemoryContext context, Oid collid) datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); collctype = TextDatumGetCString(datum); + ResourceOwnerEnlarge(owner); locale = make_libc_collator(collcollate, collctype); + if (locale) + ResourceOwnerRememberLocaleT(owner, locale); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); @@ -1610,7 +1675,9 @@ create_pg_locale(MemoryContext context, Oid collid) else icurules = NULL; + ResourceOwnerEnlarge(owner); collator = make_icu_collator(iculocstr, icurules); + ResourceOwnerRememberUCollator(owner, collator); result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); @@ -1657,6 +1724,7 @@ pg_newlocale_from_collation(Oid collid) */ if (CollationCache == NULL) { + CollationCacheOwner = ResourceOwnerCreate(NULL, "collation cache"); CollationCacheContext = AllocSetContextCreate(TopMemoryContext, "collation cache", ALLOCSET_DEFAULT_SIZES); @@ -1675,7 +1743,9 @@ pg_newlocale_from_collation(Oid collid) } if (cache_entry->locale == 0) - cache_entry->locale = create_pg_locale(CollationCacheContext, collid); + cache_entry->locale = create_pg_locale(CollationCacheContext, + CollationCacheOwner, + collid); last_collation_cache_oid = collid; last_collation_cache_locale = cache_entry->locale; -- 2.34.1 [text/x-patch] v4-0003-CollationCacheContext.patch (2.6K, 6-v4-0003-CollationCacheContext.patch) download | inline diff: From fca0efa184971f9780b356039aa3ed08a7445524 Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Wed, 18 Sep 2024 15:55:37 -0700 Subject: [PATCH v4 3/7] CollationCacheContext --- src/backend/utils/adt/pg_locale.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff 
--git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 1dec00b55ed..d3d9c3920e6 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1495,7 +1495,7 @@ init_database_collation(void) * allocating memory. */ static pg_locale_t -create_pg_locale(Oid collid) +create_pg_locale(MemoryContext context, Oid collid) { /* We haven't computed this yet in this session, so do it */ HeapTuple tp; @@ -1561,15 +1561,15 @@ create_pg_locale(Oid collid) builtin_validate_locale(GetDatabaseEncoding(), locstr); - result = MemoryContextAllocZero(TopMemoryContext, + result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->provider = collform->collprovider; result->deterministic = collform->collisdeterministic; result->collate_is_c = true; result->ctype_is_c = (strcmp(locstr, "C") == 0); - result->info.builtin.locale = MemoryContextStrdup(TopMemoryContext, - locstr); + result->info.builtin.locale = MemoryContextStrdup(context, + locstr); } else if (collform->collprovider == COLLPROVIDER_LIBC) { @@ -1584,7 +1584,7 @@ create_pg_locale(Oid collid) locale = make_libc_collator(collcollate, collctype); - result = MemoryContextAllocZero(TopMemoryContext, + result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->provider = collform->collprovider; @@ -1612,14 +1612,14 @@ create_pg_locale(Oid collid) collator = make_icu_collator(iculocstr, icurules); - result = MemoryContextAllocZero(TopMemoryContext, + result = MemoryContextAllocZero(context, sizeof(struct pg_locale_struct)); result->provider = collform->collprovider; result->deterministic = collform->collisdeterministic; result->collate_is_c = false; result->ctype_is_c = false; - result->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr); + result->info.icu.locale = MemoryContextStrdup(context, iculocstr); result->info.icu.ucol = collator; } @@ -1675,7 +1675,7 @@ pg_newlocale_from_collation(Oid collid) } if 
(cache_entry->locale == 0) - cache_entry->locale = create_pg_locale(collid); + cache_entry->locale = create_pg_locale(CollationCacheContext, collid); last_collation_cache_oid = collid; last_collation_cache_locale = cache_entry->locale; -- 2.34.1 [text/x-patch] v4-0002-create_pg_locale.patch (12.6K, 7-v4-0002-create_pg_locale.patch) download | inline diff: From eccc4a4a83069c6a14465b4a9239a4d759aaa2a8 Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Wed, 18 Sep 2024 15:53:56 -0700 Subject: [PATCH v4 2/7] create_pg_locale --- src/backend/utils/adt/pg_locale.c | 310 +++++++++++++++--------------- 1 file changed, 155 insertions(+), 155 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 12ba5726f77..1dec00b55ed 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1227,45 +1227,6 @@ IsoLocaleName(const char *winlocname) #endif /* WIN32 && LC_MESSAGES */ -/* - * Cache mechanism for collation information. - * - * Note that we currently lack any way to flush the cache. Since we don't - * support ALTER COLLATION, this is OK. The worst case is that someone - * drops a collation, and a useless cache entry hangs around in existing - * backends. - */ -static collation_cache_entry * -lookup_collation_cache(Oid collation) -{ - collation_cache_entry *cache_entry; - bool found; - - Assert(OidIsValid(collation)); - Assert(collation != DEFAULT_COLLATION_OID); - - if (CollationCache == NULL) - { - CollationCacheContext = AllocSetContextCreate(TopMemoryContext, - "collation cache", - ALLOCSET_DEFAULT_SIZES); - CollationCache = collation_cache_create(CollationCacheContext, - 16, NULL); - } - - cache_entry = collation_cache_insert(CollationCache, collation, &found); - if (!found) - { - /* - * Make sure cache entry is marked invalid, in case we fail before - * setting things. 
- */ - cache_entry->locale = 0; - } - - return cache_entry; -} - /* simple subroutine for reporting errors from newlocale() */ static void report_newlocale_failure(const char *localename) @@ -1530,153 +1491,192 @@ init_database_collation(void) } /* - * Create a pg_locale_t from a collation OID. Results are cached for the - * lifetime of the backend. Thus, do not free the result with freelocale(). - * - * For simplicity, we always generate COLLATE + CTYPE even though we - * might only need one of them. Since this is called only once per session, - * it shouldn't cost much. + * Create and initialize a pg_locale_t. Be careful to check for errors before + * allocating memory. */ -pg_locale_t -pg_newlocale_from_collation(Oid collid) +static pg_locale_t +create_pg_locale(Oid collid) { - collation_cache_entry *cache_entry; - - if (collid == DEFAULT_COLLATION_OID) - return &default_locale; + /* We haven't computed this yet in this session, so do it */ + HeapTuple tp; + Form_pg_collation collform; + pg_locale_t result; + Datum datum; + bool isnull; - if (!OidIsValid(collid)) + tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); + if (!HeapTupleIsValid(tp)) elog(ERROR, "cache lookup failed for collation %u", collid); + collform = (Form_pg_collation) GETSTRUCT(tp); - if (last_collation_cache_oid == collid) - return last_collation_cache_locale; - - cache_entry = lookup_collation_cache(collid); - - if (cache_entry->locale == 0) + /* compare version in catalog to version from provider */ + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, + &isnull); + if (!isnull) { - /* We haven't computed this yet in this session, so do it */ - HeapTuple tp; - Form_pg_collation collform; - struct pg_locale_struct result; - pg_locale_t resultp; - Datum datum; - bool isnull; + char *actual_versionstr; + char *collversionstr; - tp = SearchSysCache1(COLLOID, ObjectIdGetDatum(collid)); - if (!HeapTupleIsValid(tp)) - elog(ERROR, "cache lookup failed for collation %u", 
collid); - collform = (Form_pg_collation) GETSTRUCT(tp); + collversionstr = TextDatumGetCString(datum); - /* We'll fill in the result struct locally before allocating memory */ - memset(&result, 0, sizeof(result)); - result.provider = collform->collprovider; - result.deterministic = collform->collisdeterministic; + if (collform->collprovider == COLLPROVIDER_LIBC) + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); + else + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); - if (collform->collprovider == COLLPROVIDER_BUILTIN) + actual_versionstr = get_collation_actual_version(collform->collprovider, + TextDatumGetCString(datum)); + if (!actual_versionstr) { - const char *locstr; + /* + * This could happen when specifying a version in CREATE + * COLLATION but the provider does not support versioning, or + * manually creating a mess in the catalogs. + */ + ereport(ERROR, + (errmsg("collation \"%s\" has no actual version, but a version was recorded", + NameStr(collform->collname)))); + } - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); - locstr = TextDatumGetCString(datum); + if (strcmp(actual_versionstr, collversionstr) != 0) + ereport(WARNING, + (errmsg("collation \"%s\" has version mismatch", + NameStr(collform->collname)), + errdetail("The collation in the database was created using version %s, " + "but the operating system provides version %s.", + collversionstr, actual_versionstr), + errhint("Rebuild all objects affected by this collation and run " + "ALTER COLLATION %s REFRESH VERSION, " + "or build PostgreSQL with the right library version.", + quote_qualified_identifier(get_namespace_name(collform->collnamespace), + NameStr(collform->collname))))); + } - result.collate_is_c = true; - result.ctype_is_c = (strcmp(locstr, "C") == 0); + if (collform->collprovider == COLLPROVIDER_BUILTIN) + { + const char *locstr; - builtin_validate_locale(GetDatabaseEncoding(), locstr); + datum = 
SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + locstr = TextDatumGetCString(datum); - result.info.builtin.locale = MemoryContextStrdup(TopMemoryContext, - locstr); - } - else if (collform->collprovider == COLLPROVIDER_LIBC) - { - const char *collcollate; - const char *collctype; + builtin_validate_locale(GetDatabaseEncoding(), locstr); - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); - collcollate = TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); - collctype = TextDatumGetCString(datum); + result = MemoryContextAllocZero(TopMemoryContext, + sizeof(struct pg_locale_struct)); - result.collate_is_c = (strcmp(collcollate, "C") == 0) || - (strcmp(collcollate, "POSIX") == 0); - result.ctype_is_c = (strcmp(collctype, "C") == 0) || - (strcmp(collctype, "POSIX") == 0); + result->provider = collform->collprovider; + result->deterministic = collform->collisdeterministic; + result->collate_is_c = true; + result->ctype_is_c = (strcmp(locstr, "C") == 0); + result->info.builtin.locale = MemoryContextStrdup(TopMemoryContext, + locstr); + } + else if (collform->collprovider == COLLPROVIDER_LIBC) + { + const char *collcollate; + const char *collctype; + locale_t locale; + + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); + collcollate = TextDatumGetCString(datum); + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collctype); + collctype = TextDatumGetCString(datum); + + locale = make_libc_collator(collcollate, collctype); + + result = MemoryContextAllocZero(TopMemoryContext, + sizeof(struct pg_locale_struct)); + + result->provider = collform->collprovider; + result->deterministic = collform->collisdeterministic; + result->collate_is_c = (strcmp(collcollate, "C") == 0) || + (strcmp(collcollate, "POSIX") == 0); + result->ctype_is_c = (strcmp(collctype, "C") == 0) || + (strcmp(collctype, "POSIX") == 0); + result->info.lt = locale; + } + 
else if (collform->collprovider == COLLPROVIDER_ICU) + { + const char *iculocstr; + const char *icurules; + UCollator *collator; - result.info.lt = make_libc_collator(collcollate, collctype); - } - else if (collform->collprovider == COLLPROVIDER_ICU) - { - const char *iculocstr; - const char *icurules; + datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); + iculocstr = TextDatumGetCString(datum); - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); - iculocstr = TextDatumGetCString(datum); + datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull); + if (!isnull) + icurules = TextDatumGetCString(datum); + else + icurules = NULL; - result.collate_is_c = false; - result.ctype_is_c = false; + collator = make_icu_collator(iculocstr, icurules); - datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collicurules, &isnull); - if (!isnull) - icurules = TextDatumGetCString(datum); - else - icurules = NULL; + result = MemoryContextAllocZero(TopMemoryContext, + sizeof(struct pg_locale_struct)); - result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr); - result.info.icu.ucol = make_icu_collator(iculocstr, icurules); - } + result->provider = collform->collprovider; + result->deterministic = collform->collisdeterministic; + result->collate_is_c = false; + result->ctype_is_c = false; + result->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr); + result->info.icu.ucol = collator; + } - datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, - &isnull); - if (!isnull) - { - char *actual_versionstr; - char *collversionstr; + ReleaseSysCache(tp); - collversionstr = TextDatumGetCString(datum); + return result; +} - if (collform->collprovider == COLLPROVIDER_LIBC) - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_collcollate); - else - datum = SysCacheGetAttrNotNull(COLLOID, tp, Anum_pg_collation_colllocale); +/* + * Create or retrieve a pg_locale_t for 
the given collation OID. Results are + * cached for the lifetime of the backend. + */ +pg_locale_t +pg_newlocale_from_collation(Oid collid) +{ + collation_cache_entry *cache_entry; + bool found; - actual_versionstr = get_collation_actual_version(collform->collprovider, - TextDatumGetCString(datum)); - if (!actual_versionstr) - { - /* - * This could happen when specifying a version in CREATE - * COLLATION but the provider does not support versioning, or - * manually creating a mess in the catalogs. - */ - ereport(ERROR, - (errmsg("collation \"%s\" has no actual version, but a version was recorded", - NameStr(collform->collname)))); - } + if (collid == DEFAULT_COLLATION_OID) + return &default_locale; - if (strcmp(actual_versionstr, collversionstr) != 0) - ereport(WARNING, - (errmsg("collation \"%s\" has version mismatch", - NameStr(collform->collname)), - errdetail("The collation in the database was created using version %s, " - "but the operating system provides version %s.", - collversionstr, actual_versionstr), - errhint("Rebuild all objects affected by this collation and run " - "ALTER COLLATION %s REFRESH VERSION, " - "or build PostgreSQL with the right library version.", - quote_qualified_identifier(get_namespace_name(collform->collnamespace), - NameStr(collform->collname))))); - } + if (!OidIsValid(collid)) + elog(ERROR, "cache lookup failed for collation %u", collid); - ReleaseSysCache(tp); + if (last_collation_cache_oid == collid) + return last_collation_cache_locale; - /* We'll keep the pg_locale_t structures in TopMemoryContext */ - resultp = MemoryContextAlloc(TopMemoryContext, sizeof(*resultp)); - *resultp = result; + /* + * Cache mechanism for collation information. + * + * Note that we currently lack any way to flush the cache. Since we don't + * support ALTER COLLATION, this is OK. The worst case is that someone + * drops a collation, and a useless cache entry hangs around in existing + * backends. 
+ */ + if (CollationCache == NULL) + { + CollationCacheContext = AllocSetContextCreate(TopMemoryContext, + "collation cache", + ALLOCSET_DEFAULT_SIZES); + CollationCache = collation_cache_create(CollationCacheContext, + 16, NULL); + } - cache_entry->locale = resultp; + cache_entry = collation_cache_insert(CollationCache, collid, &found); + if (!found) + { + /* + * Make sure cache entry is marked invalid, in case we fail before + * setting things. + */ + cache_entry->locale = 0; } + if (cache_entry->locale == 0) + cache_entry->locale = create_pg_locale(collid); + last_collation_cache_oid = collid; last_collation_cache_locale = cache_entry->locale; -- 2.34.1 [text/x-patch] v4-0001-Tighten-up-make_libc_collator-and-make_icu_collat.patch (8.1K, 8-v4-0001-Tighten-up-make_libc_collator-and-make_icu_collat.patch) download | inline diff: From 224470bc4d0660dc11940f5595031eecb0319d62 Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Wed, 7 Aug 2024 11:05:46 -0700 Subject: [PATCH v4 1/7] Tighten up make_libc_collator() and make_icu_collator(). Return the result rather than using an out parameter, and make it the caller's responsibility to copy it into the right context. Ensure that no paths leak a collator. The function make_icu_collator() doesn't have any external callers, so change it to be static. Also, when re-opening with rules, use a try/finally block to avoid leaking the collator. In make_libc_collator(), if the first newlocale() succeeds and the second one fails, close the first locale_t object. 
Discussion: https://postgr.es/m/[email protected] --- src/backend/utils/adt/pg_locale.c | 126 +++++++++++++++++++----------- src/include/utils/pg_locale.h | 4 - 2 files changed, 80 insertions(+), 50 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 5bef1b113a8..12ba5726f77 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1297,14 +1297,15 @@ report_newlocale_failure(const char *localename) } /* - * Initialize the locale_t field. + * Create a locale_t with the given collation and ctype. * - * The "C" and "POSIX" locales are not actually handled by libc, so set the - * locale_t to zero in that case. + * The "C" and "POSIX" locales are not actually handled by libc, so return + * NULL. + * + * Ensure that no path leaks a locale_t. */ -static void -make_libc_collator(const char *collate, const char *ctype, - pg_locale_t result) +static locale_t +make_libc_collator(const char *collate, const char *ctype) { locale_t loc = 0; @@ -1343,7 +1344,11 @@ make_libc_collator(const char *collate, const char *ctype, errno = 0; loc = newlocale(LC_CTYPE_MASK, ctype, loc1); if (!loc) + { + if (loc1) + freelocale(loc1); report_newlocale_failure(ctype); + } } else loc = loc1; @@ -1360,60 +1365,78 @@ make_libc_collator(const char *collate, const char *ctype, #endif } - result->info.lt = loc; + return loc; } -void -make_icu_collator(const char *iculocstr, - const char *icurules, - struct pg_locale_struct *resultp) -{ +/* + * Create a UCollator with the given locale string and rules. + * + * Ensure that no path leaks a UCollator. + */ #ifdef USE_ICU - UCollator *collator; - - collator = pg_ucol_open(iculocstr); - - /* - * If rules are specified, we extract the rules of the standard collation, - * add our own rules, and make a new collator with the combined rules. 
- */ - if (icurules) +static UCollator * +make_icu_collator(const char *iculocstr, const char *icurules) +{ + if (!icurules) { - const UChar *default_rules; - UChar *agg_rules; + /* simple case without rules */ + return pg_ucol_open(iculocstr); + } + else + { + UCollator *collator_std_rules; + UCollator *collator_all_rules; + const UChar *std_rules; UChar *my_rules; - UErrorCode status; + UChar *all_rules; int32_t length; + int32_t total; + UErrorCode status; - default_rules = ucol_getRules(collator, &length); + /* + * If rules are specified, we extract the rules of the standard + * collation, add our own rules, and make a new collator with the + * combined rules. + */ icu_to_uchar(&my_rules, icurules, strlen(icurules)); - agg_rules = palloc_array(UChar, u_strlen(default_rules) + u_strlen(my_rules) + 1); - u_strcpy(agg_rules, default_rules); - u_strcat(agg_rules, my_rules); + collator_std_rules = pg_ucol_open(iculocstr); - ucol_close(collator); + std_rules = ucol_getRules(collator_std_rules, &length); + + total = u_strlen(std_rules) + u_strlen(my_rules) + 1; + + /* avoid leaking collator on OOM */ + all_rules = palloc_extended(sizeof(UChar) * total, MCXT_ALLOC_NO_OOM); + if (!all_rules) + { + ucol_close(collator_std_rules); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + u_strcpy(all_rules, std_rules); + u_strcat(all_rules, my_rules); + + ucol_close(collator_std_rules); status = U_ZERO_ERROR; - collator = ucol_openRules(agg_rules, u_strlen(agg_rules), - UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, NULL, &status); + collator_all_rules = ucol_openRules(all_rules, u_strlen(all_rules), + UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH, + NULL, &status); if (U_FAILURE(status)) + { ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("could not open collator for locale \"%s\" with rules \"%s\": %s", iculocstr, icurules, u_errorName(status)))); - } + } - /* We will leak this string if the caller errors later :-( */ - 
resultp->info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr); - resultp->info.icu.ucol = collator; -#else /* not USE_ICU */ - /* could get here if a collation was created by a build with ICU */ - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("ICU is not supported in this build"))); -#endif /* not USE_ICU */ + return collator_all_rules; + } } +#endif /* not USE_ICU */ /* * Initialize default_locale with database locale settings. @@ -1424,7 +1447,6 @@ init_database_collation(void) HeapTuple tup; Form_pg_database dbform; Datum datum; - bool isnull; /* Fetch our pg_database row normally, via syscache */ tup = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId)); @@ -1449,8 +1471,10 @@ init_database_collation(void) } else if (dbform->datlocprovider == COLLPROVIDER_ICU) { +#ifdef USE_ICU char *datlocale; char *icurules; + bool isnull; datum = SysCacheGetAttrNotNull(DATABASEOID, tup, Anum_pg_database_datlocale); datlocale = TextDatumGetCString(datum); @@ -1464,7 +1488,14 @@ init_database_collation(void) else icurules = NULL; - make_icu_collator(datlocale, icurules, &default_locale); + default_locale.info.icu.locale = MemoryContextStrdup(TopMemoryContext, datlocale); + default_locale.info.icu.ucol = make_icu_collator(datlocale, icurules); +#else /* not USE_ICU */ + /* could get here if a collation was created by a build with ICU */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ICU is not supported in this build"))); +#endif /* not USE_ICU */ } else { @@ -1483,7 +1514,7 @@ init_database_collation(void) default_locale.ctype_is_c = (strcmp(datctype, "C") == 0) || (strcmp(datctype, "POSIX") == 0); - make_libc_collator(datcollate, datctype, &default_locale); + default_locale.info.lt = make_libc_collator(datcollate, datctype); } default_locale.provider = dbform->datlocprovider; @@ -1572,7 +1603,7 @@ pg_newlocale_from_collation(Oid collid) result.ctype_is_c = (strcmp(collctype, "C") == 0) || (strcmp(collctype, 
"POSIX") == 0); - make_libc_collator(collcollate, collctype, &result); + result.info.lt = make_libc_collator(collcollate, collctype); } else if (collform->collprovider == COLLPROVIDER_ICU) { @@ -1591,7 +1622,8 @@ pg_newlocale_from_collation(Oid collid) else icurules = NULL; - make_icu_collator(iculocstr, icurules, &result); + result.info.icu.locale = MemoryContextStrdup(TopMemoryContext, iculocstr); + result.info.icu.ucol = make_icu_collator(iculocstr, icurules); } datum = SysCacheGetAttr(COLLOID, tp, Anum_pg_collation_collversion, @@ -2500,6 +2532,8 @@ builtin_validate_locale(int encoding, const char *locale) /* * Wrapper around ucol_open() to handle API differences for older ICU * versions. + * + * Ensure that no path leaks a UCollator. */ static UCollator * pg_ucol_open(const char *loc_str) diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index faae868bfcc..c2d95411e0a 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -104,10 +104,6 @@ struct pg_locale_struct typedef struct pg_locale_struct *pg_locale_t; -extern void make_icu_collator(const char *iculocstr, - const char *icurules, - struct pg_locale_struct *resultp); - extern void init_database_collation(void); extern pg_locale_t pg_newlocale_from_collation(Oid collid); -- 2.34.1 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. @ 2026-05-01 16:40 Andres Freund <[email protected]> parent: Jeff Davis <[email protected]> 1 sibling, 1 reply; 6+ messages in thread From: Andres Freund @ 2026-05-01 16:40 UTC (permalink / raw) To: Jeff Davis <[email protected]>; +Cc: pgsql-hackers Hi, On 2024-08-22 11:00:54 -0700, Jeff Davis wrote: > Like ICU, allow -1 length to mean that the input string is NUL- > terminated for pg_strncoll(), pg_strnxfrm(), and pg_strnxfrm_prefix(). > > This simplifies the API and code a bit. I don't really like this. I was hacking on a patch that uses compiler annotations to tell the compiler what range of memory a function accesses. The compiler then can use that knowledge to give you both compile-time warnings and, more importantly, it makes ubsan much more accurate. It'll e.g. often be able to warn you if a function accesses more memory than its annotation would suggest, even if the memory is part of a larger memory allocation (something asan, valgrind etc can't warn about, yet are often the most security critical issues). I found a bunch of issues that way already. But the annotations can't work if the access size is sometimes -1. I also don't find this very convincing code-wise. You end up with lots of branches for -1. You have to support cases where one of the arguments is specified as -1 and the other one with a real length, even though that's presumably a non-existing case. It seems reasonable to want the more efficient path for zero terminated strings with libc, but it seems like if we want that, we should add a collate_method->strcoll, rather than have a strncoll that's not actually strncoll but strcoll. Greetings, Andres Freund ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. @ 2026-05-05 20:23 Jeff Davis <[email protected]> parent: Andres Freund <[email protected]> 0 siblings, 2 replies; 6+ messages in thread From: Jeff Davis @ 2026-05-05 20:23 UTC (permalink / raw) To: Andres Freund <[email protected]>; +Cc: pgsql-hackers On Fri, 2026-05-01 at 12:40 -0400, Andres Freund wrote: > Hi, > > On 2024-08-22 11:00:54 -0700, Jeff Davis wrote: > > Like ICU, allow -1 length to mean that the input string is NUL- > > terminated for pg_strncoll(), pg_strnxfrm(), and > > pg_strnxfrm_prefix(). > > > > This simplifies the API and code a bit. > > I don't really like this. Agreed. I did this to match up with the ICU API a bit better, but if it's interfering with useful tools, then the special cases aren't worth it. Patch attached. It causes a bit of churn, so one disadvantage is that it will complicate future backports in this area. Regards, Jeff Davis Attachments: [text/x-patch] v1-0001-Don-t-accept-length-of-1-in-pg_locale.h-APIs.patch (48.8K, 2-v1-0001-Don-t-accept-length-of-1-in-pg_locale.h-APIs.patch) download | inline diff: From 748134c2093412042f6db425c9f011aebf0c82d7 Mon Sep 17 00:00:00 2001 From: Jeff Davis <[email protected]> Date: Tue, 5 May 2026 10:55:06 -0700 Subject: [PATCH v1] Don't accept length of -1 in pg_locale.h APIs. Reverts ac30021356. Per discussion, that commit interfered with useful tooling, and was not worth the special cases. 
Suggested-by: Andres Freund <[email protected]> Discussion: https://postgr.es/m/s32n3tm2mjh247f3xkkxkdk7cf77hglbr3ia3hrsdjylajou7y@nlldpag3tjd5 --- src/backend/utils/adt/pg_locale.c | 42 +++-- src/backend/utils/adt/pg_locale_builtin.c | 13 +- src/backend/utils/adt/pg_locale_icu.c | 187 ++++++++++++++++------ src/backend/utils/adt/pg_locale_libc.c | 159 +++++++++--------- src/common/unicode/case_test.c | 69 ++------ src/common/unicode_case.c | 26 ++- src/include/common/unicode_case.h | 8 +- src/include/utils/pg_locale.h | 45 +++--- 8 files changed, 296 insertions(+), 253 deletions(-) diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 6c5c1019e1e..3f1fb9fafd9 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -1262,11 +1262,10 @@ get_collation_actual_version(char collprovider, const char *collcollate) /* lowercasing/casefolding in C locale */ static size_t -strlower_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) +strlower_c(char *dst, size_t dstsize, const char *src, size_t srclen) { int i; - srclen = (srclen >= 0) ? srclen : strlen(src); for (i = 0; i < srclen && i < dstsize; i++) dst[i] = pg_ascii_tolower(src[i]); if (i < dstsize) @@ -1276,12 +1275,11 @@ strlower_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) /* titlecasing in C locale */ static size_t -strtitle_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) +strtitle_c(char *dst, size_t dstsize, const char *src, size_t srclen) { bool wasalnum = false; int i; - srclen = (srclen >= 0) ? srclen : strlen(src); for (i = 0; i < srclen && i < dstsize; i++) { char c = src[i]; @@ -1302,11 +1300,10 @@ strtitle_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) /* uppercasing in C locale */ static size_t -strupper_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) +strupper_c(char *dst, size_t dstsize, const char *src, size_t srclen) { int i; - srclen = (srclen >= 0) ? 
srclen : strlen(src); for (i = 0; i < srclen && i < dstsize; i++) dst[i] = pg_ascii_toupper(src[i]); if (i < dstsize) @@ -1315,7 +1312,7 @@ strupper_c(char *dst, size_t dstsize, const char *src, ssize_t srclen) } size_t -pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, +pg_strlower(char *dst, size_t dstsize, const char *src, size_t srclen, pg_locale_t locale) { if (locale->ctype == NULL) @@ -1325,7 +1322,7 @@ pg_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, } size_t -pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, +pg_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen, pg_locale_t locale) { if (locale->ctype == NULL) @@ -1335,7 +1332,7 @@ pg_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, } size_t -pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, +pg_strupper(char *dst, size_t dstsize, const char *src, size_t srclen, pg_locale_t locale) { if (locale->ctype == NULL) @@ -1345,7 +1342,7 @@ pg_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, } size_t -pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, +pg_strfold(char *dst, size_t dstsize, const char *src, size_t srclen, pg_locale_t locale) { /* in the C locale, casefolding is the same as lowercasing */ @@ -1363,7 +1360,7 @@ pg_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, * pg_strfold(..., default_locale)? 
*/ size_t -pg_downcase_ident(char *dst, size_t dstsize, const char *src, ssize_t srclen) +pg_downcase_ident(char *dst, size_t dstsize, const char *src, size_t srclen) { pg_locale_t locale = default_locale; @@ -1383,7 +1380,7 @@ pg_downcase_ident(char *dst, size_t dstsize, const char *src, ssize_t srclen) int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) { - return locale->collate->strncoll(arg1, -1, arg2, -1, locale); + return locale->collate->strcoll(arg1, arg2, locale); } /* @@ -1393,15 +1390,14 @@ pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale) * appropriate for the given locale, platform, and database encoding. If the * locale is not specified, use the database collation. * - * The input strings must be encoded in the database encoding. If an input - * string is NUL-terminated, its length may be specified as -1. + * The input strings must be encoded in the database encoding. * * The caller is responsible for breaking ties if the collation is * deterministic; this maintains consistency with pg_strnxfrm(), which cannot * easily account for deterministic collations. */ int -pg_strncoll(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, +pg_strncoll(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale) { return locale->collate->strncoll(arg1, len1, arg2, len2, locale); @@ -1433,7 +1429,7 @@ pg_strxfrm_enabled(pg_locale_t locale) size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - return locale->collate->strnxfrm(dest, destsize, src, -1, locale); + return locale->collate->strxfrm(dest, destsize, src, locale); } /* @@ -1443,9 +1439,8 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) * ordinary strcmp() on transformed strings is equivalent to pg_strcoll() on * untransformed strings. * - * The input string must be encoded in the database encoding. If the input - * string is NUL-terminated, its length may be specified as -1. 
If 'destsize' - * is zero, 'dest' may be NULL. + * The input string must be encoded in the database encoding. If 'destsize' is + * zero, 'dest' may be NULL. * * Not all providers support pg_strnxfrm() safely. The caller should check * pg_strxfrm_enabled() first, otherwise this function may return wrong @@ -1456,7 +1451,7 @@ pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale) * 'destsize' or greater, the resulting contents of 'dest' are undefined. */ size_t -pg_strnxfrm(char *dest, size_t destsize, const char *src, ssize_t srclen, +pg_strnxfrm(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return locale->collate->strnxfrm(dest, destsize, src, srclen, locale); @@ -1481,7 +1476,7 @@ size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale) { - return locale->collate->strnxfrm_prefix(dest, destsize, src, -1, locale); + return locale->collate->strxfrm_prefix(dest, destsize, src, locale); } /* @@ -1491,8 +1486,7 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, * memcmp() on the byte sequence is equivalent to pg_strncoll() on * untransformed strings. The result is not nul-terminated. * - * The input string must be encoded in the database encoding. If the input - * string is NUL-terminated, its length may be specified as -1. + * The input string must be encoded in the database encoding. * * Not all providers support pg_strnxfrm_prefix() safely. 
The caller should * check pg_strxfrm_prefix_enabled() first, otherwise this function may return @@ -1504,7 +1498,7 @@ pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, */ size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale) + size_t srclen, pg_locale_t locale) { return locale->collate->strnxfrm_prefix(dest, destsize, src, srclen, locale); } diff --git a/src/backend/utils/adt/pg_locale_builtin.c b/src/backend/utils/adt/pg_locale_builtin.c index 794aa37df76..01d4f55b07e 100644 --- a/src/backend/utils/adt/pg_locale_builtin.c +++ b/src/backend/utils/adt/pg_locale_builtin.c @@ -60,8 +60,7 @@ initcap_wbnext(void *state) { struct WordBoundaryState *wbstate = (struct WordBoundaryState *) state; - while (wbstate->offset < wbstate->len && - wbstate->str[wbstate->offset] != '\0') + while (wbstate->offset < wbstate->len) { char32_t u = utf8_to_unicode((const unsigned char *) wbstate->str + wbstate->offset); @@ -84,7 +83,7 @@ initcap_wbnext(void *state) } static size_t -strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, +strlower_builtin(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return unicode_strlower(dest, destsize, src, srclen, @@ -92,12 +91,12 @@ strlower_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, +strtitle_builtin(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { struct WordBoundaryState wbstate = { .str = src, - .len = (srclen < 0) ? 
strlen(src) : srclen, + .len = srclen, .offset = 0, .posix = !locale->builtin.casemap_full, .init = false, @@ -110,7 +109,7 @@ strtitle_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, +strupper_builtin(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return unicode_strupper(dest, destsize, src, srclen, @@ -118,7 +117,7 @@ strupper_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strfold_builtin(char *dest, size_t destsize, const char *src, ssize_t srclen, +strfold_builtin(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return unicode_strfold(dest, destsize, src, srclen, diff --git a/src/backend/utils/adt/pg_locale_icu.c b/src/backend/utils/adt/pg_locale_icu.c index a4a4e82eb9e..99b1f266c5a 100644 --- a/src/backend/utils/adt/pg_locale_icu.c +++ b/src/backend/utils/adt/pg_locale_icu.c @@ -57,29 +57,33 @@ extern UCollator *pg_ucol_open(const char *loc_str); static UCaseMap *pg_ucasemap_open(const char *loc_str); static size_t strlower_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t strtitle_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t strupper_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t strfold_icu(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t strlower_icu_utf8(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t strtitle_icu_utf8(char *dest, size_t destsize, const char *src, - ssize_t srclen, 
pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t strupper_icu_utf8(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t strfold_icu_utf8(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale); -static int strncoll_icu(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, + size_t srclen, pg_locale_t locale); +static int strncoll_icu(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); +static int strcoll_icu(const char *arg1, const char *arg2, + pg_locale_t locale); static size_t strnxfrm_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); +static size_t strxfrm_icu(char *dest, size_t destsize, const char *src, + pg_locale_t locale); extern char *get_collation_actual_version_icu(const char *collcollate); typedef int32_t (*ICU_Convert_Func) (UChar *dest, int32_t destCapacity, @@ -96,20 +100,24 @@ static UConverter *icu_converter = NULL; static UCollator *make_icu_collator(const char *iculocstr, const char *icurules); -static int strncoll_icu(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, - pg_locale_t locale); static size_t strnxfrm_prefix_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); +static size_t strxfrm_prefix_icu(char *dest, size_t destsize, const char *src, + pg_locale_t locale); #ifdef HAVE_UCOL_STRCOLLUTF8 -static int strncoll_icu_utf8(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, +static int strncoll_icu_utf8(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); +static int strcoll_icu_utf8(const char *arg1, + 
const char *arg2, + pg_locale_t locale); #endif static size_t strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); +static size_t strxfrm_prefix_icu_utf8(char *dest, size_t destsize, const char *src, + pg_locale_t locale); static void init_icu_converter(void); static size_t uchar_length(UConverter *converter, const char *str, int32_t len); @@ -124,7 +132,7 @@ static void icu_set_collation_attributes(UCollator *collator, const char *loc, UErrorCode *status); static int32_t icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); static int32_t u_strToTitle_default_BI(UChar *dest, int32_t destCapacity, const UChar *src, int32_t srcLength, const char *locale, @@ -154,19 +162,26 @@ tolower_icu(pg_wchar wc, pg_locale_t locale) static const struct collate_methods collate_methods_icu = { .strncoll = strncoll_icu, + .strcoll = strcoll_icu, .strnxfrm = strnxfrm_icu, + .strxfrm = strxfrm_icu, .strnxfrm_prefix = strnxfrm_prefix_icu, + .strxfrm_prefix = strxfrm_prefix_icu, .strxfrm_is_safe = true, }; static const struct collate_methods collate_methods_icu_utf8 = { #ifdef HAVE_UCOL_STRCOLLUTF8 .strncoll = strncoll_icu_utf8, + .strcoll = strcoll_icu_utf8, #else .strncoll = strncoll_icu, + .strcoll = strcoll_icu, #endif .strnxfrm = strnxfrm_icu, + .strxfrm = strxfrm_icu, .strnxfrm_prefix = strnxfrm_prefix_icu_utf8, + .strxfrm_prefix = strxfrm_prefix_icu_utf8, .strxfrm_is_safe = true, }; @@ -604,35 +619,35 @@ make_icu_collator(const char *iculocstr, const char *icurules) } static size_t -strlower_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, +strlower_icu(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return icu_convert_case(u_strToLower, dest, destsize, src, srclen, locale); } static size_t -strtitle_icu(char *dest, size_t 
destsize, const char *src, ssize_t srclen, +strtitle_icu(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return icu_convert_case(u_strToTitle_default_BI, dest, destsize, src, srclen, locale); } static size_t -strupper_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, +strupper_icu(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return icu_convert_case(u_strToUpper, dest, destsize, src, srclen, locale); } static size_t -strfold_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, +strfold_icu(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { return icu_convert_case(u_strFoldCase_default, dest, destsize, src, srclen, locale); } static size_t -strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, +strlower_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { UErrorCode status = U_ZERO_ERROR; @@ -646,7 +661,7 @@ strlower_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, +strtitle_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { UErrorCode status = U_ZERO_ERROR; @@ -660,7 +675,7 @@ strtitle_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, +strupper_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { UErrorCode status = U_ZERO_ERROR; @@ -674,7 +689,7 @@ strupper_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, +strfold_icu_utf8(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { UErrorCode status = U_ZERO_ERROR; @@ 
-695,7 +710,7 @@ strfold_icu_utf8(char *dest, size_t destsize, const char *src, ssize_t srclen, */ static size_t downcase_ident_icu(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) + size_t srclen, pg_locale_t locale) { int i; bool libc_lower; @@ -724,12 +739,11 @@ downcase_ident_icu(char *dst, size_t dstsize, const char *src, * strncoll_icu_utf8 * * Call ucol_strcollUTF8() or ucol_strcoll() as appropriate for the given - * database encoding. An argument length of -1 means the string is - * NUL-terminated. + * database encoding. */ #ifdef HAVE_UCOL_STRCOLLUTF8 int -strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, +strncoll_icu_utf8(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale) { int result; @@ -748,12 +762,31 @@ strncoll_icu_utf8(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2 return result; } + +int +strcoll_icu_utf8(const char *arg1, const char *arg2, pg_locale_t locale) +{ + int result; + UErrorCode status; + + Assert(GetDatabaseEncoding() == PG_UTF8); + + status = U_ZERO_ERROR; + result = ucol_strcollUTF8(locale->icu.ucol, + arg1, -1, + arg2, -1, + &status); + if (U_FAILURE(status)) + ereport(ERROR, + (errmsg("collation failed: %s", u_errorName(status)))); + + return result; +} #endif -/* 'srclen' of -1 means the strings are NUL-terminated */ -size_t -strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, - pg_locale_t locale) +static size_t +strnxfrm_icu_internal(char *dest, size_t destsize, const char *src, ssize_t srclen, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -795,11 +828,24 @@ strnxfrm_icu(char *dest, size_t destsize, const char *src, ssize_t srclen, return result_bsize; } -/* 'srclen' of -1 means the strings are NUL-terminated */ -size_t -strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) +static size_t +strnxfrm_icu(char *dest, 
size_t destsize, const char *src, size_t srclen, + pg_locale_t locale) +{ + return strnxfrm_icu_internal(dest, destsize, src, srclen, locale); +} + +static size_t +strxfrm_icu(char *dest, size_t destsize, const char *src, + pg_locale_t locale) +{ + return strnxfrm_icu_internal(dest, destsize, src, -1, locale); +} + +static size_t +strnxfrm_prefix_icu_utf8_internal(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) { size_t result; UCharIterator iter; @@ -825,6 +871,21 @@ strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, return result; } +static size_t +strnxfrm_prefix_icu_utf8(char *dest, size_t destsize, + const char *src, size_t srclen, + pg_locale_t locale) +{ + return strnxfrm_prefix_icu_utf8_internal(dest, destsize, src, srclen, locale); +} + +static size_t +strxfrm_prefix_icu_utf8(char *dest, size_t destsize, const char *src, + pg_locale_t locale) +{ + return strnxfrm_prefix_icu_utf8_internal(dest, destsize, src, -1, locale); +} + char * get_collation_actual_version_icu(const char *collcollate) { @@ -940,7 +1001,7 @@ convert_case_uchar(ICU_Convert_Func func, pg_locale_t mylocale, static int32_t icu_convert_case(ICU_Convert_Func func, char *dest, size_t destsize, - const char *src, ssize_t srclen, pg_locale_t locale) + const char *src, size_t srclen, pg_locale_t locale) { int32_t len_uchar; int32_t len_conv; @@ -1010,15 +1071,15 @@ foldcase_options(const char *locale) * strncoll_icu * * Convert the arguments from the database encoding to UChar strings, then - * call ucol_strcoll(). An argument length of -1 means that the string is - * NUL-terminated. + * call ucol_strcoll(). * * When the database encoding is UTF-8, and ICU supports ucol_strcollUTF8(), * caller should call that instead. 
*/ static int -strncoll_icu(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, pg_locale_t locale) +strncoll_icu_internal(const char *arg1, ssize_t len1, + const char *arg2, ssize_t len2, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -1062,11 +1123,23 @@ strncoll_icu(const char *arg1, ssize_t len1, return result; } -/* 'srclen' of -1 means the strings are NUL-terminated */ +static int +strncoll_icu(const char *arg1, size_t len1, const char *arg2, size_t len2, + pg_locale_t locale) +{ + return strncoll_icu_internal(arg1, len1, arg2, len2, locale); +} + +static int +strcoll_icu(const char *arg1, const char *arg2, pg_locale_t locale) +{ + return strncoll_icu_internal(arg1, -1, arg2, -1, locale); +} + static size_t -strnxfrm_prefix_icu(char *dest, size_t destsize, - const char *src, ssize_t srclen, - pg_locale_t locale) +strnxfrm_prefix_icu_internal(char *dest, size_t destsize, + const char *src, ssize_t srclen, + pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -1114,6 +1187,20 @@ strnxfrm_prefix_icu(char *dest, size_t destsize, return result_bsize; } +static size_t +strnxfrm_prefix_icu(char *dest, size_t destsize, const char *src, size_t srclen, + pg_locale_t locale) +{ + return strnxfrm_prefix_icu_internal(dest, destsize, src, srclen, locale); +} + +static size_t +strxfrm_prefix_icu(char *dest, size_t destsize, const char *src, + pg_locale_t locale) +{ + return strnxfrm_prefix_icu_internal(dest, destsize, src, -1, locale); +} + static void init_icu_converter(void) { diff --git a/src/backend/utils/adt/pg_locale_libc.c b/src/backend/utils/adt/pg_locale_libc.c index 78f6ea161a0..0b52d6f8fe3 100644 --- a/src/backend/utils/adt/pg_locale_libc.c +++ b/src/backend/utils/adt/pg_locale_libc.c @@ -82,42 +82,48 @@ extern pg_locale_t create_pg_locale_libc(Oid collid, MemoryContext context); -static int strncoll_libc(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, +static int strncoll_libc(const char *arg1, 
size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); +static int strcoll_libc(const char *arg1, const char *arg2, + pg_locale_t locale); static size_t strnxfrm_libc(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); +static size_t strxfrm_libc(char *dest, size_t destsize, + const char *src, pg_locale_t locale); extern char *get_collation_actual_version_libc(const char *collcollate); static locale_t make_libc_collator(const char *collate, const char *ctype); #ifdef WIN32 -static int strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, +static int strncoll_libc_win32_utf8(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); +static int strcoll_libc_win32_utf8(const char *arg1, const char *arg2, + pg_locale_t locale); #endif static size_t char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen, locale_t loc); static size_t strlower_libc_sb(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); static size_t strlower_libc_mb(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); static size_t strtitle_libc_sb(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); static size_t strtitle_libc_mb(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); static size_t strupper_libc_sb(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); static size_t strupper_libc_mb(char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); static bool @@ -324,7 +330,7 @@ tolower_libc_mb(pg_wchar wc, pg_locale_t locale) */ static size_t 
downcase_ident_libc_sb(char *dst, size_t dstsize, const char *src, - ssize_t srclen, pg_locale_t locale) + size_t srclen, pg_locale_t locale) { locale_t loc = locale->lt; int i; @@ -420,8 +426,11 @@ static const struct ctype_methods ctype_methods_libc_utf8 = { static const struct collate_methods collate_methods_libc = { .strncoll = strncoll_libc, + .strcoll = strcoll_libc, .strnxfrm = strnxfrm_libc, + .strxfrm = strxfrm_libc, .strnxfrm_prefix = NULL, + .strxfrm_prefix = NULL, /* * Unfortunately, it seems that strxfrm() for non-C collations is broken @@ -442,7 +451,9 @@ static const struct collate_methods collate_methods_libc = { #ifdef WIN32 static const struct collate_methods collate_methods_libc_win32_utf8 = { .strncoll = strncoll_libc_win32_utf8, + .strcoll = strcoll_libc_win32_utf8, .strnxfrm = strnxfrm_libc, + .strxfrm = strxfrm_libc, .strnxfrm_prefix = NULL, #ifdef TRUST_STRXFRM .strxfrm_is_safe = true, @@ -453,12 +464,9 @@ static const struct collate_methods collate_methods_libc_win32_utf8 = { #endif static size_t -strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, +strlower_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { - if (srclen < 0) - srclen = strlen(src); - if (srclen + 1 <= destsize) { locale_t loc = locale->lt; @@ -492,7 +500,7 @@ strlower_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, +strlower_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { locale_t loc = locale->lt; @@ -502,9 +510,6 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t curr_char; size_t max_size; - if (srclen < 0) - srclen = strlen(src); - /* Overflow paranoia */ if ((srclen + 1) > (INT_MAX / sizeof(wchar_t))) ereport(ERROR, @@ -540,12 +545,9 @@ strlower_libc_mb(char *dest, size_t destsize, const char *src, ssize_t 
srclen, } static size_t -strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, +strtitle_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { - if (srclen < 0) - srclen = strlen(src); - if (srclen + 1 <= destsize) { locale_t loc = locale->lt; @@ -596,7 +598,7 @@ strtitle_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, +strtitle_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { locale_t loc = locale->lt; @@ -607,9 +609,6 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t curr_char; size_t max_size; - if (srclen < 0) - srclen = strlen(src); - /* Overflow paranoia */ if ((srclen + 1) > (INT_MAX / sizeof(wchar_t))) ereport(ERROR, @@ -651,12 +650,9 @@ strtitle_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, +strupper_libc_sb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { - if (srclen < 0) - srclen = strlen(src); - if (srclen + 1 <= destsize) { locale_t loc = locale->lt; @@ -690,7 +686,7 @@ strupper_libc_sb(char *dest, size_t destsize, const char *src, ssize_t srclen, } static size_t -strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, +strupper_libc_mb(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { locale_t loc = locale->lt; @@ -700,9 +696,6 @@ strupper_libc_mb(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t curr_char; size_t max_size; - if (srclen < 0) - srclen = strlen(src); - /* Overflow paranoia */ if ((srclen + 1) > (INT_MAX / sizeof(wchar_t))) ereport(ERROR, @@ -889,17 +882,17 @@ make_libc_collator(const char *collate, const char *ctype) * strncoll_libc * * NUL-terminate 
arguments, if necessary, and pass to strcoll_l(). - * - * An input string length of -1 means that it's already NUL-terminated. */ -int -strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, +static int +strncoll_libc(const char *arg1, size_t len1, const char *arg2, size_t len2, pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; - size_t bufsize1 = (len1 == -1) ? 0 : len1 + 1; - size_t bufsize2 = (len2 == -1) ? 0 : len2 + 1; + size_t bufsize1 = len1 + 1; + size_t bufsize2 = len2 + 1; + char *buf1; + char *buf2; const char *arg1n; const char *arg2n; int result; @@ -907,32 +900,16 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, if (bufsize1 + bufsize2 > TEXTBUFLEN) buf = palloc(bufsize1 + bufsize2); - /* nul-terminate arguments if necessary */ - if (len1 == -1) - { - arg1n = arg1; - } - else - { - char *buf1 = buf; - - memcpy(buf1, arg1, len1); - buf1[len1] = '\0'; - arg1n = buf1; - } + buf1 = buf; + buf2 = buf + bufsize1; - if (len2 == -1) - { - arg2n = arg2; - } - else - { - char *buf2 = buf + bufsize1; + memcpy(buf1, arg1, len1); + buf1[len1] = '\0'; + arg1n = buf1; - memcpy(buf2, arg2, len2); - buf2[len2] = '\0'; - arg2n = buf2; - } + memcpy(buf2, arg2, len2); + buf2[len2] = '\0'; + arg2n = buf2; result = strcoll_l(arg1n, arg2n, locale->lt); @@ -942,15 +919,22 @@ strncoll_libc(const char *arg1, ssize_t len1, const char *arg2, ssize_t len2, return result; } +/* + * strcoll_libc + */ +static int +strcoll_libc(const char *arg1, const char *arg2, pg_locale_t locale) +{ + return strcoll_l(arg1, arg2, locale->lt); +} + /* * strnxfrm_libc * - * NUL-terminate src, if necessary, and pass to strxfrm_l(). - * - * A source length of -1 means that it's already NUL-terminated. + * NUL-terminate src and pass to strxfrm_l(). 
*/ -size_t -strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, +static size_t +strnxfrm_libc(char *dest, size_t destsize, const char *src, size_t srclen, pg_locale_t locale) { char sbuf[TEXTBUFLEN]; @@ -958,9 +942,6 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, size_t bufsize = srclen + 1; size_t result; - if (srclen == -1) - return strxfrm_l(dest, src, destsize, locale->lt); - if (bufsize > TEXTBUFLEN) buf = palloc(bufsize); @@ -979,6 +960,15 @@ strnxfrm_libc(char *dest, size_t destsize, const char *src, ssize_t srclen, return result; } +/* + * strxfrm_libc + */ +static size_t +strxfrm_libc(char *dest, size_t destsize, const char *src, pg_locale_t locale) +{ + return strxfrm_l(dest, src, destsize, locale->lt); +} + char * get_collation_actual_version_libc(const char *collcollate) { @@ -1049,13 +1039,11 @@ get_collation_actual_version_libc(const char *collcollate) * * Win32 does not have UTF-8. Convert UTF8 arguments to wide characters and * invoke wcscoll_l(). - * - * An input string length of -1 means that it's NUL-terminated. 
*/ #ifdef WIN32 static int -strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, - ssize_t len2, pg_locale_t locale) +strncoll_libc_win32_utf8(const char *arg1, size_t len1, const char *arg2, + size_t len2, pg_locale_t locale) { char sbuf[TEXTBUFLEN]; char *buf = sbuf; @@ -1068,11 +1056,6 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, Assert(GetDatabaseEncoding() == PG_UTF8); - if (len1 == -1) - len1 = strlen(arg1); - if (len2 == -1) - len2 = strlen(arg2); - a1len = len1 * 2 + 2; a2len = len2 * 2 + 2; @@ -1120,6 +1103,16 @@ strncoll_libc_win32_utf8(const char *arg1, ssize_t len1, const char *arg2, return result; } + +static int +strcoll_libc_win32_utf8(const char *arg1, const char *arg2, + pg_locale_t locale) +{ + size_t len1 = strlen(arg1); + size_t len2 = strlen(arg2); + + return strncoll_libc_win32_utf8(arg1, len1, arg2, len2, locale); +} #endif /* WIN32 */ /* simple subroutine for reporting errors from newlocale() */ diff --git a/src/common/unicode/case_test.c b/src/common/unicode/case_test.c index 099530c1ead..a0dbf00b671 100644 --- a/src/common/unicode/case_test.c +++ b/src/common/unicode/case_test.c @@ -34,7 +34,7 @@ static UCaseMap *casemap = NULL; #endif typedef size_t (*TestFunc) (char *dst, size_t dstsize, const char *src, - ssize_t srclen); + size_t srclen); /* simple boundary iterator copied from pg_locale_builtin.c */ struct WordBoundaryState @@ -114,6 +114,7 @@ icu_test_full(char *str) char icu_upper[BUFSZ]; char icu_fold[BUFSZ]; UErrorCode status; + size_t len = strlen(str); /* full case mapping doesn't use posix semantics */ struct WordBoundaryState wbstate = { @@ -125,18 +126,18 @@ icu_test_full(char *str) .prev_alnum = false, }; - unicode_strlower(lower, BUFSZ, str, -1, true); - unicode_strtitle(title, BUFSZ, str, -1, true, initcap_wbnext, &wbstate); - unicode_strupper(upper, BUFSZ, str, -1, true); - unicode_strfold(fold, BUFSZ, str, -1, true); + unicode_strlower(lower, BUFSZ, str, len, true); 
+ unicode_strtitle(title, BUFSZ, str, len, true, initcap_wbnext, &wbstate); + unicode_strupper(upper, BUFSZ, str, len, true); + unicode_strfold(fold, BUFSZ, str, len, true); status = U_ZERO_ERROR; - ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, -1, &status); + ucasemap_utf8ToLower(casemap, icu_lower, BUFSZ, str, len, &status); status = U_ZERO_ERROR; - ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, -1, &status); + ucasemap_utf8ToTitle(casemap, icu_title, BUFSZ, str, len, &status); status = U_ZERO_ERROR; - ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, -1, &status); + ucasemap_utf8ToUpper(casemap, icu_upper, BUFSZ, str, len, &status); status = U_ZERO_ERROR; - ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, -1, &status); + ucasemap_utf8FoldCase(casemap, icu_fold, BUFSZ, str, len, &status); if (strcmp(lower, icu_lower) != 0) { @@ -209,18 +210,16 @@ static void test_convert(TestFunc tfunc, const char *test_string, const char *expected) { size_t src1len = strlen(test_string); - size_t src2len = -1; /* NUL-terminated */ size_t dst1len = strlen(expected); size_t dst2len = strlen(expected) + 1; /* NUL-terminated */ char *src1 = malloc(src1len); char *dst1 = malloc(dst1len); - char *src2 = strdup(test_string); char *dst2 = malloc(dst2len); size_t needed; memcpy(src1, test_string, src1len); /* not NUL-terminated */ - /* neither source nor destination are NUL-terminated */ + /* destination is not NUL-terminated */ memset(dst1, 0x7F, dst1len); needed = tfunc(dst1, dst1len, src1, src1len); if (needed != strlen(expected)) @@ -236,7 +235,7 @@ test_convert(TestFunc tfunc, const char *test_string, const char *expected) exit(1); } - /* destination is NUL-terminated and source is not */ + /* destination is NUL-terminated */ memset(dst2, 0x7F, dst2len); needed = tfunc(dst2, dst2len, src1, src1len); if (needed != strlen(expected)) @@ -252,59 +251,25 @@ test_convert(TestFunc tfunc, const char *test_string, const char *expected) exit(1); } - /* source is 
NUL-terminated and destination is not */ - memset(dst1, 0x7F, dst1len); - needed = tfunc(dst1, dst1len, src2, src2len); - if (needed != strlen(expected)) - { - printf("case_test: convert_case test3 FAILURE: '%s' needed %zu expected %zu\n", - test_string, needed, strlen(expected)); - printf("case_test: convert_case test3 FAILURE: needed %zu\n", needed); - exit(1); - } - if (memcmp(dst1, expected, dst1len) != 0) - { - printf("case_test: convert_case test3 FAILURE: test: '%s' result: '%.*s' expected: '%s'\n", - test_string, (int) dst1len, dst1, expected); - exit(1); - } - - /* both source and destination are NUL-terminated */ - memset(dst2, 0x7F, dst2len); - needed = tfunc(dst2, dst2len, src2, src2len); - if (needed != strlen(expected)) - { - printf("case_test: convert_case test4 FAILURE: '%s' needed %zu expected %zu\n", - test_string, needed, strlen(expected)); - exit(1); - } - if (strcmp(dst2, expected) != 0) - { - printf("case_test: convert_case test4 FAILURE: test: '%s' result: '%s' expected: '%s'\n", - test_string, dst2, expected); - exit(1); - } - free(src1); free(dst1); - free(src2); free(dst2); } static size_t tfunc_lower(char *dst, size_t dstsize, const char *src, - ssize_t srclen) + size_t srclen) { return unicode_strlower(dst, dstsize, src, srclen, true); } static size_t tfunc_title(char *dst, size_t dstsize, const char *src, - ssize_t srclen) + size_t srclen) { struct WordBoundaryState wbstate = { .str = src, - .len = (srclen < 0) ? 
strlen(src) : srclen, + .len = srclen, .offset = 0, .init = false, .prev_alnum = false, @@ -316,14 +281,14 @@ tfunc_title(char *dst, size_t dstsize, const char *src, static size_t tfunc_upper(char *dst, size_t dstsize, const char *src, - ssize_t srclen) + size_t srclen) { return unicode_strupper(dst, dstsize, src, srclen, true); } static size_t tfunc_fold(char *dst, size_t dstsize, const char *src, - ssize_t srclen) + size_t srclen) { return unicode_strfold(dst, dstsize, src, srclen, true); } diff --git a/src/common/unicode_case.c b/src/common/unicode_case.c index 0b8d3ffc0b4..d6ee00b7d9c 100644 --- a/src/common/unicode_case.c +++ b/src/common/unicode_case.c @@ -39,7 +39,7 @@ static const char32_t *const casekind_map[NCaseKind] = }; static char32_t find_case_map(char32_t ucs, const char32_t *map); -static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, +static size_t convert_case(char *dst, size_t dstsize, const char *src, size_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate); static enum CaseMapResult casemap(char32_t u1, CaseKind casekind, bool full, @@ -84,8 +84,7 @@ unicode_casefold_simple(char32_t code) * Convert src to lowercase, and return the result length (not including * terminating NUL). * - * String src must be encoded in UTF-8. If srclen < 0, src must be - * NUL-terminated. + * String src must be encoded in UTF-8. * * Result string is stored in dst, truncating if larger than dstsize. If * dstsize is greater than the result length, dst will be NUL-terminated; @@ -98,7 +97,7 @@ unicode_casefold_simple(char32_t code) * conditions are satisfied. 
*/ size_t -unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, +unicode_strlower(char *dst, size_t dstsize, const char *src, size_t srclen, bool full) { return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL, @@ -111,8 +110,7 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, * Convert src to titlecase, and return the result length (not including * terminating NUL). * - * String src must be encoded in UTF-8. If srclen < 0, src must be - * NUL-terminated. + * String src must be encoded in UTF-8. * * Result string is stored in dst, truncating if larger than dstsize. If * dstsize is greater than the result length, dst will be NUL-terminated; @@ -135,7 +133,7 @@ unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen, * the string to indicate the final boundary. */ size_t -unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, +unicode_strtitle(char *dst, size_t dstsize, const char *src, size_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate) { return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext, @@ -148,8 +146,7 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, * Convert src to uppercase, and return the result length (not including * terminating NUL). * - * String src must be encoded in UTF-8. If srclen < 0, src must be - * NUL-terminated. + * String src must be encoded in UTF-8. * * Result string is stored in dst, truncating if larger than dstsize. If * dstsize is greater than the result length, dst will be NUL-terminated; @@ -162,7 +159,7 @@ unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen, * conditions are satisfied. 
*/ size_t -unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, +unicode_strupper(char *dst, size_t dstsize, const char *src, size_t srclen, bool full) { return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL, @@ -175,8 +172,7 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, * Case fold src, and return the result length (not including terminating * NUL). * - * String src must be encoded in UTF-8. If srclen < 0, src must be - * NUL-terminated. + * String src must be encoded in UTF-8. * * Result string is stored in dst, truncating if larger than dstsize. If * dstsize is greater than the result length, dst will be NUL-terminated; @@ -186,7 +182,7 @@ unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen, * required buffer size before allocating. */ size_t -unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, +unicode_strfold(char *dst, size_t dstsize, const char *src, size_t srclen, bool full) { return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL, @@ -210,7 +206,7 @@ unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen, * map a single codepoint to multiple codepoints, or depend on conditions. 
*/ static size_t -convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, +convert_case(char *dst, size_t dstsize, const char *src, size_t srclen, CaseKind str_casekind, bool full, WordBoundaryNext wbnext, void *wbstate) { @@ -229,7 +225,7 @@ convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen, Assert(boundary == 0); /* start of text is always a boundary */ } - while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0') + while (srcoff < srclen) { char32_t u1 = utf8_to_unicode((const unsigned char *) src + srcoff); int u1len = unicode_utf8len(u1); diff --git a/src/include/common/unicode_case.h b/src/include/common/unicode_case.h index 2737c1382d4..03add78cabe 100644 --- a/src/include/common/unicode_case.h +++ b/src/include/common/unicode_case.h @@ -21,13 +21,13 @@ char32_t unicode_titlecase_simple(char32_t code); char32_t unicode_uppercase_simple(char32_t code); char32_t unicode_casefold_simple(char32_t code); size_t unicode_strlower(char *dst, size_t dstsize, const char *src, - ssize_t srclen, bool full); + size_t srclen, bool full); size_t unicode_strtitle(char *dst, size_t dstsize, const char *src, - ssize_t srclen, bool full, + size_t srclen, bool full, WordBoundaryNext wbnext, void *wbstate); size_t unicode_strupper(char *dst, size_t dstsize, const char *src, - ssize_t srclen, bool full); + size_t srclen, bool full); size_t unicode_strfold(char *dst, size_t dstsize, const char *src, - ssize_t srclen, bool full); + size_t srclen, bool full); #endif /* UNICODE_CASE_H */ diff --git a/src/include/utils/pg_locale.h b/src/include/utils/pg_locale.h index 444350bb803..b74821fdfa9 100644 --- a/src/include/utils/pg_locale.h +++ b/src/include/utils/pg_locale.h @@ -63,20 +63,29 @@ typedef struct pg_locale_struct *pg_locale_t; struct collate_methods { /* required */ - int (*strncoll) (const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, + int (*strncoll) (const char *arg1, size_t len1, + const char *arg2, size_t len2, 
pg_locale_t locale); + int (*strcoll) (const char *arg1, const char *arg2, + pg_locale_t locale); + /* required */ size_t (*strnxfrm) (char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); + size_t (*strxfrm) (char *dest, size_t destsize, + const char *src, pg_locale_t locale); + /* optional */ size_t (*strnxfrm_prefix) (char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); + size_t (*strxfrm_prefix) (char *dest, size_t destsize, + const char *src, pg_locale_t locale); + /* * If the strnxfrm method is not trusted to return the correct results, * set strxfrm_is_safe to false. It set to false, the method will not be @@ -90,19 +99,19 @@ struct ctype_methods { /* case mapping: LOWER()/INITCAP()/UPPER() */ size_t (*strlower) (char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); size_t (*strtitle) (char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); size_t (*strupper) (char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); size_t (*strfold) (char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); size_t (*downcase_ident) (char *dest, size_t destsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); /* required */ @@ -172,32 +181,32 @@ extern pg_locale_t pg_newlocale_from_collation(Oid collid); extern char *get_collation_actual_version(char collprovider, const char *collcollate); extern size_t pg_strlower(char *dst, size_t dstsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); extern size_t pg_strtitle(char *dst, size_t dstsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, 
pg_locale_t locale); extern size_t pg_strupper(char *dst, size_t dstsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); extern size_t pg_strfold(char *dst, size_t dstsize, - const char *src, ssize_t srclen, + const char *src, size_t srclen, pg_locale_t locale); extern size_t pg_downcase_ident(char *dst, size_t dstsize, - const char *src, ssize_t srclen); + const char *src, size_t srclen); extern int pg_strcoll(const char *arg1, const char *arg2, pg_locale_t locale); -extern int pg_strncoll(const char *arg1, ssize_t len1, - const char *arg2, ssize_t len2, pg_locale_t locale); +extern int pg_strncoll(const char *arg1, size_t len1, + const char *arg2, size_t len2, pg_locale_t locale); extern bool pg_strxfrm_enabled(pg_locale_t locale); extern size_t pg_strxfrm(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); extern bool pg_strxfrm_prefix_enabled(pg_locale_t locale); extern size_t pg_strxfrm_prefix(char *dest, const char *src, size_t destsize, pg_locale_t locale); extern size_t pg_strnxfrm_prefix(char *dest, size_t destsize, const char *src, - ssize_t srclen, pg_locale_t locale); + size_t srclen, pg_locale_t locale); extern bool pg_iswdigit(pg_wchar wc, pg_locale_t locale); extern bool pg_iswalpha(pg_wchar wc, pg_locale_t locale); -- 2.43.0 ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. @ 2026-05-14 21:58 Jeff Davis <[email protected]> parent: Jeff Davis <[email protected]> 1 sibling, 0 replies; 6+ messages in thread From: Jeff Davis @ 2026-05-14 21:58 UTC (permalink / raw) To: Andres Freund <[email protected]>; +Cc: pgsql-hackers On Tue, 2026-05-05 at 13:23 -0700, Jeff Davis wrote: > Patch attached. It causes a bit of churn, so one disadvantage is that > it will complicate future backports in this area. I plan to commit this soon. Regards, Jeff Davis ^ permalink raw reply [nested|flat] 6+ messages in thread
* Re: Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. @ 2026-05-14 22:17 Andres Freund <[email protected]> parent: Jeff Davis <[email protected]> 1 sibling, 0 replies; 6+ messages in thread From: Andres Freund @ 2026-05-14 22:17 UTC (permalink / raw) To: Jeff Davis <[email protected]>; +Cc: pgsql-hackers Hi, On 2026-05-05 13:23:12 -0700, Jeff Davis wrote: > Agreed. I did this to match up with the ICU API a bit better, but if > it's interfering with useful tools, then the special cases aren't worth > it. > Patch attached. Thanks! > It causes a bit of churn, so one disadvantage is that it will complicate > future backports in this area. I think it's worth the gain in instrument-ability. I also suspect it's good for runtime performance; adding all those branches can't be particularly good. Greetings, Andres Freund ^ permalink raw reply [nested|flat] 6+ messages in thread
end of thread, other threads:[~2026-05-14 22:17 UTC | newest] Thread overview: 6+ messages (download: mbox.gz follow: Atom feed) -- links below jump to the message on this page -- 2024-08-22 18:00 Refactor: allow pg_strncoll(), etc., to accept -1 length for NUL-terminated cstrings. Jeff Davis <[email protected]> 2024-09-21 00:28 ` Jeff Davis <[email protected]> 2026-05-01 16:40 ` Andres Freund <[email protected]> 2026-05-05 20:23 ` Jeff Davis <[email protected]> 2026-05-14 21:58 ` Jeff Davis <[email protected]> 2026-05-14 22:17 ` Andres Freund <[email protected]>
This inbox is served by agora; see mirroring instructions for how to clone and mirror all data and code used for this inbox.