public inbox for [email protected]
help / color / mirror / Atom feedFrom: Martijn van Oosterhout <[email protected]>
To: Gregory Maxwell <[email protected]>
Cc: Bruce Momjian <[email protected]>
Cc: Gavin Sherry <[email protected]>
Cc: Peter Eisentraut <[email protected]>
Cc: [email protected]
Subject: Re: Upcoming PG re-releases
Date: Fri, 9 Dec 2005 17:17:33 +0100
Message-ID: <[email protected]> (raw)
In-Reply-To: <[email protected]>
References: <[email protected]>
<[email protected]>
<[email protected]>
On Thu, Dec 08, 2005 at 05:54:35PM -0500, Gregory Maxwell wrote:
> No, what is needed for people who care about fixing their data is a
> loadable strip_invalid_utf8() that works in older versions.. then just
> select * from bar where foo != strip_invalid_utf8(foo); The function
> would be useful in general, for example, if you have an application
> which doesn't already have much utf8 logic, you want to use a text
> field, and stripping is the behaviour you want. For example, lots of
> simple web applications.
Would something like the following work? It's written in pl/pgsql and
does (AFAICS) the same checking as the backend in recent releases.
Except the backend only supports up to 4-byte UTF-8 whereas this
function checks upto six byte. For a six byte UTF-8 character, who is
wrong?
In any case, people should be able to do something like:
SELECT field FROM table WHERE NOT utf8_verify(field,4);
To check conformance with PostgreSQL 8.1. Note, I don't have large
chunks of UTF-8 to test with but it works for the characters I tried
with. Tested with 7.4.
Have a nice day,
--
Martijn van Oosterhout <[email protected]> http://svana.org/kleptog/
> Patent. n. Genius is 5% inspiration and 95% perspiration. A patent is a
> tool for doing 5% of the work and then sitting around waiting for someone
> else to do the other 95% so you can sue them.
create or replace function utf8_verify(bytea,integer) returns bool as '
DECLARE
str ALIAS FOR $1;
maxlen ALIAS FOR $2;
strlen INTEGER;
i integer;
j INTEGER;
len integer;
chr integer;
wchr integer;
BEGIN
i := 0;
strlen := length(str);
WHILE i < strlen LOOP
-- Check leading byte
chr := get_byte(str,i);
IF chr < 128 THEN -- 0x00 - 0x80 - single byte
len := 1;
wchr := chr;
ELSIF chr < 192 THEN -- 0x80 - 0xC0 - illegal
RETURN false;
ELSIF chr < 224 THEN -- 0xC0 - 0xE0 - two bytes
len := 2;
wchr := chr - 192;
ELSIF chr < 240 THEN -- 0xE0 - 0xF0 - three bytes
len := 3;
wchr := chr - 224;
ELSIF chr < 248 THEN -- 0xF0 - 0xF8 - four bytes
len := 4;
wchr := chr - 240;
ELSIF chr < 252 THEN -- 0xF8 - 0xFC - five bytes
len := 5;
wchr := chr - 248;
ELSIF chr < 254 THEN -- 0xFC - 0xFE - six bytes
len := 6;
wchr := chr - 252;
ELSE
RETURN false; -- FE and FF not currently defined
END IF;
IF i + len > strlen THEN
RETURN false;
END IF;
IF len > maxlen THEN
RETURN false;
END IF;
-- Check remaining characters
j := 1;
WHILE len > j LOOP
chr := get_byte(str, i+j);
IF chr < 128 OR chr >= 192 THEN
RETURN false;
END IF;
wchr := (wchr << 6) + (chr - 192);
j := j+1;
END LOOP;
-- Verify shortest possible string
IF len = 1 AND wchr >= 128 THEN
RETURN false;
ELSIF len = 2 AND (wchr < 128 OR wchr >= 2048) THEN
RETURN false;
ELSIF len = 3 AND (wchr < 2048 OR wchr >= 65536) THEN
RETURN false;
ELSIF len = 4 AND (wchr < 65536 OR wchr >= 2097152) THEN
RETURN false;
ELSIF len = 5 AND (wchr < 2097152 OR wchr >= 67108864) THEN
RETURN false;
ELSIF len = 6 AND (wchr < 67108864 OR wchr >= 2147483648) THEN
RETURN false;
END IF;
-- RAISE NOTICE ''Checked char offset %, OK (wchr=%,len=%)'', i, wchr, len;
i := i+len;
END LOOP;
RETURN true;
END;
' language plpgsql;
Attachments:
[text/plain] utf8_verify.sql (2.1K, 2-utf8_verify.sql)
download | inline:
create or replace function utf8_verify(bytea,integer) returns bool as '
DECLARE
str ALIAS FOR $1;
maxlen ALIAS FOR $2;
strlen INTEGER;
i integer;
j INTEGER;
len integer;
chr integer;
wchr integer;
BEGIN
i := 0;
strlen := length(str);
WHILE i < strlen LOOP
-- Check leading byte
chr := get_byte(str,i);
IF chr < 128 THEN -- 0x00 - 0x80 - single byte
len := 1;
wchr := chr;
ELSIF chr < 192 THEN -- 0x80 - 0xC0 - illegal
RETURN false;
ELSIF chr < 224 THEN -- 0xC0 - 0xE0 - two bytes
len := 2;
wchr := chr - 192;
ELSIF chr < 240 THEN -- 0xE0 - 0xF0 - three bytes
len := 3;
wchr := chr - 224;
ELSIF chr < 248 THEN -- 0xF0 - 0xF8 - four bytes
len := 4;
wchr := chr - 240;
ELSIF chr < 252 THEN -- 0xF8 - 0xFC - five bytes
len := 5;
wchr := chr - 248;
ELSIF chr < 254 THEN -- 0xFC - 0xFE - six bytes
len := 6;
wchr := chr - 252;
ELSE
RETURN false; -- FE and FF not currently defined
END IF;
IF i + len > strlen THEN
RETURN false;
END IF;
IF len > maxlen THEN
RETURN false;
END IF;
-- Check remaining characters
j := 1;
WHILE len > j LOOP
chr := get_byte(str, i+j);
IF chr < 128 OR chr >= 192 THEN
RETURN false;
END IF;
wchr := (wchr << 6) + (chr - 192);
j := j+1;
END LOOP;
-- Verify shortest possible string
IF len = 1 AND wchr >= 128 THEN
RETURN false;
ELSIF len = 2 AND (wchr < 128 OR wchr >= 2048) THEN
RETURN false;
ELSIF len = 3 AND (wchr < 2048 OR wchr >= 65536) THEN
RETURN false;
ELSIF len = 4 AND (wchr < 65536 OR wchr >= 2097152) THEN
RETURN false;
ELSIF len = 5 AND (wchr < 2097152 OR wchr >= 67108864) THEN
RETURN false;
ELSIF len = 6 AND (wchr < 67108864 OR wchr >= 2147483648) THEN
RETURN false;
END IF;
-- RAISE NOTICE ''Checked char offset %, OK (wchr=%,len=%)'', i, wchr, len;
i := i+len;
END LOOP;
RETURN true;
END;
' language plpgsql;
view thread (55+ messages) latest in thread
reply
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Reply to all the recipients using the --to and --cc options:
reply via email
To: [email protected]
Cc: [email protected], [email protected], [email protected], [email protected], [email protected], [email protected]
Subject: Re: Upcoming PG re-releases
In-Reply-To: <[email protected]>
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
This inbox is served by agora; see mirroring instructions
for how to clone and mirror all data and code used for this inbox