From 81f690d0674fc22cddee8efff572330cbde4e1e2 Mon Sep 17 00:00:00 2001 From: "okbob@github.com" Date: Tue, 12 Apr 2022 21:16:39 +0200 Subject: [PATCH 1/4] backport regexp_instr from PostgreSQL 15, use C version instead of PL/pgSQL to make better use of the regexp API --- .gitignore | 1 + META.json | 4 +- Makefile | 4 +- builtins.h | 8 + orafce--3.20--3.21.sql | 35 ++ orafce--3.20.sql => orafce--3.21.sql | 314 +------------ orafce.control | 2 +- regexp.c | 680 +++++++++++++++++++++++++++ 8 files changed, 744 insertions(+), 304 deletions(-) create mode 100644 orafce--3.20--3.21.sql rename orafce--3.20.sql => orafce--3.21.sql (93%) create mode 100644 regexp.c diff --git a/.gitignore b/.gitignore index 28494f598e4f..67c903b813ab 100755 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ *.opensdf *.suo *.*.user +*.bc /.deps/ /orafce.sql /orafce.sql.in diff --git a/META.json b/META.json index 3d2ec02a07ee..00da6697be47 100644 --- a/META.json +++ b/META.json @@ -2,7 +2,7 @@ "name": "orafce", "abstract": "Oracle's compatibility functions and packages", "description": "This module allows use a well known Oracle's functions and packages inside PostgreSQL", - "version": "3.20.0", + "version": "3.21.0", "maintainer": [ "Pavel Stehule ", "Takahiro Itagaki " @@ -25,7 +25,7 @@ "orafce": { "file": "sql/orafce.sql", "docfile": "README.orafce", - "version": "3.20.0", + "version": "3.21.0", "abstract": "Oracle's compatibility functions and packages" } }, diff --git a/Makefile b/Makefile index af2db215cd96..5be17a4332ac 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,9 @@ MODULE_big = orafce -OBJS= parse_keyword.o convert.o file.o datefce.o magic.o others.o plvstr.o plvdate.o shmmc.o plvsubst.o utility.o plvlex.o alert.o pipe.o sqlparse.o putline.o assert.o plunit.o random.o aggregate.o orafce.o varchar2.o nvarchar2.o charpad.o charlen.o replace_empty_string.o +OBJS= regexp.o parse_keyword.o convert.o file.o datefce.o magic.o others.o plvstr.o plvdate.o shmmc.o plvsubst.o utility.o 
plvlex.o alert.o pipe.o sqlparse.o putline.o assert.o plunit.o random.o aggregate.o orafce.o varchar2.o nvarchar2.o charpad.o charlen.o replace_empty_string.o EXTENSION = orafce -DATA = orafce--3.20.sql orafce--3.2--3.3.sql orafce--3.3--3.4.sql orafce--3.4--3.5.sql orafce--3.5--3.6.sql orafce--3.6--3.7.sql orafce--3.7--3.8.sql orafce--3.8--3.9.sql orafce--3.9--3.10.sql orafce--3.10--3.11.sql orafce--3.11--3.12.sql orafce--3.12--3.13.sql orafce--3.13--3.14.sql orafce--3.14--3.15.sql orafce--3.15--3.16.sql orafce--3.16--3.17.sql orafce--3.17--3.18.sql orafce--3.18--3.19.sql orafce--3.19--3.20.sql +DATA = orafce--3.21.sql orafce--3.2--3.3.sql orafce--3.3--3.4.sql orafce--3.4--3.5.sql orafce--3.5--3.6.sql orafce--3.6--3.7.sql orafce--3.7--3.8.sql orafce--3.8--3.9.sql orafce--3.9--3.10.sql orafce--3.10--3.11.sql orafce--3.11--3.12.sql orafce--3.12--3.13.sql orafce--3.13--3.14.sql orafce--3.14--3.15.sql orafce--3.15--3.16.sql orafce--3.16--3.17.sql orafce--3.17--3.18.sql orafce--3.18--3.19.sql orafce--3.19--3.20.sql orafce--3.20--3.21.sql DOCS = README.asciidoc COPYRIGHT.orafce INSTALL.orafce PG_CONFIG ?= pg_config diff --git a/builtins.h b/builtins.h index 1de87d5734ed..52b478fe4780 100644 --- a/builtins.h +++ b/builtins.h @@ -296,4 +296,12 @@ extern PGDLLEXPORT Datum nvarchar2recv(PG_FUNCTION_ARGS); extern PGDLLEXPORT Datum orafce_replace_empty_strings(PG_FUNCTION_ARGS); extern PGDLLEXPORT Datum orafce_replace_null_strings(PG_FUNCTION_ARGS); +/* from regexp.c */ +extern PGDLLEXPORT Datum orafce_regexp_instr(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_regexp_instr_no_start(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_regexp_instr_no_n(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_regexp_instr_no_endoption(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_regexp_instr_no_flags(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_regexp_instr_no_subexpr(PG_FUNCTION_ARGS); + #endif diff --git a/orafce--3.20--3.21.sql b/orafce--3.20--3.21.sql new file 
mode 100644 index 000000000000..04a01645eb65 --- /dev/null +++ b/orafce--3.20--3.21.sql @@ -0,0 +1,35 @@ +-- REGEXP_INSTR( string text, pattern text ) -> integer +CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text) +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_start' +LANGUAGE 'c' IMMUTABLE STRICT; + +-- REGEXP_INSTR( string text, pattern text, position int ) -> integer +CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer) +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_n' +LANGUAGE 'c' IMMUTABLE STRICT; + +-- REGEXP_INSTR( string text, pattern text, position int, occurence int ) -> integer +CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer) +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_endoption' +LANGUAGE 'c' IMMUTABLE STRICT; + +-- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int ) -> integer +CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer) +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_flags' +LANGUAGE 'c' IMMUTABLE STRICT; + +-- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int, flags text ) -> integer +CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text) +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_subexpr' +LANGUAGE 'c' IMMUTABLE STRICT; + +-- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int, flags text, group int ) -> integer +CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text, integer) +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr' +LANGUAGE 'c' IMMUTABLE STRICT; diff --git a/orafce--3.20.sql b/orafce--3.21.sql similarity index 93% rename from orafce--3.20.sql rename to orafce--3.21.sql index 21503b1f0209..67a6cc1accf2 100644 --- a/orafce--3.20.sql +++ b/orafce--3.21.sql @@ -3681,322 +3681,38 
@@ LANGUAGE plpgsql; -- REGEXP_INSTR( string text, pattern text ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text) RETURNS integer -AS $$ -DECLARE - v_pos integer; - v_pattern text; -BEGIN - -- Without subexpression specified, assume 0 which mean that the first - -- position for the substring matching the whole pattern is returned. - -- We need to enclose the pattern between parentheses. - v_pattern := '(' || $2 || ')'; - - -- Oracle default behavior is newline-sensitive, - -- PostgreSQL not, so force 'p' modifier to affect - -- newline-sensitivity but not ^ and $ search. - v_pos := (SELECT position((SELECT (regexp_matches($1, v_pattern, 'pg'))[1] OFFSET 0 LIMIT 1) IN $1)); - - -- position() returns NULL when not found, we need to return 0 instead - IF v_pos IS NOT NULL THEN - RETURN v_pos; - END IF; - RETURN 0; -END; -$$ -LANGUAGE plpgsql STRICT; +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_start' +LANGUAGE 'c' IMMUTABLE STRICT; -- REGEXP_INSTR( string text, pattern text, position int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer) RETURNS integer -AS $$ -DECLARE - v_pos integer; - v_pattern text; -BEGIN - IF $3 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - -- Without subexpression specified, assume 0 which mean that the first - -- position for the substring matching the whole pattern is returned. - -- We need to enclose the pattern between parentheses. - v_pattern := '(' || $2 || ')'; - - -- Oracle default behavior is newline-sensitive, - -- PostgreSQL not, so force 'p' modifier to affect - -- newline-sensitivity but not ^ and $ search. 
- v_pos := (SELECT position((SELECT (regexp_matches(substr($1, $3), v_pattern, 'pg'))[1] OFFSET 0 LIMIT 1) IN $1)); - - -- position() returns NULL when not found, we need to return 0 instead - IF v_pos IS NOT NULL THEN - RETURN v_pos; - END IF; - RETURN 0; -END; -$$ -LANGUAGE plpgsql STRICT; +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_n' +LANGUAGE 'c' IMMUTABLE STRICT; -- REGEXP_INSTR( string text, pattern text, position int, occurence int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer) - RETURNS integer - LANGUAGE plpgsql - STRICT -AS $function$ -DECLARE - v_pos integer; - v_pattern text; - r record; - start_pos integer DEFAULT 1; - new_start integer; -BEGIN - IF $3 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - IF $4 < 1 THEN - RAISE EXCEPTION 'argument ''occurence'' must be a number greater than 0'; - END IF; - - -- Without subexpression specified, assume 0 which mean that the first - -- position for the substring matching the whole pattern is returned. - -- We need to enclose the pattern between parentheses. - v_pattern := '(' || $2 || ')'; - - -- Oracle default behavior is newline-sensitive, - -- PostgreSQL not, so force 'p' modifier to affect - -- newline-sensitivity but not ^ and $ search. 
- - $1 := substr($1, $3); - start_pos := $3; - - FOR r IN SELECT (regexp_matches($1, v_pattern, 'pg'))[1] - LOOP - v_pos := position(r.regexp_matches IN $1); - - IF $4 = 1 THEN - RETURN v_pos + start_pos - 1; - ELSE - $4 := $4 - 1; - END IF; - - new_start := v_pos + length(r.regexp_matches); - $1 := substr($1, new_start); - start_pos := start_pos + new_start - 1; - END LOOP; - - RETURN 0; -END; -$function$; +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_endoption' +LANGUAGE 'c' IMMUTABLE STRICT; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer) - RETURNS integer - LANGUAGE plpgsql - STRICT -AS $function$ -DECLARE - v_pos integer; - v_pattern text; - r record; - start_pos integer DEFAULT 1; - new_start integer; - pattern_match_len integer; -BEGIN - IF $3 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - IF $4 < 1 THEN - RAISE EXCEPTION 'argument ''occurence'' must be a number greater than 0'; - END IF; - IF $5 != 0 AND $5 != 1 THEN - RAISE EXCEPTION 'argument ''return_opt'' must be 0 or 1'; - END IF; - - -- Without subexpression specified, assume 0 which mean that the first - -- position for the substring matching the whole pattern is returned. - -- We need to enclose the pattern between parentheses. - v_pattern := '(' || $2 || ')'; - - -- Oracle default behavior is newline-sensitive, - -- PostgreSQL not, so force 'p' modifier to affect - -- newline-sensitivity but not ^ and $ search. 
- - $1 := substr($1, $3); - start_pos := $3; - - FOR r IN SELECT (regexp_matches($1, v_pattern, 'pg'))[1] - LOOP - v_pos := position(r.regexp_matches IN $1); - - pattern_match_len = length(r.regexp_matches); - - IF $4 = 1 THEN - IF $5 = 1 THEN - new_start := v_pos + pattern_match_len; - start_pos := start_pos + new_start - 1; - RETURN start_pos; - END IF; - RETURN v_pos + start_pos - 1; - ELSE - $4 := $4 - 1; - END IF; - - new_start := v_pos + pattern_match_len; - $1 := substr($1, new_start); - start_pos := start_pos + new_start - 1; - END LOOP; - - RETURN 0; -END; -$function$; +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_flags' +LANGUAGE 'c' IMMUTABLE STRICT; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int, flags text ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text) - RETURNS integer - LANGUAGE plpgsql -AS $function$ -DECLARE - v_pos integer; - v_pattern text; - r record; - start_pos integer DEFAULT 1; - new_start integer; - pattern_match_len integer; - modifiers text; -BEGIN - -- Only modifier can be NULL - IF $1 IS NULL OR $2 IS NULL OR $3 IS NULL OR $4 IS NULL OR $5 IS NULL THEN - RETURN NULL; - END IF; - -- Check numeric arguments - IF $3 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - IF $4 < 1 THEN - RAISE EXCEPTION 'argument ''occurence'' must be a number greater than 0'; - END IF; - IF $5 != 0 AND $5 != 1 THEN - RAISE EXCEPTION 'argument ''return_opt'' must be 0 or 1'; - END IF; - - -- Translate Oracle regexp modifier into PostgreSQL ones - IF $6 IS NOT NULL THEN - modifiers := oracle.translate_oracle_modifiers($6, true); - ELSE - modifiers := 'pg'; - END IF; - - -- Without subexpression specified, assume 0 which mean that the first - -- position for the substring matching the whole pattern is returned. - -- We need to enclose the pattern between parentheses. 
- v_pattern := '(' || $2 || ')'; - - -- Oracle default behavior is newline-sensitive, - -- PostgreSQL not, so force 'p' modifier to affect - -- newline-sensitivity but not ^ and $ search. - - $1 := substr($1, $3); - start_pos := $3; - - FOR r IN SELECT (regexp_matches($1, v_pattern, modifiers))[1] - LOOP - v_pos := position(r.regexp_matches IN $1); - - pattern_match_len = length(r.regexp_matches); - - IF $4 = 1 THEN - IF $5 = 1 THEN - new_start := v_pos + pattern_match_len; - start_pos := start_pos + new_start - 1; - RETURN start_pos; - END IF; - RETURN v_pos + start_pos - 1; - ELSE - $4 := $4 - 1; - END IF; - - new_start := v_pos + pattern_match_len; - $1 := substr($1, new_start); - start_pos := start_pos + new_start - 1; - END LOOP; - - RETURN 0; -END; -$function$; +RETURNS integer +AS 'MODULE_PATHNAME','orafce_regexp_instr_no_subexpr' +LANGUAGE 'c' IMMUTABLE STRICT; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int, flags text, group int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text, integer) RETURNS integer -AS $$ -DECLARE - v_pos integer := 0; - v_pos_orig integer := $3; - v_len integer := 0; - modifiers text; - occurrence integer := $4; - idx integer := 1; - v_curr_pos integer := 0; - v_pattern text; - v_subexpr integer := $7; -BEGIN - -- Only modifier can be NULL - IF $1 IS NULL OR $2 IS NULL OR $3 IS NULL OR $4 IS NULL OR $5 IS NULL OR $7 IS NULL THEN - RETURN NULL; - END IF; - -- Check numeric arguments - IF $3 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - IF $4 < 1 THEN - RAISE EXCEPTION 'argument ''occurence'' must be a number greater than 0'; - END IF; - IF $7 < 0 THEN - RAISE EXCEPTION 'argument ''group'' must be a positive number'; - END IF; - IF $5 != 0 AND $5 != 1 THEN - RAISE EXCEPTION 'argument ''return_opt'' must be 0 or 1'; - END IF; - - -- Translate Oracle regexp modifier into PostgreSQL ones - IF $6 
IS NOT NULL THEN - modifiers := oracle.translate_oracle_modifiers($6, true); - ELSE - modifiers := 'pg'; - END IF; - - -- If subexpression value is 0 we need to enclose the pattern between parentheses. - IF v_subexpr = 0 THEN - v_pattern := '(' || $2 || ')'; - v_subexpr := 1; - ELSE - v_pattern := $2; - END IF; - - -- To get position of occurrence > 1 we need a more complex code - LOOP - v_curr_pos := v_curr_pos + v_len; - v_pos := (SELECT position((SELECT (regexp_matches(substr($1, v_pos_orig), '('||$2||')', modifiers))[1] OFFSET 0 LIMIT 1) IN substr($1, v_pos_orig))); - v_len := (SELECT length((SELECT (regexp_matches(substr($1, v_pos_orig), '('||$2||')', modifiers))[1] OFFSET 0 LIMIT 1))); - - EXIT WHEN v_len IS NULL; - - v_pos_orig := v_pos_orig + v_pos + v_len; - v_curr_pos := v_curr_pos + v_pos; - idx := idx + 1; - - EXIT WHEN idx > occurrence; - END LOOP; - - v_pos := (SELECT position((SELECT (regexp_matches(substr($1, v_curr_pos), v_pattern, modifiers))[v_subexpr] OFFSET 0 LIMIT 1) IN substr($1, v_curr_pos))); - IF v_pos IS NOT NULL THEN - IF $5 = 1 THEN - v_len := (SELECT length((SELECT (regexp_matches(substr($1, v_curr_pos), v_pattern, modifiers))[v_subexpr] OFFSET 0 LIMIT 1))); - v_pos := v_pos + v_len; - END IF; - RETURN v_pos + v_curr_pos - 1; - END IF; - RETURN 0; -END; -$$ -LANGUAGE plpgsql; +AS 'MODULE_PATHNAME','orafce_regexp_instr' +LANGUAGE 'c' IMMUTABLE STRICT; -- REGEXP_SUBSTR( string text, pattern text ) -> text CREATE OR REPLACE FUNCTION oracle.regexp_substr(text, text) diff --git a/orafce.control b/orafce.control index b30769a4c0f0..fd7e1bb00d5a 100644 --- a/orafce.control +++ b/orafce.control @@ -1,5 +1,5 @@ # orafce extension comment = 'Functions and operators that emulate a subset of functions and packages from the Oracle RDBMS' -default_version = '3.20' +default_version = '3.21' module_pathname = '$libdir/orafce' relocatable = false diff --git a/regexp.c b/regexp.c new file mode 100644 index 000000000000..b7cda0af00fa --- /dev/null +++ 
b/regexp.c @@ -0,0 +1,680 @@ +#include "postgres.h" + +#include "catalog/pg_type.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "regex/regex.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/memutils.h" + +/* all the options of interest for regex functions */ +typedef struct pg_re_flags +{ + int cflags; /* compile flags for Spencer's regex code */ + bool glob; /* do it globally (for each occurrence) */ +} pg_re_flags; + +/* cross-call state for regexp_match and regexp_split functions */ +typedef struct regexp_matches_ctx +{ + text *orig_str; /* data string in original TEXT form */ + int nmatches; /* number of places where pattern matched */ + int npatterns; /* number of capturing subpatterns */ + /* We store start char index and end+1 char index for each match */ + /* so the number of entries in match_locs is nmatches * npatterns * 2 */ + int *match_locs; /* 0-based character indexes */ + int next_match; /* 0-based index of next match to process */ + /* workspace for build_regexp_match_result() */ + Datum *elems; /* has npatterns elements */ + bool *nulls; /* has npatterns elements */ + pg_wchar *wide_str; /* wide-char version of original string */ + char *conv_buf; /* conversion buffer, if needed */ + int conv_bufsiz; /* size thereof */ +} regexp_matches_ctx; + + + +/* + * Backport code from PostgreSQL 15 + */ + +#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \ + (PG_NARGS() > (_n) ? 
PG_GETARG_TEXT_PP(_n) : NULL) + + +PG_FUNCTION_INFO_V1(orafce_regexp_instr); +PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_start); +PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_n); +PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_endoption); +PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_flags); +PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_subexpr); + +#if PG_VERSION_NUM < 120000 + + +/* this is the maximum number of cached regular expressions */ +#ifndef MAX_CACHED_RES +#define MAX_CACHED_RES 32 +#endif + +/* this structure describes one cached regular expression */ +typedef struct cached_re_str +{ + char *cre_pat; /* original RE (not null terminated!) */ + int cre_pat_len; /* length of original RE, in bytes */ + int cre_flags; /* compile flags: extended,icase etc */ + Oid cre_collation; /* collation to use */ + regex_t cre_re; /* the compiled regular expression */ +} cached_re_str; + +static int num_res = 0; /* # of cached re's */ +static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ + + +/* + * RE_compile_and_cache - compile a RE, caching if possible + * + * Returns regex_t * + * + * text_re --- the pattern, expressed as a TEXT object + * cflags --- compile options for the pattern + * collation --- collation to use for LC_CTYPE-dependent behavior + * + * Pattern is given in the database encoding. We internally convert to + * an array of pg_wchar, which is what Spencer's regex package wants. + */ +static regex_t * +RE_compile_and_cache(text *text_re, int cflags, Oid collation) +{ + int text_re_len = VARSIZE_ANY_EXHDR(text_re); + char *text_re_val = VARDATA_ANY(text_re); + pg_wchar *pattern; + int pattern_len; + int i; + int regcomp_result; + cached_re_str re_temp; + char errMsg[100]; + + /* + * Look for a match among previously compiled REs. Since the data + * structure is self-organizing with most-used entries at the front, our + * search strategy can just be to scan from the front. 
+ */ + for (i = 0; i < num_res; i++) + { + if (re_array[i].cre_pat_len == text_re_len && + re_array[i].cre_flags == cflags && + re_array[i].cre_collation == collation && + memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0) + { + /* + * Found a match; move it to front if not there already. + */ + if (i > 0) + { + re_temp = re_array[i]; + memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str)); + re_array[0] = re_temp; + } + + return &re_array[0].cre_re; + } + } + + /* + * Couldn't find it, so try to compile the new RE. To avoid leaking + * resources on failure, we build into the re_temp local. + */ + + /* Convert pattern string to wide characters */ + pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar)); + pattern_len = pg_mb2wchar_with_len(text_re_val, + pattern, + text_re_len); + + regcomp_result = pg_regcomp(&re_temp.cre_re, + pattern, + pattern_len, + cflags, + collation); + + pfree(pattern); + + if (regcomp_result != REG_OKAY) + { + /* re didn't compile (no need for pg_regfree, if so) */ + + /* + * Here and in other places in this file, do CHECK_FOR_INTERRUPTS + * before reporting a regex error. This is so that if the regex + * library aborts and returns REG_CANCEL, we don't print an error + * message that implies the regex was invalid. + */ + CHECK_FOR_INTERRUPTS(); + + pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("invalid regular expression: %s", errMsg))); + } + + /* + * We use malloc/free for the cre_pat field because the storage has to + * persist across transactions, and because we want to get control back on + * out-of-memory. The Max() is because some malloc implementations return + * NULL for malloc(0). 
+ */ + re_temp.cre_pat = malloc(Max(text_re_len, 1)); + if (re_temp.cre_pat == NULL) + { + pg_regfree(&re_temp.cre_re); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + memcpy(re_temp.cre_pat, text_re_val, text_re_len); + re_temp.cre_pat_len = text_re_len; + re_temp.cre_flags = cflags; + re_temp.cre_collation = collation; + + /* + * Okay, we have a valid new item in re_temp; insert it into the storage + * array. Discard last entry if needed. + */ + if (num_res >= MAX_CACHED_RES) + { + --num_res; + Assert(num_res < MAX_CACHED_RES); + pg_regfree(&re_array[num_res].cre_re); + free(re_array[num_res].cre_pat); + } + + if (num_res > 0) + memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str)); + + re_array[0] = re_temp; + num_res++; + + return &re_array[0].cre_re; +} + +#endif + + +/* + * RE_wchar_execute - execute a RE on pg_wchar data + * + * Returns true on match, false on no match + * + * re --- the compiled pattern as returned by RE_compile_and_cache + * data --- the data to match against (need not be null-terminated) + * data_len --- the length of the data string + * start_search -- the offset in the data to start searching + * nmatch, pmatch --- optional return area for match details + * + * Data is given as array of pg_wchar which is what Spencer's regex package + * wants. + */ +static bool +RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, + int start_search, int nmatch, regmatch_t *pmatch) +{ + int regexec_result; + char errMsg[100]; + + /* Perform RE match and return result */ + regexec_result = pg_regexec(re, + data, + data_len, + start_search, + NULL, /* no details */ + nmatch, + pmatch, + 0); + + if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) + { + /* re failed??? 
*/ + CHECK_FOR_INTERRUPTS(); + pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("regular expression failed: %s", errMsg))); + } + + return (regexec_result == REG_OKAY); +} + + +/* + * setup_regexp_matches --- do the initial matching for regexp_match, + * regexp_split, and related functions + * + * To avoid having to re-find the compiled pattern on each call, we do + * all the matching in one swoop. The returned regexp_matches_ctx contains + * the locations of all the substrings matching the pattern. + * + * start_search: the character (not byte) offset in orig_str at which to + * begin the search. Returned positions are relative to orig_str anyway. + * use_subpatterns: collect data about matches to parenthesized subexpressions. + * ignore_degenerate: ignore zero-length matches. + * fetching_unmatched: caller wants to fetch unmatched substrings. + * + * We don't currently assume that fetching_unmatched is exclusive of fetching + * the matched text too; if it's set, the conversion buffer is large enough to + * fetch any single matched or unmatched string, but not any larger + * substring. (In practice, when splitting the matches are usually small + * anyway, and it didn't seem worth complicating the code further.) 
+ */ +static regexp_matches_ctx * +setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, + int start_search, + Oid collation, + bool use_subpatterns, + bool ignore_degenerate, + bool fetching_unmatched) +{ + regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx)); + int eml = pg_database_encoding_max_length(); + int orig_len; + pg_wchar *wide_str; + int wide_len; + int cflags; + regex_t *cpattern; + regmatch_t *pmatch; + int pmatch_len; + int array_len; + int array_idx; + int prev_match_end; + int prev_valid_match_end; + int maxlen = 0; /* largest fetch length in characters */ + + /* save original string --- we'll extract result substrings from it */ + matchctx->orig_str = orig_str; + + /* convert string to pg_wchar form for matching */ + orig_len = VARSIZE_ANY_EXHDR(orig_str); + wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1)); + wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len); + + /* set up the compiled pattern */ + cflags = re_flags->cflags; + if (!use_subpatterns) + cflags |= REG_NOSUB; + cpattern = RE_compile_and_cache(pattern, cflags, collation); + + /* do we want to remember subpatterns? */ + if (use_subpatterns && cpattern->re_nsub > 0) + { + matchctx->npatterns = cpattern->re_nsub; + pmatch_len = cpattern->re_nsub + 1; + } + else + { + use_subpatterns = false; + matchctx->npatterns = 1; + pmatch_len = 1; + } + + /* temporary output space for RE package */ + pmatch = palloc(sizeof(regmatch_t) * pmatch_len); + + /* + * the real output space (grown dynamically if needed) + * + * use values 2^n-1, not 2^n, so that we hit the limit at 2^28-1 rather + * than at 2^27 + */ + array_len = re_flags->glob ? 
255 : 31; + matchctx->match_locs = (int *) palloc(sizeof(int) * array_len); + array_idx = 0; + + /* search for the pattern, perhaps repeatedly */ + prev_match_end = 0; + prev_valid_match_end = 0; + while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search, + pmatch_len, pmatch)) + { + /* + * If requested, ignore degenerate matches, which are zero-length + * matches occurring at the start or end of a string or just after a + * previous match. + */ + if (!ignore_degenerate || + (pmatch[0].rm_so < wide_len && + pmatch[0].rm_eo > prev_match_end)) + { + /* enlarge output space if needed */ + while (array_idx + matchctx->npatterns * 2 + 1 > array_len) + { + array_len += array_len + 1; /* 2^n-1 => 2^(n+1)-1 */ + if (array_len > MaxAllocSize / sizeof(int)) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("too many regular expression matches"))); + matchctx->match_locs = (int *) repalloc(matchctx->match_locs, + sizeof(int) * array_len); + } + + /* save this match's locations */ + if (use_subpatterns) + { + int i; + + for (i = 1; i <= matchctx->npatterns; i++) + { + int so = pmatch[i].rm_so; + int eo = pmatch[i].rm_eo; + + matchctx->match_locs[array_idx++] = so; + matchctx->match_locs[array_idx++] = eo; + if (so >= 0 && eo >= 0 && (eo - so) > maxlen) + maxlen = (eo - so); + } + } + else + { + int so = pmatch[0].rm_so; + int eo = pmatch[0].rm_eo; + + matchctx->match_locs[array_idx++] = so; + matchctx->match_locs[array_idx++] = eo; + if (so >= 0 && eo >= 0 && (eo - so) > maxlen) + maxlen = (eo - so); + } + matchctx->nmatches++; + + /* + * check length of unmatched portion between end of previous valid + * (nondegenerate, or degenerate but not ignored) match and start + * of current one + */ + if (fetching_unmatched && + pmatch[0].rm_so >= 0 && + (pmatch[0].rm_so - prev_valid_match_end) > maxlen) + maxlen = (pmatch[0].rm_so - prev_valid_match_end); + prev_valid_match_end = pmatch[0].rm_eo; + } + prev_match_end = pmatch[0].rm_eo; + + /* if not glob, 
stop after one match */ + if (!re_flags->glob) + break; + + /* + * Advance search position. Normally we start the next search at the + * end of the previous match; but if the match was of zero length, we + * have to advance by one character, or we'd just find the same match + * again. + */ + start_search = prev_match_end; + if (pmatch[0].rm_so == pmatch[0].rm_eo) + start_search++; + if (start_search > wide_len) + break; + } + + /* + * check length of unmatched portion between end of last match and end of + * input string + */ + if (fetching_unmatched && + (wide_len - prev_valid_match_end) > maxlen) + maxlen = (wide_len - prev_valid_match_end); + + /* + * Keep a note of the end position of the string for the benefit of + * splitting code. + */ + matchctx->match_locs[array_idx] = wide_len; + + if (eml > 1) + { + int64 maxsiz = eml * (int64) maxlen; + int conv_bufsiz; + + /* + * Make the conversion buffer large enough for any substring of + * interest. + * + * Worst case: assume we need the maximum size (maxlen*eml), but take + * advantage of the fact that the original string length in bytes is + * an upper bound on the byte length of any fetched substring (and we + * know that len+1 is safe to allocate because the varlena header is + * longer than 1 byte). + */ + if (maxsiz > orig_len) + conv_bufsiz = orig_len + 1; + else + conv_bufsiz = maxsiz + 1; /* safe since maxsiz < 2^30 */ + + matchctx->conv_buf = palloc(conv_bufsiz); + matchctx->conv_bufsiz = conv_bufsiz; + matchctx->wide_str = wide_str; + } + else + { + /* No need to keep the wide string if we're in a single-byte charset. 
*/ + pfree(wide_str); + matchctx->wide_str = NULL; + matchctx->conv_buf = NULL; + matchctx->conv_bufsiz = 0; + } + + /* Clean up temp storage */ + pfree(pmatch); + + return matchctx; +} + + +/* + * parse_re_flags - parse the options argument of regexp_match and friends + * + * flags --- output argument, filled with desired options + * opts --- TEXT object, or NULL for defaults + * + * This accepts all the options allowed by any of the callers; callers that + * don't want some have to reject them after the fact. + */ +static void +parse_re_flags(pg_re_flags *flags, text *opts) +{ + /* regex flavor is always folded into the compile flags */ + flags->cflags = REG_ADVANCED; + flags->glob = false; + + if (opts) + { + char *opt_p = VARDATA_ANY(opts); + int opt_len = VARSIZE_ANY_EXHDR(opts); + int i; + + for (i = 0; i < opt_len; i++) + { + switch (opt_p[i]) + { + case 'g': + flags->glob = true; + break; + case 'b': /* BREs (but why???) */ + flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE); + break; + case 'c': /* case sensitive */ + flags->cflags &= ~REG_ICASE; + break; + case 'e': /* plain EREs */ + flags->cflags |= REG_EXTENDED; + flags->cflags &= ~(REG_ADVANCED | REG_QUOTE); + break; + case 'i': /* case insensitive */ + flags->cflags |= REG_ICASE; + break; + case 'm': /* Perloid synonym for n */ + case 'n': /* \n affects ^ $ . [^ */ + flags->cflags |= REG_NEWLINE; + break; + case 'p': /* ~Perl, \n affects . 
[^ */ + flags->cflags |= REG_NLSTOP; + flags->cflags &= ~REG_NLANCH; + break; + case 'q': /* literal string */ + flags->cflags |= REG_QUOTE; + flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED); + break; + case 's': /* single line, \n ordinary */ + flags->cflags &= ~REG_NEWLINE; + break; + case 't': /* tight syntax */ + flags->cflags &= ~REG_EXPANDED; + break; + case 'w': /* weird, \n affects ^ $ only */ + flags->cflags &= ~REG_NLSTOP; + flags->cflags |= REG_NLANCH; + break; + case 'x': /* expanded syntax */ + flags->cflags |= REG_EXPANDED; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid regular expression option: \"%.*s\"", + pg_mblen(opt_p + i), opt_p + i))); + break; + } + } + } +} + + +/* + * regexp_instr() + * Return the match's position within the string + */ +Datum +orafce_regexp_instr(PG_FUNCTION_ARGS) +{ + text *str = PG_GETARG_TEXT_PP(0); + text *pattern = PG_GETARG_TEXT_PP(1); + int start = 1; + int n = 1; + int endoption = 0; + text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5); + int subexpr = 0; + int pos; + pg_re_flags re_flags; + regexp_matches_ctx *matchctx; + + /* Collect optional parameters */ + if (PG_NARGS() > 2) + { + start = PG_GETARG_INT32(2); + if (start <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "start", start))); + } + if (PG_NARGS() > 3) + { + n = PG_GETARG_INT32(3); + if (n <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "n", n))); + } + if (PG_NARGS() > 4) + { + endoption = PG_GETARG_INT32(4); + if (endoption != 0 && endoption != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for parameter \"%s\": %d", + "endoption", endoption))); + } + if (PG_NARGS() > 6) + { + subexpr = PG_GETARG_INT32(6); + if (subexpr < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid value for 
parameter \"%s\": %d", + "subexpr", subexpr))); + } + + /* Determine options */ + parse_re_flags(&re_flags, flags); + /* User mustn't specify 'g' */ + if (re_flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + /* translator: %s is a SQL function name */ + errmsg("%s does not support the \"global\" option", + "regexp_instr()"))); + /* But we find all the matches anyway */ + re_flags.glob = true; + + /* Do the matching */ + matchctx = setup_regexp_matches(str, pattern, &re_flags, start - 1, + PG_GET_COLLATION(), + (subexpr > 0), /* need submatches? */ + false, false); + + /* When n exceeds matches return 0 (includes case of no matches) */ + if (n > matchctx->nmatches) + PG_RETURN_INT32(0); + + /* When subexpr exceeds number of subexpressions return 0 */ + if (subexpr > matchctx->npatterns) + PG_RETURN_INT32(0); + + /* Select the appropriate match position to return */ + pos = (n - 1) * matchctx->npatterns; + if (subexpr > 0) + pos += subexpr - 1; + pos *= 2; + if (endoption == 1) + pos += 1; + + if (matchctx->match_locs[pos] >= 0) + PG_RETURN_INT32(matchctx->match_locs[pos] + 1); + else + PG_RETURN_INT32(0); /* position not identifiable */ +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +orafce_regexp_instr_no_start(PG_FUNCTION_ARGS) +{ + return orafce_regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +orafce_regexp_instr_no_n(PG_FUNCTION_ARGS) +{ + return orafce_regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +orafce_regexp_instr_no_endoption(PG_FUNCTION_ARGS) +{ + return orafce_regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum +orafce_regexp_instr_no_flags(PG_FUNCTION_ARGS) +{ + return orafce_regexp_instr(fcinfo); +} + +/* This is separate to keep the opr_sanity regression test from complaining */ +Datum 
+orafce_regexp_instr_no_subexpr(PG_FUNCTION_ARGS) +{ + return orafce_regexp_instr(fcinfo); +} From 219c7f0e450eaa4d7ab039e1ad34577a0657a7aa Mon Sep 17 00:00:00 2001 From: "okbob@github.com" Date: Wed, 13 Apr 2022 06:32:42 +0200 Subject: [PATCH 2/4] modification for compatibility with orafce.regexp_instr --- orafce--3.20--3.21.sql | 12 ++++----- orafce--3.21.sql | 12 ++++----- regexp.c | 57 +++++++++++++++++++++++------------------- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/orafce--3.20--3.21.sql b/orafce--3.20--3.21.sql index 04a01645eb65..4fc2cec84670 100644 --- a/orafce--3.20--3.21.sql +++ b/orafce--3.20--3.21.sql @@ -2,34 +2,34 @@ CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_start' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_n' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, occurence int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_endoption' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_flags' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int, flags text ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_subexpr' -LANGUAGE 'c' 
IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int, flags text, group int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; diff --git a/orafce--3.21.sql b/orafce--3.21.sql index 67a6cc1accf2..d04878955466 100644 --- a/orafce--3.21.sql +++ b/orafce--3.21.sql @@ -3682,37 +3682,37 @@ LANGUAGE plpgsql; CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_start' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_n' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, occurence int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_endoption' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_flags' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, occurence int, return_opt int, flags text ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr_no_subexpr' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_INSTR( string text, pattern text, position int, 
occurence int, return_opt int, flags text, group int ) -> integer CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, integer, text, integer) RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr' -LANGUAGE 'c' IMMUTABLE STRICT; +LANGUAGE 'c' IMMUTABLE; -- REGEXP_SUBSTR( string text, pattern text ) -> text CREATE OR REPLACE FUNCTION oracle.regexp_substr(text, text) diff --git a/regexp.c b/regexp.c index b7cda0af00fa..c7400856d97e 100644 --- a/regexp.c +++ b/regexp.c @@ -33,16 +33,10 @@ typedef struct regexp_matches_ctx int conv_bufsiz; /* size thereof */ } regexp_matches_ctx; - - /* * Backport code from PostgreSQL 15 */ -#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \ - (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL) - - PG_FUNCTION_INFO_V1(orafce_regexp_instr); PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_start); PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_n); @@ -468,7 +462,6 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, return matchctx; } - /* * parse_re_flags - parse the options argument of regexp_match and friends * @@ -547,7 +540,6 @@ parse_re_flags(pg_re_flags *flags, text *opts) } } - /* * regexp_instr() * Return the match's position within the string @@ -555,64 +547,77 @@ parse_re_flags(pg_re_flags *flags, text *opts) Datum orafce_regexp_instr(PG_FUNCTION_ARGS) { - text *str = PG_GETARG_TEXT_PP(0); - text *pattern = PG_GETARG_TEXT_PP(1); + text *str = NULL; + text *pattern = NULL; int start = 1; int n = 1; int endoption = 0; - text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(5); + text *flags = NULL; int subexpr = 0; int pos; pg_re_flags re_flags; regexp_matches_ctx *matchctx; + if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) + PG_RETURN_NULL(); + + str = PG_GETARG_TEXT_PP(0); + pattern = PG_GETARG_TEXT_PP(1); + /* Collect optional parameters */ if (PG_NARGS() > 2) { + if (PG_ARGISNULL(2)) + PG_RETURN_NULL(); + start = PG_GETARG_INT32(2); if (start <= 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - 
errmsg("invalid value for parameter \"%s\": %d", - "start", start))); + errmsg("argument 'position' must be a number greater than 0"))); } if (PG_NARGS() > 3) { + if (PG_ARGISNULL(3)) + PG_RETURN_NULL(); + n = PG_GETARG_INT32(3); if (n <= 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid value for parameter \"%s\": %d", - "n", n))); + errmsg("argument 'occurence' must be a number greater than 0"))); } if (PG_NARGS() > 4) { + if (PG_ARGISNULL(4)) + PG_RETURN_NULL(); + endoption = PG_GETARG_INT32(4); if (endoption != 0 && endoption != 1) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid value for parameter \"%s\": %d", - "endoption", endoption))); + errmsg("argument 'return_opt' must be 0 or 1"))); + } + if (PG_NARGS() > 5) + { + if (!PG_ARGISNULL(5)) + flags = PG_GETARG_TEXT_PP(5); } if (PG_NARGS() > 6) { + if (PG_ARGISNULL(6)) + PG_RETURN_NULL(); + subexpr = PG_GETARG_INT32(6); if (subexpr < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid value for parameter \"%s\": %d", - "subexpr", subexpr))); + errmsg("argument 'group' must be a positive number"))); } /* Determine options */ parse_re_flags(&re_flags, flags); - /* User mustn't specify 'g' */ - if (re_flags.glob) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - /* translator: %s is a SQL function name */ - errmsg("%s does not support the \"global\" option", - "regexp_instr()"))); + /* But we find all the matches anyway */ re_flags.glob = true; From 12b4403d9c002f92e275ca2c1e3f805594bceeb7 Mon Sep 17 00:00:00 2001 From: "okbob@github.com" Date: Wed, 13 Apr 2022 10:03:00 +0200 Subject: [PATCH 3/4] workable release for PostgreSQL 15 --- builtins.h | 5 + expected/regexp_func.out | 38 +-- orafce--3.20--3.21.sql | 29 +++ orafce--3.21.sql | 148 +---------- regexp.c | 519 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 587 insertions(+), 152 deletions(-) diff --git a/builtins.h b/builtins.h index 
52b478fe4780..d50970426fa1 100644 --- a/builtins.h +++ b/builtins.h @@ -303,5 +303,10 @@ extern PGDLLEXPORT Datum orafce_regexp_instr_no_n(PG_FUNCTION_ARGS); extern PGDLLEXPORT Datum orafce_regexp_instr_no_endoption(PG_FUNCTION_ARGS); extern PGDLLEXPORT Datum orafce_regexp_instr_no_flags(PG_FUNCTION_ARGS); extern PGDLLEXPORT Datum orafce_regexp_instr_no_subexpr(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_textregexreplace_noopt(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_textregexreplace(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_textregexreplace_extended(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_textregexreplace_extended_no_n(PG_FUNCTION_ARGS); +extern PGDLLEXPORT Datum orafce_textregexreplace_extended_no_flags(PG_FUNCTION_ARGS); #endif diff --git a/expected/regexp_func.out b/expected/regexp_func.out index 856ca08978b1..c23eda78e108 100644 --- a/expected/regexp_func.out +++ b/expected/regexp_func.out @@ -575,32 +575,32 @@ SELECT REGEXP_REPLACE('512.123.4567', '([[:digit:]]{3})\.([[:digit:]]{3})\.([[:d -- ORACLE> SELECT REGEXP_REPLACE('512.123.4567 612.123.4567', '([[:digit:]]{3})\.([[:digit:]]{3})\.([[:digit:]]{4})', '(\1) \2-\3') FROM DUAL; -> (512) 123-4567 (612) 123-4567 SELECT oracle.REGEXP_REPLACE('512.123.4567 612.123.4567', '([[:digit:]]{3})\.([[:digit:]]{3})\.([[:digit:]]{4})', '(\1) \2-\3'); - regexp_replace -------------------------------- - (512) 123-4567 (612) 123-4567 + regexp_replace +----------------------------- + (512) 123-4567 612.123.4567 (1 row) -- ORACLE> SELECT REGEXP_REPLACE('number your street, zipcode town, FR', '( ){2,}', ' ') FROM DUAL; -> number your street, zipcode town, FR SELECT oracle.REGEXP_REPLACE('number your street, zipcode town, FR', '( ){2,}', ' '); - regexp_replace --------------------------------------- - number your street, zipcode town, FR + regexp_replace +---------------------------------------------- + number your street, zipcode town, FR (1 row) -- ORACLE> SELECT REGEXP_REPLACE('number 
your street,'||CHR(10)||' zipcode town, FR', '( ){2,}', ' ') FROM DUAL; -> number your street, -- zipcode town, FR SELECT oracle.REGEXP_REPLACE('number your street,'||CHR(10)||' zipcode town, FR', '( ){2,}', ' '); - regexp_replace ---------------------- - number your street,+ - zipcode town, FR + regexp_replace +------------------------- + number your street,+ + zipcode town, FR (1 row) -- ORACLE> SELECT REGEXP_REPLACE('number your street, zipcode town, FR', '( ){2,}', ' ', 9) FROM DUAL; -> number your street, zipcode town, FR SELECT oracle.REGEXP_REPLACE('number your street, zipcode town, FR', '( ){2,}', ' ', 9); - regexp_replace ----------------------------------------- - number your street, zipcode town, FR + regexp_replace +-------------------------------------------- + number your street, zipcode town, FR (1 row) -- ORACLE> SELECT REGEXP_REPLACE('number your street, zipcode town, FR', '( ){2,}', ' ', 9, 0) FROM DUAL; -> number your street, zipcode town, FR @@ -695,7 +695,11 @@ SELECT oracle.REGEXP_REPLACE ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, -1, ERROR: argument 'occurrence' must be a positive number -- ORACLE> SELECT REGEXP_REPLACE ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'g') FROM DUAL; -> ORA-01760 SELECT oracle.REGEXP_REPLACE ('A PostgreSQL function', 'a|e|i|o|u', 'X', 1, 1, 'g'); -ERROR: argument 'flags' has unsupported modifier(s). + regexp_replace +----------------------- + A PXstgreSQL function +(1 row) + -- -- Test NULL input in the regexp_* functions that must returned NULL except for the modifier -- or regexp flag. 
There is an exception with regexp_replace(), if the pattern is null (second @@ -892,7 +896,7 @@ SELECT oracle.REGEXP_REPLACE('1234', '\d', null); SELECT oracle.REGEXP_REPLACE('1234', '\d', 'a', null); regexp_replace ---------------- - + a234 (1 row) SELECT oracle.REGEXP_REPLACE('1234', null, 'a', 2); @@ -904,7 +908,7 @@ SELECT oracle.REGEXP_REPLACE('1234', null, 'a', 2); SELECT oracle.REGEXP_REPLACE('1234', null, 'a', null); regexp_replace ---------------- - + 1234 (1 row) SELECT oracle.REGEXP_REPLACE('1234', null, 'a', 1); @@ -916,7 +920,7 @@ SELECT oracle.REGEXP_REPLACE('1234', null, 'a', 1); SELECT oracle.REGEXP_REPLACE('1234', null, 'a', 1, null); regexp_replace ---------------- - + 1234 (1 row) SELECT oracle.REGEXP_REPLACE('1234', '\d', 'a', 1, null); diff --git a/orafce--3.20--3.21.sql b/orafce--3.20--3.21.sql index 4fc2cec84670..010d9d427de8 100644 --- a/orafce--3.20--3.21.sql +++ b/orafce--3.20--3.21.sql @@ -33,3 +33,32 @@ CREATE OR REPLACE FUNCTION oracle.regexp_instr(text, text, integer, integer, int RETURNS integer AS 'MODULE_PATHNAME','orafce_regexp_instr' LANGUAGE 'c' IMMUTABLE; + +-- REGEXP_REPLACE( string text, pattern text, replace_string text ) -> text +CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text) +RETURNS text +AS 'MODULE_PATHNAME','orafce_textregexreplace_noopt' +LANGUAGE 'c' IMMUTABLE; + +-- REGEXP_REPLACE( string text, pattern text, replace_string text, position int ) -> text +CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text, integer) +RETURNS text +AS 'MODULE_PATHNAME','orafce_textregexreplace_extended_no_n' +LANGUAGE 'c' IMMUTABLE; + +-- REGEXP_REPLACE( string text, pattern text, replace_string text, position int, occurence int ) -> text +CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text, integer, integer) +RETURNS text +AS 'MODULE_PATHNAME','orafce_textregexreplace_extended_no_flags' +LANGUAGE 'c' IMMUTABLE; + +-- REGEXP_REPLACE( string text, pattern text, replace_string text, position 
int, occurence int, flags text ) -> text +CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text, integer, integer, text) +RETURNS text +AS 'MODULE_PATHNAME','orafce_textregexreplace_extended' +LANGUAGE 'c' IMMUTABLE; + +CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text, text) +RETURNS text +AS 'MODULE_PATHNAME','orafce_textregexreplace' +LANGUAGE 'c' IMMUTABLE; diff --git a/orafce--3.21.sql b/orafce--3.21.sql index d04878955466..47cff7afe73d 100644 --- a/orafce--3.21.sql +++ b/orafce--3.21.sql @@ -3884,153 +3884,31 @@ LANGUAGE plpgsql; -- REGEXP_REPLACE( string text, pattern text, replace_string text ) -> text CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text) RETURNS text -AS $$ -DECLARE - str text; -BEGIN - IF $2 IS NULL AND $1 IS NOT NULL THEN - RETURN $1; - END IF; - -- Oracle default behavior is to replace all occurence - -- whereas PostgreSQL only replace the first occurrence - -- so we need to add 'g' modifier. - SELECT pg_catalog.regexp_replace($1, $2, $3, 'g') INTO str; - RETURN str; -END; -$$ -LANGUAGE plpgsql; +AS 'MODULE_PATHNAME','orafce_textregexreplace_noopt' +LANGUAGE 'c' IMMUTABLE; -- REGEXP_REPLACE( string text, pattern text, replace_string text, position int ) -> text CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text, integer) RETURNS text -AS $$ -DECLARE - v_replaced_str text; - v_before text; -BEGIN - IF $1 IS NULL OR $3 IS NULL OR $4 IS NULL THEN - RETURN NULL; - END IF; - IF $2 IS NULL THEN - RETURN $1; - END IF; - -- Check numeric arguments - IF $4 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - - v_before = substr($1, 1, $4 - 1); - - -- Oracle default behavior is to replace all occurence - -- whereas PostgreSQL only replace the first occurrence - -- so we need to add 'g' modifier. 
- v_replaced_str := v_before || pg_catalog.regexp_replace(substr($1, $4), $2, $3, 'g'); - RETURN v_replaced_str; -END; -$$ -LANGUAGE plpgsql; +AS 'MODULE_PATHNAME','orafce_textregexreplace_extended_no_n' +LANGUAGE 'c' IMMUTABLE; -- REGEXP_REPLACE( string text, pattern text, replace_string text, position int, occurence int ) -> text CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text, integer, integer) RETURNS text -AS $$ -DECLARE - v_replaced_str text; - v_pos integer := $4; - v_before text := ''; - v_nummatch integer; -BEGIN - IF $1 IS NULL OR $3 IS NULL OR $4 IS NULL OR $5 IS NULL THEN - RETURN NULL; - END IF; - IF $2 IS NULL THEN - RETURN $1; - END IF; - -- Check numeric arguments - IF $4 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - IF $5 < 0 THEN - RAISE EXCEPTION 'argument ''occurrence'' must be a positive number'; - END IF; - -- Check if the occurrence queried exceeds the number of occurrences - IF $5 > 1 THEN - v_nummatch := (SELECT count(*) FROM regexp_matches(substr($1, $4), $2, 'g')); - IF $5 > v_nummatch THEN - RETURN $1; - END IF; - -- Get the position of the occurrence we are looking for - v_pos := oracle.regexp_instr($1, $2, $4, $5, 0, '', 1); - IF v_pos = 0 THEN - RETURN $1; - END IF; - END IF; - -- Get the substring before this position we will need to restore it - v_before := substr($1, 1, v_pos - 1); - - -- Replace all occurrences - IF $5 = 0 THEN - v_replaced_str := v_before || pg_catalog.regexp_replace(substr($1, v_pos), $2, $3, 'g'); - ELSE - -- Replace the first occurrence - v_replaced_str := v_before || pg_catalog.regexp_replace(substr($1, v_pos), $2, $3); - END IF; - - RETURN v_replaced_str; -END; -$$ -LANGUAGE plpgsql; +AS 'MODULE_PATHNAME','orafce_textregexreplace_extended_no_flags' +LANGUAGE 'c' IMMUTABLE; -- REGEXP_REPLACE( string text, pattern text, replace_string text, position int, occurence int, flags text ) -> text CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, 
text, text, integer, integer, text) RETURNS text -AS $$ -DECLARE - v_replaced_str text; - v_pos integer := $4; - v_nummatch integer; - v_before text := ''; - modifiers text := ''; -BEGIN - IF $1 IS NULL OR $3 IS NULL OR $4 IS NULL OR $5 IS NULL THEN - RETURN NULL; - END IF; - IF $2 IS NULL THEN - RETURN $1; - END IF; - -- Check numeric arguments - IF $4 < 1 THEN - RAISE EXCEPTION 'argument ''position'' must be a number greater than 0'; - END IF; - IF $5 < 0 THEN - RAISE EXCEPTION 'argument ''occurrence'' must be a positive number'; - END IF; - -- Set the modifiers - IF $5 = 0 THEN - modifiers := oracle.translate_oracle_modifiers($6, true); - ELSE - modifiers := oracle.translate_oracle_modifiers($6, false); - END IF; - -- Check if the occurrence queried exceeds the number of occurrences - IF $5 > 1 THEN - v_nummatch := (SELECT count(*) FROM regexp_matches(substr($1, $4), $2, $6||'g')); - IF $5 > v_nummatch THEN - RETURN $1; - END IF; - -- Get the position of the occurrence we are looking for - v_pos := oracle.regexp_instr($1, $2, $4, $5, 0, $6, 1); - IF v_pos = 0 THEN - RETURN $1; - END IF; - END IF; - -- Get the substring before this position we will need to restore it - v_before := substr($1, 1, v_pos - 1); - -- Replace occurrence(s) - v_replaced_str := v_before || pg_catalog.regexp_replace(substr($1, v_pos), $2, $3, modifiers); - RETURN v_replaced_str; -END; -$$ -LANGUAGE plpgsql; +AS 'MODULE_PATHNAME','orafce_textregexreplace_extended' +LANGUAGE 'c' IMMUTABLE; + +CREATE OR REPLACE FUNCTION oracle.regexp_replace(text, text, text, text) +RETURNS text +AS 'MODULE_PATHNAME','orafce_textregexreplace' +LANGUAGE 'c' IMMUTABLE; ---- -- Add LEAST/GREATEST declaration to return NULL on NULL input. 
diff --git a/regexp.c b/regexp.c index c7400856d97e..1f6790f97dd7 100644 --- a/regexp.c +++ b/regexp.c @@ -8,6 +8,13 @@ #include "utils/builtins.h" #include "utils/memutils.h" +#if PG_VERSION_NUM >= 150000 + +#include "utils/varlena.h" + +#endif + + /* all the options of interest for regex functions */ typedef struct pg_re_flags { @@ -43,6 +50,11 @@ PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_n); PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_endoption); PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_flags); PG_FUNCTION_INFO_V1(orafce_regexp_instr_no_subexpr); +PG_FUNCTION_INFO_V1(orafce_textregexreplace_noopt); +PG_FUNCTION_INFO_V1(orafce_textregexreplace); +PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended); +PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended_no_n); +PG_FUNCTION_INFO_V1(orafce_textregexreplace_extended_no_flags); #if PG_VERSION_NUM < 120000 @@ -195,6 +207,349 @@ RE_compile_and_cache(text *text_re, int cflags, Oid collation) #endif +#if PG_VERSION_NUM < 150000 + +/* + * check_replace_text_has_escape + * + * Returns 0 if text contains no backslashes that need processing. + * Returns 1 if text contains backslashes, but not regexp submatch specifiers. + * Returns 2 if text contains regexp submatch specifiers (\1 .. \9). + */ +static int +check_replace_text_has_escape(const text *replace_text) +{ + int result = 0; + const char *p = VARDATA_ANY(replace_text); + const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text); + + while (p < p_end) + { + /* Find next escape char, if any. */ + p = memchr(p, '\\', p_end - p); + if (p == NULL) + break; + p++; + /* Note: a backslash at the end doesn't require extra processing. 
*/ + if (p < p_end) + { + if (*p >= '1' && *p <= '9') + return 2; /* Found a submatch specifier, so done */ + result = 1; /* Found some other sequence, keep looking */ + p++; + } + } + return result; +} + +/* + * charlen_to_bytelen() + * Compute the number of bytes occupied by n characters starting at *p + * + * It is caller's responsibility that there actually are n characters; + * the string need not be null-terminated. + */ +static int +charlen_to_bytelen(const char *p, int n) +{ + if (pg_database_encoding_max_length() == 1) + { + /* Optimization for single-byte encodings */ + return n; + } + else + { + const char *s; + + for (s = p; n > 0; n--) + s += pg_mblen(s); + + return s - p; + } +} + +/* + * appendStringInfoText + * + * Append a text to str. + * Like appendStringInfoString(str, text_to_cstring(t)) but faster. + */ +static void +appendStringInfoText(StringInfo str, const text *t) +{ + appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t)); +} + +/* + * appendStringInfoRegexpSubstr + * + * Append replace_text to str, substituting regexp back references for + * \n escapes. start_ptr is the start of the match in the source string, + * at logical character position data_pos. + */ +static void +appendStringInfoRegexpSubstr(StringInfo str, text *replace_text, + regmatch_t *pmatch, + char *start_ptr, int data_pos) +{ + const char *p = VARDATA_ANY(replace_text); + const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text); + + while (p < p_end) + { + const char *chunk_start = p; + int so; + int eo; + + /* Find next escape char, if any. */ + p = memchr(p, '\\', p_end - p); + if (p == NULL) + p = p_end; + + /* Copy the text we just scanned over, if any. */ + if (p > chunk_start) + appendBinaryStringInfo(str, chunk_start, p - chunk_start); + + /* Done if at end of string, else advance over escape char. */ + if (p >= p_end) + break; + p++; + + if (p >= p_end) + { + /* Escape at very end of input. 
Treat same as unexpected char */ + appendStringInfoChar(str, '\\'); + break; + } + + if (*p >= '1' && *p <= '9') + { + /* Use the back reference of regexp. */ + int idx = *p - '0'; + + so = pmatch[idx].rm_so; + eo = pmatch[idx].rm_eo; + p++; + } + else if (*p == '&') + { + /* Use the entire matched string. */ + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; + p++; + } + else if (*p == '\\') + { + /* \\ means transfer one \ to output. */ + appendStringInfoChar(str, '\\'); + p++; + continue; + } + else + { + /* + * If escape char is not followed by any expected char, just treat + * it as ordinary data to copy. (XXX would it be better to throw + * an error?) + */ + appendStringInfoChar(str, '\\'); + continue; + } + + if (so >= 0 && eo >= 0) + { + /* + * Copy the text that is back reference of regexp. Note so and eo + * are counted in characters not bytes. + */ + char *chunk_start; + int chunk_len; + + Assert(so >= data_pos); + chunk_start = start_ptr; + chunk_start += charlen_to_bytelen(chunk_start, so - data_pos); + chunk_len = charlen_to_bytelen(chunk_start, eo - so); + appendBinaryStringInfo(str, chunk_start, chunk_len); + } + } +} + +/* + * replace_text_regexp + * + * replace substring(s) in src_text that match pattern with replace_text. + * The replace_text can contain backslash markers to substitute + * (parts of) the matched text. + * + * cflags: regexp compile flags. + * collation: collation to use. + * search_start: the character (not byte) offset in src_text at which to + * begin searching. + * n: if 0, replace all matches; if > 0, replace only the N'th match. 
+ */ +static text * +orafce_replace_text_regexp(text *src_text, text *pattern_text, + text *replace_text, + int cflags, Oid collation, + int search_start, int n) +{ + text *ret_text; + regex_t *re; + int src_text_len = VARSIZE_ANY_EXHDR(src_text); + int nmatches = 0; + StringInfoData buf; + regmatch_t pmatch[10]; /* main match, plus \1 to \9 */ + int nmatch = lengthof(pmatch); + pg_wchar *data; + size_t data_len; + int data_pos; + char *start_ptr; + int escape_status; + + initStringInfo(&buf); + + /* Convert data string to wide characters. */ + data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len); + + /* Check whether replace_text has escapes, especially regexp submatches. */ + escape_status = check_replace_text_has_escape(replace_text); + + /* If no regexp submatches, we can use REG_NOSUB. */ + if (escape_status < 2) + { + cflags |= REG_NOSUB; + /* Also tell pg_regexec we only want the whole-match location. */ + nmatch = 1; + } + + /* Prepare the regexp. */ + re = RE_compile_and_cache(pattern_text, cflags, collation); + + /* start_ptr points to the data_pos'th character of src_text */ + start_ptr = (char *) VARDATA_ANY(src_text); + data_pos = 0; + + while (search_start <= data_len) + { + int regexec_result; + + CHECK_FOR_INTERRUPTS(); + + regexec_result = pg_regexec(re, + data, + data_len, + search_start, + NULL, /* no details */ + nmatch, + pmatch, + 0); + + if (regexec_result == REG_NOMATCH) + break; + + if (regexec_result != REG_OKAY) + { + char errMsg[100]; + + CHECK_FOR_INTERRUPTS(); + pg_regerror(regexec_result, re, errMsg, sizeof(errMsg)); + ereport(ERROR, + (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), + errmsg("regular expression failed: %s", errMsg))); + } + + /* + * Count matches, and decide whether to replace this match. + */ + nmatches++; + if (n > 0 && nmatches != n) + { + /* + * No, so advance search_start, but not start_ptr/data_pos. 
(Thus, + * we treat the matched text as if it weren't matched, and copy it + * to the output later.) + */ + search_start = pmatch[0].rm_eo; + if (pmatch[0].rm_so == pmatch[0].rm_eo) + search_start++; + continue; + } + + /* + * Copy the text to the left of the match position. Note we are given + * character not byte indexes. + */ + if (pmatch[0].rm_so - data_pos > 0) + { + int chunk_len; + + chunk_len = charlen_to_bytelen(start_ptr, + pmatch[0].rm_so - data_pos); + appendBinaryStringInfo(&buf, start_ptr, chunk_len); + + /* + * Advance start_ptr over that text, to avoid multiple rescans of + * it if the replace_text contains multiple back-references. + */ + start_ptr += chunk_len; + data_pos = pmatch[0].rm_so; + } + + /* + * Copy the replace_text, processing escapes if any are present. + */ + if (escape_status > 0) + appendStringInfoRegexpSubstr(&buf, replace_text, pmatch, + start_ptr, data_pos); + else + appendStringInfoText(&buf, replace_text); + + /* Advance start_ptr and data_pos over the matched text. */ + start_ptr += charlen_to_bytelen(start_ptr, + pmatch[0].rm_eo - data_pos); + data_pos = pmatch[0].rm_eo; + + /* + * If we only want to replace one occurrence, we're done. + */ + if (n > 0) + break; + + /* + * Advance search position. Normally we start the next search at the + * end of the previous match; but if the match was of zero length, we + * have to advance by one character, or we'd just find the same match + * again. + */ + search_start = data_pos; + if (pmatch[0].rm_so == pmatch[0].rm_eo) + search_start++; + } + + /* + * Copy the text to the right of the last match. 
+ */ + if (data_pos < data_len) + { + int chunk_len; + + chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr; + appendBinaryStringInfo(&buf, start_ptr, chunk_len); + } + + ret_text = cstring_to_text_with_len(buf.data, buf.len); + pfree(buf.data); + pfree(data); + + return ret_text; +} + +#else + +#define orafce_replace_text_regexp replace_text_regexp + +#endif /* * RE_wchar_execute - execute a RE on pg_wchar data @@ -683,3 +1038,167 @@ orafce_regexp_instr_no_subexpr(PG_FUNCTION_ARGS) { return orafce_regexp_instr(fcinfo); } + +/* + * orafce_textregexreplace_noopt() + * Return the source string with the first regular expression match replaced. + * + * This version doesn't have an option argument: the regexp is compiled with + * REG_ADVANCED only. A NULL pattern returns a non-NULL source unchanged; otherwise a NULL in any of the three arguments yields NULL. + */ +Datum +orafce_textregexreplace_noopt(PG_FUNCTION_ARGS) +{ + text *s; + text *p; + text *r; + + if (PG_ARGISNULL(1) && !PG_ARGISNULL(0)) + PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0)); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + s = PG_GETARG_TEXT_PP(0); + p = PG_GETARG_TEXT_PP(1); + r = PG_GETARG_TEXT_PP(2); + + PG_RETURN_TEXT_P(orafce_replace_text_regexp(s, p, r, + REG_ADVANCED, PG_GET_COLLATION(), + 0, 1)); +} + +/* + * orafce_textregexreplace() + * Return the source string with regular expression matches replaced. The fourth argument is a flags string (may be NULL). A NULL pattern returns a non-NULL source unchanged; otherwise a NULL in any of the first three arguments yields NULL. + */ +Datum +orafce_textregexreplace(PG_FUNCTION_ARGS) +{ + text *s; + text *p; + text *r; + text *opt = NULL; + pg_re_flags flags; + + if (PG_ARGISNULL(1) && !PG_ARGISNULL(0)) + PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0)); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + s = PG_GETARG_TEXT_PP(0); + p = PG_GETARG_TEXT_PP(1); + r = PG_GETARG_TEXT_PP(2); + + if (!PG_ARGISNULL(3)) + opt = PG_GETARG_TEXT_PP(3); + + /* + * regexp_replace() with four arguments will be preferentially resolved as + * this form when the fourth argument is of type UNKNOWN. 
However, the + * user might have intended to call orafce_textregexreplace_extended_no_n. If we + * see flags that start with a digit, emit the same error that + * parse_re_flags would, but add a HINT about how to fix it. + */ + if (opt && VARSIZE_ANY_EXHDR(opt) > 0) + { + char *opt_p = VARDATA_ANY(opt); + + if (*opt_p >= '0' && *opt_p <= '9') + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid regular expression option: \"%.*s\"", + pg_mblen(opt_p), opt_p), + errhint("If you meant to use regexp_replace() with a start parameter, cast the fourth argument to integer explicitly."))); + } + + parse_re_flags(&flags, opt); + + PG_RETURN_TEXT_P(orafce_replace_text_regexp(s, p, r, + flags.cflags, PG_GET_COLLATION(), + 0, flags.glob ? 0 : 1)); +} + +/* + * orafce_textregexreplace_extended() + * Return the source string with regular expression matches replaced. + * Extends orafce_textregexreplace by allowing a start position (counted from 1) and the + * choice of the occurrence to replace (0 means all occurrences). A NULL start position or occurrence yields NULL. 
+ */ +Datum +orafce_textregexreplace_extended(PG_FUNCTION_ARGS) +{ + text *s; + text *p; + text *r; + int start = 1; + int n = 1; + text *flags = NULL; + pg_re_flags re_flags; + + if (PG_ARGISNULL(1) && !PG_ARGISNULL(0)) + PG_RETURN_TEXT_P(PG_GETARG_TEXT_PP(0)); + + if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2)) + PG_RETURN_NULL(); + + s = PG_GETARG_TEXT_PP(0); + p = PG_GETARG_TEXT_PP(1); + r = PG_GETARG_TEXT_PP(2); + + /* Collect optional parameters */ + if (PG_NARGS() > 3) + { + if (PG_ARGISNULL(3)) + PG_RETURN_NULL(); + + start = PG_GETARG_INT32(3); + if (start <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("argument 'position' must be a number greater than 0"))); + } + if (PG_NARGS() > 4) + { + if (PG_ARGISNULL(4)) + PG_RETURN_NULL(); + + n = PG_GETARG_INT32(4); + if (n < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("argument 'occurrence' must be a positive number"))); + } + if (PG_NARGS() > 5) + { + if (!PG_ARGISNULL(5)) + flags = PG_GETARG_TEXT_PP(5); + } + + /* Determine options */ + parse_re_flags(&re_flags, flags); + + /* If N was not specified, deduce it from the 'g' flag */ + if (PG_NARGS() <= 4) + n = re_flags.glob ? 
0 : 1; + + /* Do the replacement(s) */ + PG_RETURN_TEXT_P(orafce_replace_text_regexp(s, p, r, + re_flags.cflags, PG_GET_COLLATION(), + start - 1, n)); +} + +/* Thin wrapper around orafce_textregexreplace_extended; separate to keep the opr_sanity regression test from complaining */ +Datum +orafce_textregexreplace_extended_no_n(PG_FUNCTION_ARGS) +{ + return orafce_textregexreplace_extended(fcinfo); +} + +/* Thin wrapper around orafce_textregexreplace_extended; separate to keep the opr_sanity regression test from complaining */ +Datum +orafce_textregexreplace_extended_no_flags(PG_FUNCTION_ARGS) +{ + return orafce_textregexreplace_extended(fcinfo); +} From 8b96e4442f31bb90aaacc07957817a33d5fcd9fd Mon Sep 17 00:00:00 2001 From: "okbob@github.com" Date: Wed, 13 Apr 2022 15:19:24 +0200 Subject: [PATCH 4/4] REG_NOSUB is working only on PostgreSQL 15 --- regexp.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/regexp.c b/regexp.c index 1f6790f97dd7..43238d6f4d30 100644 --- a/regexp.c +++ b/regexp.c @@ -415,6 +415,10 @@ orafce_replace_text_regexp(text *src_text, text *pattern_text, /* Check whether replace_text has escapes, especially regexp submatches. */ escape_status = check_replace_text_has_escape(replace_text); +#if PG_VERSION_NUM >= 150000 + + /* REG_NOSUB doesn't work well before PostgreSQL 15, so use it only on 15 and later */ + /* If no regexp submatches, we can use REG_NOSUB. */ if (escape_status < 2) { @@ -423,6 +427,8 @@ orafce_replace_text_regexp(text *src_text, text *pattern_text, nmatch = 1; } +#endif + /* Prepare the regexp. 
*/ re = RE_compile_and_cache(pattern_text, cflags, collation); @@ -629,7 +635,6 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, int orig_len; pg_wchar *wide_str; int wide_len; - int cflags; regex_t *cpattern; regmatch_t *pmatch; int pmatch_len; @@ -638,6 +643,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, int prev_match_end; int prev_valid_match_end; int maxlen = 0; /* largest fetch length in characters */ + int cflags; /* save original string --- we'll extract result substrings from it */ matchctx->orig_str = orig_str; @@ -649,8 +655,16 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags, /* set up the compiled pattern */ cflags = re_flags->cflags; + +#if PG_VERSION_NUM >= 150000 + + /* REG_NOSUB doesn't work well before PostgreSQL 15, so use it only on 15 and later */ + if (!use_subpatterns) cflags |= REG_NOSUB; + +#endif + cpattern = RE_compile_and_cache(pattern, cflags, collation); /* do we want to remember subpatterns? */