Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename LRE_FLAG_UTF16 to LRE_FLAG_UNICODE #186

Merged
merged 1 commit into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 33 additions & 33 deletions libregexp.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ typedef struct {
const uint8_t *buf_end;
const uint8_t *buf_start;
int re_flags;
BOOL is_utf16;
BOOL is_unicode;
BOOL ignore_case;
BOOL dotall;
int capture_count;
Expand Down Expand Up @@ -122,11 +122,11 @@ static int dbuf_insert(DynBuf *s, int pos, int len)
}

/* canonicalize with the specific JS regexp rules */
static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
static uint32_t lre_canonicalize(uint32_t c, BOOL is_unicode)
{
uint32_t res[LRE_CC_RES_LEN_MAX];
int len;
if (is_utf16) {
if (is_unicode) {
if (likely(c < 128)) {
if (c >= 'A' && c <= 'Z')
c = c - 'A' + 'a';
Expand Down Expand Up @@ -751,10 +751,10 @@ static int get_class_atom(REParseState *s, CharRange *cr,
if ((c >= 'a' && c <= 'z') ||
(c >= 'A' && c <= 'Z') ||
(((c >= '0' && c <= '9') || c == '_') &&
inclass && !s->is_utf16)) { /* Annex B.1.4 */
inclass && !s->is_unicode)) { /* Annex B.1.4 */
c &= 0x1f;
p++;
} else if (s->is_utf16) {
} else if (s->is_unicode) {
goto invalid_escape;
} else {
/* otherwise return '\' and 'c' */
Expand All @@ -764,7 +764,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
break;
case 'p':
case 'P':
if (s->is_utf16) {
if (s->is_unicode) {
if (parse_unicode_property(s, cr, &p, (c == 'P')))
return -1;
c = CLASS_RANGE_BASE;
Expand All @@ -773,14 +773,14 @@ static int get_class_atom(REParseState *s, CharRange *cr,
/* fall thru */
default:
p--;
ret = lre_parse_escape(&p, s->is_utf16 * 2);
ret = lre_parse_escape(&p, s->is_unicode * 2);
if (ret >= 0) {
c = ret;
} else {
if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
/* always valid to escape these characters */
goto normal_char;
} else if (s->is_utf16) {
} else if (s->is_unicode) {
invalid_escape:
return re_parse_error(s, "invalid escape sequence in regular expression");
} else {
Expand All @@ -802,7 +802,7 @@ static int get_class_atom(REParseState *s, CharRange *cr,
/* normal char */
if (c >= 128) {
c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
if ((unsigned)c > 0xffff && !s->is_utf16) {
if ((unsigned)c > 0xffff && !s->is_unicode) {
/* XXX: should handle non BMP-1 code points */
return re_parse_error(s, "malformed unicode char");
}
Expand Down Expand Up @@ -878,7 +878,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
if (*p == '-' && p[1] != ']') {
const uint8_t *p0 = p + 1;
if (c1 >= CLASS_RANGE_BASE) {
if (s->is_utf16) {
if (s->is_unicode) {
cr_free(cr1);
goto invalid_class_range;
}
Expand All @@ -890,7 +890,7 @@ static int re_parse_char_class(REParseState *s, const uint8_t **pp)
goto fail;
if (c2 >= CLASS_RANGE_BASE) {
cr_free(cr1);
if (s->is_utf16) {
if (s->is_unicode) {
goto invalid_class_range;
}
/* Annex B: match '-' character */
Expand Down Expand Up @@ -1248,7 +1248,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
re_emit_op(s, REOP_prev);
break;
case '{':
if (s->is_utf16) {
if (s->is_unicode) {
return re_parse_error(s, "syntax error");
} else if (!is_digit(p[1])) {
/* Annex B: we accept '{' not followed by digits as a
Expand Down Expand Up @@ -1300,7 +1300,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
lookahead:
/* Annex B allows lookahead to be used as an atom for
the quantifiers */
if (!s->is_utf16 && !is_backward_lookahead) {
if (!s->is_unicode && !is_backward_lookahead) {
last_atom_start = s->byte_code.size;
last_capture_count = s->capture_count;
}
Expand Down Expand Up @@ -1376,15 +1376,15 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
/* annex B: we tolerate invalid group names in non
unicode mode if there is no named capture
definition */
if (s->is_utf16 || re_has_named_captures(s))
if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "expecting group name");
else
goto parse_class_atom;
}
p1 += 3;
if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
&p1)) {
if (s->is_utf16 || re_has_named_captures(s))
if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "invalid group name");
else
goto parse_class_atom;
Expand All @@ -1395,7 +1395,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
after (inefficient, but hopefully not common */
c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
if (c < 0) {
if (s->is_utf16 || re_has_named_captures(s))
if (s->is_unicode || re_has_named_captures(s))
return re_parse_error(s, "group name not defined");
else
goto parse_class_atom;
Expand All @@ -1407,7 +1407,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
case '0':
p += 2;
c = 0;
if (s->is_utf16) {
if (s->is_unicode) {
if (is_digit(*p)) {
return re_parse_error(s, "invalid decimal escape in regular expression");
}
Expand All @@ -1429,7 +1429,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)

c = parse_digits(&p, FALSE);
if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
if (!s->is_utf16) {
if (!s->is_unicode) {
/* Annex B.1.4: accept legacy octal */
p = q;
if (*p <= '7') {
Expand Down Expand Up @@ -1471,7 +1471,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
break;
case ']':
case '}':
if (s->is_utf16)
if (s->is_unicode)
return re_parse_error(s, "syntax error");
goto parse_class_atom;
default:
Expand All @@ -1493,7 +1493,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
return -1;
} else {
if (s->ignore_case)
c = lre_canonicalize(c, s->is_utf16);
c = lre_canonicalize(c, s->is_unicode);
if (c <= 0x7f)
re_emit_op_u8(s, REOP_char8, c);
else if (c <= 0xffff)
Expand Down Expand Up @@ -1531,7 +1531,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
/* As an extension (see ES6 annex B), we accept '{' not
followed by digits as a normal atom */
if (!is_digit(p[1])) {
if (s->is_utf16)
if (s->is_unicode)
goto invalid_quant_count;
break;
}
Expand All @@ -1550,7 +1550,7 @@ static int re_parse_term(REParseState *s, BOOL is_backward_dir)
quant_max = INT32_MAX; /* infinity */
}
}
if (*p != '}' && !s->is_utf16) {
if (*p != '}' && !s->is_unicode) {
/* Annex B: normal atom if invalid '{' syntax */
p = p1;
break;
Expand Down Expand Up @@ -1839,7 +1839,7 @@ uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
s->buf_end = s->buf_ptr + buf_len;
s->buf_start = s->buf_ptr;
s->re_flags = re_flags;
s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
s->is_unicode = ((re_flags & LRE_FLAG_UNICODE) != 0);
is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
Expand Down Expand Up @@ -2039,7 +2039,7 @@ typedef struct {
int stack_size_max;
BOOL multi_line;
BOOL ignore_case;
BOOL is_utf16;
BOOL is_unicode;
void *opaque; /* used for stack overflow check */

size_t state_size;
Expand Down Expand Up @@ -2189,7 +2189,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match;
GET_CHAR(c, cptr, cbuf_end);
if (s->ignore_case) {
c = lre_canonicalize(c, s->is_utf16);
c = lre_canonicalize(c, s->is_unicode);
}
if (val != c)
goto no_match;
Expand Down Expand Up @@ -2346,8 +2346,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
GET_CHAR(c1, cptr1, cptr1_end);
GET_CHAR(c2, cptr, cbuf_end);
if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_utf16);
c2 = lre_canonicalize(c2, s->is_utf16);
c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode);
}
if (c1 != c2)
goto no_match;
Expand All @@ -2360,8 +2360,8 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
GET_PREV_CHAR(c1, cptr1, cptr1_start);
GET_PREV_CHAR(c2, cptr, s->cbuf);
if (s->ignore_case) {
c1 = lre_canonicalize(c1, s->is_utf16);
c2 = lre_canonicalize(c2, s->is_utf16);
c1 = lre_canonicalize(c1, s->is_unicode);
c2 = lre_canonicalize(c2, s->is_unicode);
}
if (c1 != c2)
goto no_match;
Expand All @@ -2380,7 +2380,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match;
GET_CHAR(c, cptr, cbuf_end);
if (s->ignore_case) {
c = lre_canonicalize(c, s->is_utf16);
c = lre_canonicalize(c, s->is_unicode);
}
idx_min = 0;
low = get_u16(pc + 0 * 4);
Expand Down Expand Up @@ -2420,7 +2420,7 @@ static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
goto no_match;
GET_CHAR(c, cptr, cbuf_end);
if (s->ignore_case) {
c = lre_canonicalize(c, s->is_utf16);
c = lre_canonicalize(c, s->is_unicode);
}
idx_min = 0;
low = get_u32(pc + 0 * 8);
Expand Down Expand Up @@ -2512,13 +2512,13 @@ int lre_exec(uint8_t **capture,
re_flags = bc_buf[RE_HEADER_FLAGS];
s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
s->is_unicode = (re_flags & LRE_FLAG_UNICODE) != 0;
s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
s->cbuf = cbuf;
s->cbuf_end = cbuf + (clen << cbuf_type);
s->cbuf_type = cbuf_type;
if (s->cbuf_type == 1 && s->is_utf16)
if (s->cbuf_type == 1 && s->is_unicode)
s->cbuf_type = 2;
s->opaque = opaque;

Expand Down
2 changes: 1 addition & 1 deletion libregexp.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
#define LRE_FLAG_IGNORECASE (1 << 1)
#define LRE_FLAG_MULTILINE (1 << 2)
#define LRE_FLAG_DOTALL (1 << 3)
#define LRE_FLAG_UTF16 (1 << 4)
#define LRE_FLAG_UNICODE (1 << 4)
#define LRE_FLAG_STICKY (1 << 5)
#define LRE_FLAG_INDICES (1 << 6) /* Unused by libregexp, just recorded. */

Expand Down
8 changes: 4 additions & 4 deletions quickjs.c
Original file line number Diff line number Diff line change
Expand Up @@ -40450,7 +40450,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
mask = LRE_FLAG_DOTALL;
break;
case 'u':
mask = LRE_FLAG_UTF16;
mask = LRE_FLAG_UNICODE;
break;
case 'y':
mask = LRE_FLAG_STICKY;
Expand All @@ -40468,7 +40468,7 @@ static JSValue js_compile_regexp(JSContext *ctx, JSValueConst pattern,
JS_FreeCString(ctx, str);
}

str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UTF16));
str = JS_ToCStringLen2(ctx, &len, pattern, !(re_flags & LRE_FLAG_UNICODE));
if (!str)
return JS_EXCEPTION;
re_bytecode_buf = lre_compile(&re_bytecode_len, error_msg,
Expand Down Expand Up @@ -41113,7 +41113,7 @@ static JSValue JS_RegExpDelete(JSContext *ctx, JSValueConst this_val, JSValueCon
break;
}
if (end == start) {
if (!(re_flags & LRE_FLAG_UTF16) || (unsigned)end >= str->len || !str->is_wide_char) {
if (!(re_flags & LRE_FLAG_UNICODE) || (unsigned)end >= str->len || !str->is_wide_char) {
end++;
} else {
string_getc(str, &end);
Expand Down Expand Up @@ -41873,7 +41873,7 @@ static const JSCFunctionListEntry js_regexp_proto_funcs[] = {
JS_CGETSET_MAGIC_DEF("ignoreCase", js_regexp_get_flag, NULL, LRE_FLAG_IGNORECASE ),
JS_CGETSET_MAGIC_DEF("multiline", js_regexp_get_flag, NULL, LRE_FLAG_MULTILINE ),
JS_CGETSET_MAGIC_DEF("dotAll", js_regexp_get_flag, NULL, LRE_FLAG_DOTALL ),
JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UTF16 ),
JS_CGETSET_MAGIC_DEF("unicode", js_regexp_get_flag, NULL, LRE_FLAG_UNICODE ),
JS_CGETSET_MAGIC_DEF("sticky", js_regexp_get_flag, NULL, LRE_FLAG_STICKY ),
JS_CGETSET_MAGIC_DEF("hasIndices", js_regexp_get_flag, NULL, LRE_FLAG_INDICES ),
JS_CFUNC_DEF("exec", 1, js_regexp_exec ),
Expand Down