Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve parsing of bracketed expressions #109

Merged
merged 3 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
238 changes: 118 additions & 120 deletions lib/tre-parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -259,166 +259,164 @@ tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
int *items_size)
{
const tre_char_t *re = ctx->re;
reg_errcode_t status = REG_OK;
reg_errcode_t status;
tre_ctype_t class = (tre_ctype_t)0;
tre_cint_t min = 0, max = 0;
int i = *num_items;
int max_i = *items_size;
int skip;

/* Build an array of the items in the bracket expression. */
while (status == REG_OK)
for (;;)
{
skip = 0;
if (re == ctx->re_end)
{
status = REG_EBRACK;
return REG_EBRACK;
}
else if (*re == CHAR_RBRACKET && re > ctx->re)
if (*re == CHAR_RBRACKET && re > ctx->re)
{
DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n", REST(re)));
re++;
break;
}
else
class = (tre_ctype_t)0;
if (re + 2 < ctx->re_end
&& *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
{
tre_cint_t min = 0, max = 0;

class = (tre_ctype_t)0;
if (re + 2 < ctx->re_end
&& *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
{
DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
min = *re;
max = *(re + 2);
re += 3;
/* XXX - Should use collation order instead of encoding values
in character ranges. */
if (min > max)
status = REG_ERANGE;
}
else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
status = REG_ECOLLATE;
else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
status = REG_ECOLLATE;
else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n", REST(re)));
min = *re;
max = *(re + 2);
re += 3;
/* XXX - Should use collation order instead of encoding values
in character ranges. */
if (min > max)
return REG_ERANGE;
}
else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
return REG_ECOLLATE;
else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
return REG_ECOLLATE;
else if (re + 1 < ctx->re_end
&& *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
{
char tmp_str[64];
const tre_char_t *endptr = re + 2;
size_t len;
DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re)));
while (endptr < ctx->re_end && *endptr != CHAR_COLON)
endptr++;
if (endptr != ctx->re_end)
{
char tmp_str[64];
const tre_char_t *endptr = re + 2;
size_t len;
DPRINT(("tre_parse_bracket: class: '%.*" STRF "'\n", REST(re)));
while (endptr < ctx->re_end && *endptr != CHAR_COLON)
endptr++;
if (endptr != ctx->re_end)
{
len = MIN(endptr - re - 2, 63);
len = MIN(endptr - re - 2, 63);
#ifdef TRE_WCHAR
{
tre_char_t tmp_wcs[64];
wcsncpy(tmp_wcs, re + 2, len);
tmp_wcs[len] = L'\0';
{
tre_char_t tmp_wcs[64];
wcsncpy(tmp_wcs, re + 2, len);
tmp_wcs[len] = L'\0';
#if defined HAVE_WCSRTOMBS
{
mbstate_t state;
const tre_char_t *src = tmp_wcs;
memset(&state, '\0', sizeof(state));
len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
}
{
mbstate_t state;
const tre_char_t *src = tmp_wcs;
memset(&state, '\0', sizeof(state));
len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
}
#elif defined HAVE_WCSTOMBS
len = wcstombs(tmp_str, tmp_wcs, 63);
len = wcstombs(tmp_str, tmp_wcs, 63);
#endif /* defined HAVE_WCSTOMBS */
}
if (len == (size_t)-1)
return REG_ECTYPE;
}
#else /* !TRE_WCHAR */
strncpy(tmp_str, (const char*)re + 2, len);
strncpy(tmp_str, (const char*)re + 2, len);
#endif /* !TRE_WCHAR */
tmp_str[len] = '\0';
DPRINT((" class name: %s\n", tmp_str));
class = tre_ctype(tmp_str);
if (!class)
status = REG_ECTYPE;
/* Optimize character classes for 8 bit character sets. */
if (status == REG_OK && ctx->mb_cur_max == 1)
{
status = tre_expand_ctype(ctx->mem, class, items,
&i, &max_i, ctx->cflags);
class = (tre_ctype_t)0;
skip = 1;
}
re = endptr + 2;
tmp_str[len] = '\0';
DPRINT((" class name: %s\n", tmp_str));
class = tre_ctype(tmp_str);
if (!class)
return REG_ECTYPE;
/* Optimize character classes for 8 bit character sets. */
if (ctx->mb_cur_max == 1)
{
status = tre_expand_ctype(ctx->mem, class, items,
&i, &max_i, ctx->cflags);
if (status != REG_OK)
return status;
class = (tre_ctype_t)0;
skip = 1;
}
else
status = REG_ECTYPE;
min = 0;
max = TRE_CHAR_MAX;
re = endptr + 2;
}
else
{
DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
&& ctx->re != re)
/* Two ranges are not allowed to share an endpoint. */
status = REG_ERANGE;
min = max = *re++;
}
return REG_ECTYPE;
min = 0;
max = TRE_CHAR_MAX;
}
else
{
DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n", REST(re)));
if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
&& ctx->re != re)
/* Two ranges are not allowed to share an endpoint. */
return REG_ERANGE;
min = max = *re++;
}

if (class && negate)
if (*num_neg_classes >= MAX_NEG_CLASSES)
return REG_ESPACE;
else
neg_classes[(*num_neg_classes)++] = class;
else if (!skip)
{
status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
if (status != REG_OK)
break;
return status;
((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
}

if (class && negate)
if (*num_neg_classes >= MAX_NEG_CLASSES)
status = REG_ESPACE;
else
neg_classes[(*num_neg_classes)++] = class;
else if (!skip)
{
status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
if (status != REG_OK)
break;
((tre_literal_t*)((*items)[i-1])->obj)->u.class = class;
}
/* Add opposite-case counterpoints if REG_ICASE is present.
This is broken if there are more than two "same" characters. */
if (ctx->cflags & REG_ICASE && !class && !skip)
{
tre_cint_t cmin, ccurr;

/* Add opposite-case counterpoints if REG_ICASE is present.
This is broken if there are more than two "same" characters. */
if (ctx->cflags & REG_ICASE && !class && status == REG_OK && !skip)
DPRINT(("adding opposite-case counterpoints\n"));
while (min <= max)
{
tre_cint_t cmin, ccurr;

DPRINT(("adding opposite-case counterpoints\n"));
while (min <= max)
if (tre_islower(min))
{
if (tre_islower(min))
{
cmin = ccurr = tre_toupper(min++);
while (tre_islower(min) && tre_toupper(min) == ccurr + 1
&& min <= max)
ccurr = tre_toupper(min++);
status = tre_new_item(ctx->mem, cmin, ccurr,
&i, &max_i, items);
}
else if (tre_isupper(min))
{
cmin = ccurr = tre_tolower(min++);
while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
&& min <= max)
ccurr = tre_tolower(min++);
status = tre_new_item(ctx->mem, cmin, ccurr,
&i, &max_i, items);
}
else min++;
cmin = ccurr = tre_toupper(min++);
while (tre_islower(min) && tre_toupper(min) == ccurr + 1
&& min <= max)
ccurr = tre_toupper(min++);
status = tre_new_item(ctx->mem, cmin, ccurr,
&i, &max_i, items);
if (status != REG_OK)
break;
return status;
}
if (status != REG_OK)
break;
else if (tre_isupper(min))
{
cmin = ccurr = tre_tolower(min++);
while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
&& min <= max)
ccurr = tre_tolower(min++);
status = tre_new_item(ctx->mem, cmin, ccurr,
&i, &max_i, items);
if (status != REG_OK)
return status;
}
else
min++;
}
}
}
*num_items = i;
*items_size = max_i;
ctx->re = re;
return status;
return REG_OK;
}

static reg_errcode_t
Expand Down Expand Up @@ -1494,7 +1492,7 @@ tre_parse(tre_parse_ctx_t *ctx)
/* Wide char. */
char tmp[9]; /* max 8 hex digits + terminator */
long val;
int i = 0;
size_t i = 0;
ctx->re++;
while (ctx->re_end - ctx->re >= 0)
{
Expand Down
1 change: 1 addition & 0 deletions tests/retest.c
Original file line number Diff line number Diff line change
Expand Up @@ -1105,6 +1105,7 @@ main(int argc, char **argv)
test_comp("[[:xdigit:]]+", REG_EXTENDED, 0);
test_exec("-0123456789ABCDEFabcdef", 0, REG_OK, 1, 23, END);
test_comp("[[:bogus-character-class-name:]", REG_EXTENDED, REG_ECTYPE);
test_comp("[[:\xff:", REG_EXTENDED, REG_ECTYPE);


/* Range expressions (assuming that the C locale is being used). */
Expand Down
Loading