Skip to content

Commit

Permalink
Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due to missing new GB9c rule implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
Matthew Barnett committed Jun 22, 2024
1 parent 8eabb42 commit 6d086ff
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 0 deletions.
4 changes: 4 additions & 0 deletions changelog.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Version: 2024.6.22

Git issue 535: Regex fails Unicode 15.1 GraphemeBreakTest due to missing new GB9c rule implementation

Version: 2024.5.15

Git issue 530: hangs with fuzzy and optionals
Expand Down
33 changes: 33 additions & 0 deletions regex_3/_regex.c
Original file line number Diff line number Diff line change
Expand Up @@ -1803,6 +1803,7 @@ static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos)
Py_UCS4 right_char;
RE_UINT32 left_prop;
RE_UINT32 right_prop;
RE_UINT32 prop;
Py_ssize_t pos;

/* Break at the start and end of text, unless the text is empty. */
Expand Down Expand Up @@ -1873,6 +1874,38 @@ static BOOL unicode_at_grapheme_boundary(RE_State* state, Py_ssize_t text_pos)
if (left_prop == RE_GBREAK_PREPEND)
return FALSE;

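/* The GB9c rule only applies to extended grapheme clusters: do not break
 * before a character with Indic_Conjunct_Break (InCB)=Consonant when,
 * scanning backwards, it is preceded by a run of InCB=Extend/Linker
 * characters containing at least one InCB=Linker, which is in turn preceded
 * by an InCB=Consonant.
 */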
/* GB9c */
if (re_get_indic_conjunct_break(right_char) == RE_INCB_CONSONANT) {
BOOL has_linker;

has_linker = FALSE;
pos = left_pos;

do {
prop = re_get_indic_conjunct_break(char_at(state->text, pos));

switch (prop) {
case RE_INCB_LINKER:
has_linker = TRUE;
break;
case RE_INCB_EXTEND:
break;
case RE_INCB_CONSONANT:
if (has_linker)
return FALSE;
goto end_GB9c;
default:
goto end_GB9c;
}

--pos;
} while (pos >= state->text_start);
}

end_GB9c:
/* Do not break within emoji modifier sequences or emoji zwj sequences. */
/* GB11 */
if (left_prop == RE_GBREAK_ZWJ && re_get_extended_pictographic(right_char))
Expand Down
5 changes: 5 additions & 0 deletions regex_3/_regex_unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ typedef RE_UINT32 (*RE_GetPropertyFunc)(RE_UINT32 codepoint);
#define RE_LBREAK_REGIONALINDICATOR 46
#define RE_LBREAK_EMODIFIER 47

/* Values of the Indic_Conjunct_Break (InCB) property, as returned by
 * re_get_indic_conjunct_break() and tested by the GB9c grapheme-break rule.
 */
#define RE_INCB_NONE 0
#define RE_INCB_EXTEND 1
#define RE_INCB_CONSONANT 2
#define RE_INCB_LINKER 3

extern char* re_strings[1506];
extern RE_Property re_properties[183];
extern RE_PropertyValue re_property_values[1651];
Expand Down
10 changes: 10 additions & 0 deletions tools/build_regex_unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -1733,6 +1733,16 @@ def make_key(names):

h_file.write('\n')

val_list = unique(properties[munge('Indic_Conjunct_Break')]['values'].values(),
key=id)
values = [(value['id'], value['names'][0]) for value in val_list]

for val_id, name in sorted(values):
h_file.write('#define RE_INCB_{} {}\n'.format(munge(name),
val_id))

h_file.write('\n')

h_file.write('extern char* re_strings[{}];\n'.format(unicode_data['string_count']))
h_file.write('extern RE_Property re_properties[{}];\n'.format(unicode_data['property_table_count']))
h_file.write('extern RE_PropertyValue re_property_values[{}];\n'.format(unicode_data['valueset_table_count']))
Expand Down

0 comments on commit 6d086ff

Please sign in to comment.