From 17dfcb85c7f2c3a90526c9221aa6b98661b7135f Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sat, 13 Nov 2021 22:15:37 -0500 Subject: [PATCH 1/3] Extend RegexCharClass.Canonicalize range inversion optimization There's a simple optimization in RegexCharClass.Canonicalize that was added in .NET 5, with the goal of taking a set that's made up of exactly two ranges and seeing whether those ranges were leaving out exactly one character. If they were, the set can instead be rewritten as that character negated, which is a normalized form used downstream and optimized. We can extend this normalization ever so slightly to be for two ranges separated not just by a single character but by more than that as well. --- .../Text/RegularExpressions/RegexCharClass.cs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 1c2cf0ff65817..1e8bd47647979 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1390,23 +1390,24 @@ private void Canonicalize(bool isNonBacktracking) rangelist.RemoveRange(j, rangelist.Count - j); } - // If the class now represents a single negated character, but does so by including every - // other character, invert it to produce a normalized form recognized by IsSingletonInverse. - if (!isNonBacktracking && // do not produce the IsSingletonInverse transformation in NonBacktracking mode + // If the class now represents a single negated range, but does so by including every + // other character, invert it to produce a normalized form with a single range. This + // is valuable for subsequent optimizations in most of the engines. 
+ if (!isNonBacktracking && // TODO: Why is NonBacktracking special-cased? !_negate && _subtractor is null && (_categories is null || _categories.Length == 0)) { if (rangelist.Count == 2) { - // There are two ranges in the list. See if there's one missing element between them. + // There are two ranges in the list. See if there's one missing range between them. + // Such a range might be as small as a single character. if (rangelist[0].First == 0 && - rangelist[0].Last == (char)(rangelist[1].First - 2) && - rangelist[1].Last == LastChar) + rangelist[1].Last == LastChar && + rangelist[0].Last < rangelist[1].First - 1) { - char ch = (char)(rangelist[0].Last + 1); + rangelist[0] = new SingleRange((char)(rangelist[0].Last + 1), (char)(rangelist[1].First - 1)); rangelist.RemoveAt(1); - rangelist[0] = new SingleRange(ch, ch); _negate = true; } } From 585a7a094acedff89d0b9b242719c79f11f35f3f Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Sun, 14 Nov 2021 20:21:27 -0500 Subject: [PATCH 2/3] Update TODO comment --- .../src/System/Text/RegularExpressions/RegexCharClass.cs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs index 1e8bd47647979..96ccdd289871b 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs @@ -1393,7 +1393,12 @@ private void Canonicalize(bool isNonBacktracking) // If the class now represents a single negated range, but does so by including every // other character, invert it to produce a normalized form with a single range. This // is valuable for subsequent optimizations in most of the engines. - if (!isNonBacktracking && // TODO: Why is NonBacktracking special-cased? 
+ // TODO: https://github.com/dotnet/runtime/issues/61048. The special-casing for NonBacktracking + // can be deleted once this issue is addressed. The special-casing exists because NonBacktracking + // is on a different casing plan than the other engines and doesn't use ToLower on each input + // character at match time; this in turn can highlight differences between sets and their inverted + // versions of themselves, e.g. a difference between [0-AC-\uFFFF] and [^B]. + if (!isNonBacktracking && !_negate && _subtractor is null && (_categories is null || _categories.Length == 0)) From 0199b5abcd9652dbbbf2c8b4303f88488cc48e52 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 15 Nov 2021 16:34:48 -0500 Subject: [PATCH 3/3] Add some more reduction tests --- .../tests/RegexReductionTests.cs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs index ad5ca8d0754d9..810cbe31f043d 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexReductionTests.cs @@ -324,6 +324,13 @@ private static int GetMinRequiredLength(Regex r) [InlineData("[^\n]*", ".*")] [InlineData("(?>[^\n]*)", "(?>.*)")] [InlineData("[^\n]*?", ".*?")] + // Set reduction + [InlineData("[\u0001-\uFFFF]", "[^\u0000]")] + [InlineData("[\u0000-\uFFFE]", "[^\uFFFF]")] + [InlineData("[\u0000-AB-\uFFFF]", "[\u0000-\uFFFF]")] + [InlineData("[ABC-EG-J]", "[A-EG-J]")] + [InlineData("[\u0000-AC-\uFFFF]", "[^B]")] + [InlineData("[\u0000-AF-\uFFFF]", "[^B-E]")] // Large loop patterns [InlineData("a*a*a*a*a*a*a*b*b*?a+a*", "a*b*b*?a+")] [InlineData("a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "a{0,30}aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")]