Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add backtracking loops, backreferences, and if-then-else constructs to Regex "simplified" code gen #61906

Merged
merged 9 commits into from
Nov 23, 2021

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ private void MakeLoopAtomic()
[Conditional("DEBUG")]
private void ValidateFinalTreeInvariants()
{
Debug.Assert(Type == Capture, "Every generated tree should begin with a capture node");

var toExamine = new Stack<RegexNode>();
toExamine.Push(this);
while (toExamine.Count > 0)
Expand Down Expand Up @@ -306,8 +308,11 @@ private void ValidateFinalTreeInvariants()
break;

case Testref:
Debug.Assert(childCount is 1 or 2, $"Expected one or two children for {node.TypeName}, got {childCount}");
break;

case Testgroup:
Debug.Assert(childCount >= 1, $"Expected at least one child for {node.TypeName}, got {childCount}.");
Debug.Assert(childCount is 2 or 3, $"Expected two or three children for {node.TypeName}, got {childCount}");
break;

case Concatenate:
Expand Down Expand Up @@ -2229,9 +2234,18 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
case Empty:
case Nothing:
case UpdateBumpalong:
// Backreferences are supported
case Ref:
supported = true;
break;

// Conditional backreference tests are also supported, so long as both their yes/no branches are supported.
case Testref:
supported =
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
(childCount == 1 || Child(1).SupportsSimplifiedCodeGenerationImplementation());
break;

// Single character greedy/lazy loops are supported if either they're actually a repeater
// or they're not contained in any construct other than simple nesting (e.g. concat, capture).
case Oneloop:
Expand All @@ -2244,17 +2258,10 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
supported = M == N || AncestorsAllowBacktracking(Next);
break;

// Loop repeaters are the same, except their child also needs to be supported.
// We also support such loops being atomic.
// For greedy and lazy loops, they're supported if the node they wrap is supported
// and either the node is actually a repeater, is atomic, or is in the tree in a
// location where backtracking is allowed.
case Loop:
supported =
(M == N || (Next != null && Next.Type == Atomic)) &&
Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// Similarly, as long as the wrapped node supports simplified code gen,
// Lazy is supported if it's a repeater or atomic, but also if it's in
// a place where backtracking is allowed (e.g. it's top-level).
case Lazyloop:
supported =
(M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) &&
Expand Down Expand Up @@ -2297,11 +2304,10 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
break;

case Capture:
// Currently we only support capnums without uncapnums (for balancing groups)
supported = N == -1;
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
if (supported)
{
// And we only support them in certain places in the tree.
// Captures are currently only supported in certain places in the tree.
RegexNode? parent = Next;
while (parent != null)
{
Expand All @@ -2322,25 +2328,31 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
}
}

// If we've found a supported capture, mark all of the nodes in its parent
// hierarchy as containing a capture.
if (supported)
{
// And we only support them if their children are supported.
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();

// If we've found a supported capture, mark all of the nodes in its parent
// hierarchy as containing a capture.
if (supported)
parent = this;
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
{
parent = this;
while (parent != null && ((parent.Options & HasCapturesFlag) == 0))
{
parent.Options |= HasCapturesFlag;
parent = parent.Next;
}
parent.Options |= HasCapturesFlag;
parent = parent.Next;
}
}
}
break;

case Testgroup:
supported =
Child(0).SupportsSimplifiedCodeGenerationImplementation() &&
Child(1).SupportsSimplifiedCodeGenerationImplementation() &&
(childCount == 2 || Child(2).SupportsSimplifiedCodeGenerationImplementation());
break;

default:
Debug.Fail($"Unknown type: {Type}");
supported = false;
break;
}
}
#if DEBUG
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2071,6 +2071,17 @@ private bool IsCaptureSlot(int i)
return i >= 0 && i < _capsize;
}

/// <summary>
/// Translates a capture number into the index used for it in the array of capture slots.
/// When a regex uses a sparse set of capture numbers, they are hashed to a dense set of
/// indices; that lookup is performed here, at compile time, rather than at match time.
/// </summary>
/// <param name="capnum">The capture number to map. A value of -1 is passed through unchanged.</param>
/// <param name="caps">The table mapping sparse capture numbers to dense indices, or null when no remapping applies.</param>
/// <returns>
/// <paramref name="capnum"/> itself when it is -1 or when <paramref name="caps"/> is null;
/// otherwise the integer stored in <paramref name="caps"/> under <paramref name="capnum"/>.
/// </returns>
internal static int MapCaptureNumber(int capnum, Hashtable? caps)
{
    // Both the -1 sentinel and the "no table" case return the input as-is.
    if (capnum == -1 || caps == null)
    {
        return capnum;
    }

    // The null-forgiving operator assumes capnum is a key in the table.
    return (int)caps[capnum]!;
}

/// <summary>Determines whether the given name is a key in the capture-name table.</summary>
/// <param name="capname">The capture name to check.</param>
/// <returns>true if the capture-name table exists and contains <paramref name="capname"/>; otherwise, false.</returns>
private bool IsCaptureName(string capname)
{
    // With no table there are no named captures to match against.
    if (_capnames == null)
    {
        return false;
    }

    return _capnames.ContainsKey(capname);
}

Expand Down Expand Up @@ -2171,7 +2182,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement)
_concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? _options & ~RegexOptions.IgnoreCase : _options, _culture));
break;

case > 1 when !UseOptionI() || isReplacement:
case > 1 when !UseOptionI() || isReplacement || !RegexCharClass.ParticipatesInCaseConversion(_pattern.AsSpan(pos, cch)):
_concatenation!.AddChild(new RegexNode(RegexNode.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch)));
break;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -214,17 +214,6 @@ private int StringCode(string str)
return i;
}

/// <summary>
/// Maps a capture number to its index in the array of capture slots.
/// A regex using a sparse set of capture slots has them hashed to a dense
/// set of indices; the hash is resolved here, at compile time, instead of
/// at match time.
/// </summary>
/// <param name="capnum">The capture number to map. A value of -1 is returned unchanged.</param>
/// <returns>
/// -1 when <paramref name="capnum"/> is -1; the entry stored in the capture table when
/// one exists; otherwise <paramref name="capnum"/> itself.
/// </returns>
private int MapCapnum(int capnum)
{
    if (capnum == -1)
    {
        return -1;
    }

    if (_caps == null)
    {
        // No remapping table: capture numbers are used directly.
        return capnum;
    }

    // The null-forgiving operator assumes capnum is a key in the table.
    return (int)_caps[capnum]!;
}

/// <summary>
/// The main RegexCode generator. It does a depth-first walk
through the tree and calls EmitFragment to emit code before
Expand Down Expand Up @@ -283,7 +272,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
Emit(RegexCode.Setjump);
_intStack.Append(_emitted.Length);
Emit(RegexCode.Lazybranch, 0);
Emit(RegexCode.Testref, MapCapnum(node.M));
Emit(RegexCode.Testref, RegexParser.MapCaptureNumber(node.M, _caps));
Emit(RegexCode.Forejump);
break;
}
Expand Down Expand Up @@ -391,7 +380,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
break;

case RegexNode.Capture | AfterChild:
Emit(RegexCode.Capturemark, MapCapnum(node.M), MapCapnum(node.N));
Emit(RegexCode.Capturemark, RegexParser.MapCaptureNumber(node.M, _caps), RegexParser.MapCaptureNumber(node.N, _caps));
break;

case RegexNode.Require | BeforeChild:
Expand Down Expand Up @@ -471,7 +460,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex)
break;

case RegexNode.Ref:
Emit(node.Type | bits, MapCapnum(node.M));
Emit(node.Type | bits, RegexParser.MapCaptureNumber(node.M, _caps));
break;

case RegexNode.Nothing:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ public static IEnumerable<object[]> ValidateRegex_MemberData()

(string Pattern, RegexOptions Options, string Input, string Expected) testCase = allEngineCases[i];
yield return new object[] { engine, testCase.Pattern, testCase.Options, results[i], testCase.Input, expected };
yield return new object[] { engine, testCase.Pattern, testCase.Options | RegexOptions.CultureInvariant, results[i], testCase.Input, expected };
if ((testCase.Options & RegexOptions.IgnoreCase) != 0)
danmoseley marked this conversation as resolved.
Show resolved Hide resolved
{
yield return new object[] { engine, testCase.Pattern, testCase.Options | RegexOptions.CultureInvariant, results[i], testCase.Input, expected };
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,15 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } };
yield return new object[] { engine, null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } };

// Captures inside varying constructs with backtracking needing to uncapture
yield return new object[] { engine, null, @"a(bc)d|abc(e)", "abce", RegexOptions.None, new string[] { "abce", "", "e" } }; // alternation
yield return new object[] { engine, null, @"((ab){2}cd)*", "ababcdababcdababc", RegexOptions.None, new string[] { "ababcdababcd", "ababcd", "ab" } }; // loop
yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "aba", RegexOptions.None, new string[] { "a", "", "" } }; // positive lookahead in a loop
yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "ababa", RegexOptions.None, new string[] { "aba", "ab", "a" } }; // positive lookahead in a loop
yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "abababa", RegexOptions.None, new string[] { "ababa", "ab", "a" } }; // positive lookahead in a loop
yield return new object[] { engine, null, @"\w\w(?!(\d)\d)", "aa..", RegexOptions.None, new string[] { "aa", "" } }; // negative lookahead
yield return new object[] { engine, null, @"\w\w(?!(\d)\d)", "aa.3", RegexOptions.None, new string[] { "aa", "" } }; // negative lookahead

// Quantifiers
yield return new object[] { engine, null, @"a*", "", RegexOptions.None, new string[] { "" } };
yield return new object[] { engine, null, @"a*", "a", RegexOptions.None, new string[] { "a" } };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -504,8 +504,6 @@ public async Task Docs_GroupingConstructs_NonbacktrackingSubexpressions(RegexEng

Regex rBack = await RegexHelpers.GetRegexAsync(engine, @"(\w)\1+.\b");
Regex rNoBack = await RegexHelpers.GetRegexAsync(engine, @"(?>(\w)\1+).\b");
string[] inputs = { "aaad", "aaaa" };

Match back, noback;

back = rBack.Match("cccd.");
Expand Down Expand Up @@ -1117,6 +1115,95 @@ public async Task Docs_Anchors_ContiguousMatches(RegexEngine engine)
Regex.Replace(Input, Pattern, m => string.Concat(m.Value.Reverse())));
}

//
// Based on examples from https://blog.stevenlevithan.com/archives/balancing-groups
//

/// <summary>
/// Checks that the balancing-group pattern matches each palindrome input
/// and rejects each non-palindrome input.
/// </summary>
[Theory]
[MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
public async Task Blog_Levithan_BalancingGroups_Palindromes(RegexEngine engine)
{
    // Balancing groups are not supported by the non-backtracking engine, so there is nothing to check.
    if (RegexHelpers.IsNonBacktracking(engine))
    {
        return;
    }

    const string Pattern = @"(?<N>.)+.?(?<-N>\k<N>)+(?(N)(?!))";

    string[] palindromes =
    {
        "kayak",
        "racecar",
        "never odd or even",
        "madam im adam"
    };

    string[] nonPalindromes =
    {
        "canoe",
        "raceboat"
    };

    Regex regex = await RegexHelpers.GetRegexAsync(engine, Pattern);

    Assert.All(palindromes, input => Assert.True(regex.IsMatch(input)));
    Assert.All(nonPalindromes, input => Assert.False(regex.IsMatch(input)));
}

/// <summary>
/// Checks that the balancing-group pattern accepts inputs wrapped in properly
/// nested parentheses and rejects inputs whose parentheses do not balance.
/// </summary>
[Theory]
[MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
public async Task Blog_Levithan_BalancingGroups_MatchingParentheses(RegexEngine engine)
{
    // Balancing groups are not supported by the non-backtracking engine, so there is nothing to check.
    if (RegexHelpers.IsNonBacktracking(engine))
    {
        return;
    }

    // Compiled with IgnorePatternWhitespace, so the line breaks and spaces below are layout only.
    const string Pattern = @"^\(
(?>
[^()]+
|
\( (?<Depth>)
|
\) (?<-Depth>)
)*
(?(Depth)(?!))
\)$";

    Regex regex = await RegexHelpers.GetRegexAsync(engine, Pattern, RegexOptions.IgnorePatternWhitespace);

    // Balanced inputs.
    Assert.True(regex.IsMatch("()"));
    Assert.True(regex.IsMatch("(a(b c(de(f(g)hijkl))mn))"));

    // Unbalanced inputs.
    Assert.False(regex.IsMatch("("));
    Assert.False(regex.IsMatch(")"));
    Assert.False(regex.IsMatch("())"));
    Assert.False(regex.IsMatch("(()"));
    Assert.False(regex.IsMatch("(ab(cd)ef"));
}

/// <summary>
/// Checks the balancing-group pattern against one input it is expected to
/// match and one it is expected to reject.
/// </summary>
[Theory]
[MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))]
public async Task Blog_Levithan_BalancingGroups_WordLengthIncreases(RegexEngine engine)
{
    // Balancing groups are not supported by the non-backtracking engine, so there is nothing to check.
    if (RegexHelpers.IsNonBacktracking(engine))
    {
        return;
    }

    // Compiled with IgnorePatternWhitespace, so the line breaks and spaces below are layout only.
    const string Pattern = @"^(?:
(?(A)\s|)
(?<B>)
(?<C-B>\w)+ (?(B)(?!))
(?:
\s
(?<C>)
(?<B-C>\w)+ (?(C)(?!))
(?<A>)
)?
)+ \b$";

    Regex regex = await RegexHelpers.GetRegexAsync(engine, Pattern, RegexOptions.IgnorePatternWhitespace);

    Assert.True(regex.IsMatch("a bc def ghij klmni"));
    Assert.False(regex.IsMatch("a bc def ghi klmn"));
}

//
// These patterns come from real-world customer usages
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,8 @@ internal static async Task<Regex[]> SourceGenRegexAsync(
if (generatorResults.Diagnostics.Length != 0)
{
throw new ArgumentException(
string.Join(Environment.NewLine, generatorResults.Diagnostics) + Environment.NewLine +
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))));
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine +
string.Join(Environment.NewLine, generatorResults.Diagnostics));
}

// Compile the assembly to a stream
Expand All @@ -122,8 +122,8 @@ internal static async Task<Regex[]> SourceGenRegexAsync(
if (!results.Success || results.Diagnostics.Length != 0)
{
throw new ArgumentException(
string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorResults.Diagnostics)) + Environment.NewLine +
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))));
string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine +
string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorResults.Diagnostics)));
}
dll.Position = 0;

Expand Down