From fc488523a94ed39cc15983e7910614c1b41084d7 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 22 Nov 2021 22:51:55 -0500 Subject: [PATCH] Add backtracking loops, backreferences, and if-then-else constructs to Regex "simplified" code gen (#61906) * Add a few tests for captures inside various constructs * Slightly optimize creation of multis that don't participate in case conversion * Add simple codegen support for backtracking Loops * Add simple codegen support for backreferences * Add simple codegen support for if-then-else backreference conditionals * Add simple codegen support for if-then-else expression conditionals * Flip default on MarkLabel emitting semicolon Labels need to be followed by something other than a closing brace. Previously I was trying to opt labels in to emitting a semicolon, but it's too error-prone. Instead, we now by default emit a semicolon, and only skip it from call sites that opt out because it's obvious they'll always be followed by code. * Add simple codegen support for balancing groups * Address PR feedback --- .../gen/RegexGenerator.Emitter.cs | 408 ++++++++++++++--- .../Text/RegularExpressions/RegexCompiler.cs | 419 +++++++++++++++++- .../Text/RegularExpressions/RegexNode.cs | 64 +-- .../Text/RegularExpressions/RegexParser.cs | 13 +- .../Text/RegularExpressions/RegexWriter.cs | 17 +- .../tests/MonoRegexTests.cs | 5 +- .../tests/Regex.Groups.Tests.cs | 9 + .../tests/Regex.KnownPattern.Tests.cs | 91 +++- .../tests/RegexGeneratorHelper.netcoreapp.cs | 8 +- 9 files changed, 896 insertions(+), 138 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 7c588f0c3dbd2..44493b4b59108 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -139,11 +139,6 @@ private static void
EmitRegexMethod(IndentedTextWriter writer, RegexMethod rm, s writer.WriteLine(); writer.WriteLine($" private {id}()"); writer.WriteLine($" {{"); -#if DEBUG - writer.WriteLine(" /*"); - writer.WriteLine($"{rm.Code.Tree.ToString().Replace("*/", @"* /")}"); - writer.WriteLine(" */"); -#endif writer.WriteLine($" base.pattern = {patternExpression};"); writer.WriteLine($" base.roptions = {optionsExpression};"); writer.WriteLine($" base.internalMatchTimeout = {timeoutExpression};"); @@ -186,6 +181,16 @@ private static void EmitRegexMethod(IndentedTextWriter writer, RegexMethod rm, s // Main implementation methods writer.WriteLine($" protected override void InitTrackCount() => base.runtrackcount = {rm.Code.TrackCount};"); writer.WriteLine(); +#if DEBUG + writer.WriteLine(" // Node tree:"); + var treeLineReader = new StringReader(rm.Code.Tree.ToString()); + string? treeLine = null; + while ((treeLine = treeLineReader.ReadLine()) != null) + { + writer.WriteLine($" // {treeLine}"); + } + writer.WriteLine(); +#endif writer.WriteLine($" protected override bool FindFirstChar()"); writer.WriteLine($" {{"); writer.Indent += 4; @@ -691,8 +696,17 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, bool rtl = code.RightToLeft; bool hasTimeout = false; - int localCounter = 0; - string NextLocalName(string prefix) => $"{prefix}{localCounter++}"; + // Helper to define names. Names start unadorned, but as soon as there's repetition, + // they begin to have a numbered suffix. + var usedNames = new Dictionary(); + string ReserveName(string prefix) + { + usedNames.TryGetValue(prefix, out int count); + usedNames[prefix] = count + 1; + return count == 0 ? prefix : $"{prefix}{count}"; + } + + void MarkLabel(string label, bool emitSemicolon = true) => writer.WriteLine($"{label}:{(emitSemicolon ? 
";" : "")}"); RegexNode node = rm.Code.Tree.Root; Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node"); @@ -731,6 +745,7 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, writer.WriteLine("int runtextpos = base.runtextpos;"); writer.WriteLine("int runtextend = base.runtextend;"); writer.WriteLine("int originalruntextpos = runtextpos;"); + writer.WriteLine("int runstackpos = 0;"); writer.WriteLine("global::System.ReadOnlySpan byteSpan;"); writer.WriteLine("char ch;"); hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm); @@ -748,11 +763,7 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, LoadTextSpanLocal(writer, defineLocal: true); writer.WriteLine(); - int labelCounter = 0; - string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}"; - void MarkLabel(string label, bool addEmptyStatement = false) => writer.WriteLine($"{label}:{(addEmptyStatement ? " ;" : "")}"); - void Goto(string label) => writer.WriteLine($"goto {label};"); - string doneLabel = "NoMatch"; + string doneLabel = ReserveName("NoMatch"); string originalDoneLabel = doneLabel; // Emit the code for all nodes in the tree. @@ -772,7 +783,7 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, // Emit failure writer.WriteLine("// No match"); - MarkLabel(originalDoneLabel, !expressionHasCaptures); + MarkLabel(originalDoneLabel, emitSemicolon: !expressionHasCaptures); if (expressionHasCaptures) { EmitUncaptureUntil("0"); @@ -936,10 +947,10 @@ static RegexNode CloneMultiWithoutFirstChar(RegexNode node) void EmitAllBranches() { // Label to jump to when any branch completes successfully. - string doneAlternateLabel = DefineLabel("Match"); + string doneAlternateLabel = ReserveName("Match"); // Save off runtextpos. We'll need to reset this each time a branch fails. 
- string startingRunTextPosName = NextLocalName("startingRunTextPos"); + string startingRunTextPosName = ReserveName("startingRunTextPos"); writer.WriteLine($"int {startingRunTextPosName} = runtextpos;"); int startingTextSpanPos = textSpanPos; @@ -948,7 +959,7 @@ void EmitAllBranches() // as the alternation is atomic, so we're not concerned about captures after // the alternation. bool hasStartingCrawlpos = (node.Options & RegexNode.HasCapturesFlag) != 0; - string startingCrawlPos = NextLocalName("startingCrawlPos"); + string startingCrawlPos = ReserveName("startingCrawlPos"); if (hasStartingCrawlpos) { writer.WriteLine($"int {startingCrawlPos} = base.Crawlpos();"); @@ -963,7 +974,7 @@ void EmitAllBranches() { using var __ = EmitScope(writer, $"Branch {i}"); - string nextBranch = DefineLabel("NoMatch"); + string nextBranch = ReserveName("NoMatch"); doneLabel = nextBranch; // Emit the code for each branch. @@ -980,7 +991,7 @@ void EmitAllBranches() // setting runtextpos back to what it was at the beginning of the alternation, // updating textSpan to be the full length it was, and if there's a capture that // needs to be reset, uncapturing it. - MarkLabel(nextBranch); + MarkLabel(nextBranch, emitSemicolon: false); writer.WriteLine($"runtextpos = {startingRunTextPosName};"); LoadTextSpanLocal(writer); textSpanPos = startingTextSpanPos; @@ -997,13 +1008,13 @@ void EmitAllBranches() { if (hasStartingCrawlpos) { - string uncapture = DefineLabel("Uncapture"); + string uncapture = ReserveName("Uncapture"); doneLabel = uncapture; EmitNode(node.Child(childCount - 1)); doneLabel = postAlternateDoneLabel; TransferTextSpanPosToRunTextPos(); writer.WriteLine($"goto {doneAlternateLabel};"); - MarkLabel(uncapture); + MarkLabel(uncapture, emitSemicolon: false); EmitUncaptureUntil(startingCrawlPos); writer.WriteLine($"goto {doneLabel};"); } @@ -1017,34 +1028,180 @@ void EmitAllBranches() // Successfully completed the alternate. 
MarkLabel(doneAlternateLabel); - writer.WriteLine(";"); Debug.Assert(textSpanPos == 0); } } + // Emits the code to handle a backreference. + void EmitBackreference(RegexNode node) + { + int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); + + TransferTextSpanPosToRunTextPos(); + + using (EmitBlock(writer, $"if (base.IsMatched({capnum}))")) + { + string matchLength = ReserveName("matchLength"); + writer.WriteLine($"int {matchLength} = base.MatchLength({capnum});"); + using (EmitBlock(writer, $"if ({textSpanLocal}.Length < {matchLength})")) + { + writer.WriteLine($"goto {doneLabel};"); + } + writer.WriteLine(); + + string matchIndex = ReserveName("matchIndex"); + writer.WriteLine($"int {matchIndex} = base.MatchIndex({capnum});"); + + string i = ReserveName("i"); + using (EmitBlock(writer, $"for (int {i} = 0; {i} < {matchLength}; {i}++)")) + { + using (EmitBlock(writer, $"if ({ToLowerIfNeeded(hasTextInfo, options, $"runtext[{matchIndex} + {i}]", IsCaseInsensitive(node))} != {ToLowerIfNeeded(hasTextInfo, options, $"{textSpanLocal}[{i}]", IsCaseInsensitive(node))})")) + { + writer.WriteLine($"goto {doneLabel};"); + } + } + writer.WriteLine(); + + writer.WriteLine($"runtextpos += {matchLength};"); + LoadTextSpanLocal(writer); + } + + if ((node.Options & RegexOptions.ECMAScript) == 0) + { + using (EmitBlock(writer, "else")) + { + writer.WriteLine($"goto {doneLabel};"); + } + } + } + + // Emits the code for an if(backreference)-then-else conditional. 
+ void EmitBackreferenceConditional(RegexNode node) + { + int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); + int startingTextSpanPos = textSpanPos; + + using (EmitBlock(writer, $"if (base.IsMatched({capnum}))")) + { + EmitNode(node.Child(0)); + TransferTextSpanPosToRunTextPos(); + } + + if (node.ChildCount() > 1) + { + textSpanPos = startingTextSpanPos; + using (EmitBlock(writer, "else")) + { + EmitNode(node.Child(1)); + TransferTextSpanPosToRunTextPos(); + } + } + } + + // Emits the code for an if(expression)-then-else conditional. + void EmitExpressionConditional(RegexNode node) + { + // The first child node is the conditional expression. If this matches, then we branch to the "yes" branch. + // If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes" + // branch, otherwise. The conditional is treated as a positive lookahead. If it's not already + // such a node, wrap it in one. + RegexNode conditional = node.Child(0); + if (conditional is not { Type: RegexNode.Require }) + { + var newConditional = new RegexNode(RegexNode.Require, conditional.Options); + newConditional.AddChild(conditional); + conditional = newConditional; + } + + // Get the "yes" branch and the optional "no" branch, if it exists. + RegexNode yesBranch = node.Child(1); + RegexNode? noBranch = node.ChildCount() > 2 ? node.Child(2) : null; + + string end = ReserveName("end"); + string? no = noBranch is not null ? ReserveName("NoMatch") : null; + + // If the conditional expression has captures, we'll need to uncapture them in the case of no match. + string? startingCrawlPos = null; + if ((conditional.Options & RegexNode.HasCapturesFlag) != 0) + { + startingCrawlPos = ReserveName("startingCrawlPos"); + writer.WriteLine($"int {startingCrawlPos} = base.Crawlpos();"); + writer.WriteLine(); + } + + // Emit the conditional expression. 
We need to reroute any match failures to either the "no" branch + // if it exists, or to the end of the node (skipping the "yes" branch) if it doesn't. + string originalDoneLabel = doneLabel; + string tmpDoneLabel = no ?? end; + doneLabel = tmpDoneLabel; + EmitPositiveLookaheadAssertion(conditional); + if (doneLabel == tmpDoneLabel) + { + doneLabel = originalDoneLabel; + } + + // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch. + // Since the "yes" branch may have a different execution path than the "no" branch or the lack of + // any branch, we need to store the current textSpanPosition and reset it prior to emitting the code + // for what comes after the "yes" branch, so that everyone is on equal footing. + int startingTextSpanPos = textSpanPos; + EmitNode(yesBranch); + TransferTextSpanPosToRunTextPos(); // ensure all subsequent code sees the same textSpanPos value by setting it to 0 + + // If there's a no branch, we need to emit it, but skipping it from a successful "yes" branch match. + if (no is not null) + { + writer.WriteLine($"goto {end};"); + writer.WriteLine(); + + // Emit the no branch, first uncapturing any captures from the expression condition that failed + // to match and emit the branch. + MarkLabel(no, emitSemicolon: startingCrawlPos is null); + if (startingCrawlPos is not null) + { + EmitUncaptureUntil(startingCrawlPos); + } + textSpanPos = startingTextSpanPos; + EmitNode(noBranch); + TransferTextSpanPosToRunTextPos(); // ensure all subsequent code sees the same textSpanPos value by setting it to 0 + } + + MarkLabel(end); + } + // Emits the code for a Capture node. void EmitCapture(RegexNode node, RegexNode? subsequent = null) { - Debug.Assert(node.N == -1); - - // Get the capture number. This needs to be kept in sync with MapCapNum in RegexWriter. 
Debug.Assert(node.Type == RegexNode.Capture); - Debug.Assert(node.N == -1, "Currently only support capnum, not uncapnum"); - int capnum = node.M; - if (capnum != -1 && rm.Code.Caps != null) + int capnum = RegexParser.MapCaptureNumber(node.M, rm.Code.Caps); + int uncapnum = RegexParser.MapCaptureNumber(node.N, rm.Code.Caps); + + if (uncapnum != -1) { - capnum = (int)rm.Code.Caps[capnum]!; + using (EmitBlock(writer, $"if (!base.IsMatched({uncapnum}))")) + { + writer.WriteLine($"goto {doneLabel};"); + } + writer.WriteLine(); } TransferTextSpanPosToRunTextPos(); - string startingRunTextPosName = NextLocalName("startingRunTextPos"); + string startingRunTextPosName = ReserveName("startingRunTextPos"); writer.WriteLine($"int {startingRunTextPosName} = runtextpos;"); + writer.WriteLine(); // Emit child node. EmitNode(node.Child(0), subsequent); TransferTextSpanPosToRunTextPos(); - writer.WriteLine($"base.Capture({capnum}, {startingRunTextPosName}, runtextpos);"); + if (uncapnum == -1) + { + writer.WriteLine($"base.Capture({capnum}, {startingRunTextPosName}, runtextpos);"); + } + else + { + writer.WriteLine($"base.TransferCapture({capnum}, {uncapnum}, {startingRunTextPosName}, runtextpos);"); + } } // Emits code to unwind the capture stack until the crawl position specified in the provided local. @@ -1060,8 +1217,9 @@ void EmitUncaptureUntil(string crawlpos) void EmitPositiveLookaheadAssertion(RegexNode node) { // Save off runtextpos. We'll need to reset this upon successful completion of the lookahead. - string startingRunTextPosName = NextLocalName("startingRunTextPos"); + string startingRunTextPosName = ReserveName("startingRunTextPos"); writer.WriteLine($"int {startingRunTextPosName} = runtextpos;"); + writer.WriteLine(); int startingTextSpanPos = textSpanPos; // Emit the child. @@ -1078,12 +1236,12 @@ void EmitPositiveLookaheadAssertion(RegexNode node) void EmitNegativeLookaheadAssertion(RegexNode node) { // Save off runtextpos. 
We'll need to reset this upon successful completion of the lookahead. - string startingRunTextPosName = NextLocalName("startingRunTextPos"); + string startingRunTextPosName = ReserveName("startingRunTextPos"); writer.WriteLine($"int {startingRunTextPosName} = runtextpos;"); int startingTextSpanPos = textSpanPos; string originalDoneLabel = doneLabel; - string negativeLookaheadDoneLabel = DefineLabel("Match"); + string negativeLookaheadDoneLabel = ReserveName("Match"); doneLabel = negativeLookaheadDoneLabel; // Emit the child. @@ -1091,10 +1249,10 @@ void EmitNegativeLookaheadAssertion(RegexNode node) // If the generated code ends up here, it matched the lookahead, which actually // means failure for a _negative_ lookahead, so we need to jump to the original done. - Goto(originalDoneLabel); + writer.WriteLine($"goto {originalDoneLabel};"); // Failures (success for a negative lookahead) jump here. - MarkLabel(negativeLookaheadDoneLabel); + MarkLabel(negativeLookaheadDoneLabel, emitSemicolon: false); Debug.Assert(doneLabel == negativeLookaheadDoneLabel); doneLabel = originalDoneLabel; @@ -1122,7 +1280,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck return; case RegexNode.Atomic: - EmitNode(node.Child(0), subsequent); + EmitAtomic(node, subsequent); return; } @@ -1165,7 +1323,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck break; case RegexNode.Loop: - EmitAtomicNodeLoop(node); + EmitLoop(node); break; case RegexNode.Onelazy: @@ -1189,6 +1347,18 @@ void EmitNode(RegexNode node, RegexNode? 
subsequent = null, bool emitLengthCheck EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; + case RegexNode.Ref: + EmitBackreference(node); + break; + + case RegexNode.Testref: + EmitBackreferenceConditional(node); + break; + + case RegexNode.Testgroup: + EmitExpressionConditional(node); + break; + case RegexNode.Capture: EmitCapture(node, subsequent); break; @@ -1251,6 +1421,17 @@ static bool NodesWithCrossScopeLabels(RegexNode node, HashSet results } } + // Emits the node for an atomic. + void EmitAtomic(RegexNode node, RegexNode? subsequent) + { + // Atomic simply outputs the code for the child, but it ensures that any done label left + // set by the child is reset to what it was prior to the node's processing. That way, + // anything later that tries to jump back won't see labels set inside the atomic. + string originalDoneLabel = doneLabel; + EmitNode(node.Child(0), subsequent); + doneLabel = originalDoneLabel; + } + // Emits the code to handle updating base.runtextpos to runtextpos in response to // an UpdateBumpalong node. This is used when we want to inform the scan loop that // it should bump from this location rather than from the original location. @@ -1419,7 +1600,7 @@ void EmitAnchors(RegexNode node) { // If we statically know we've already matched part of the regex, there's no way we're at the // beginning or start, as we've already progressed past it. - Goto(doneLabel); + writer.WriteLine($"goto {doneLabel};"); } else { @@ -1567,7 +1748,7 @@ void EmitOr() else { EmitSpanLengthCheck(str.Length); - string i = NextLocalName("i"); + string i = ReserveName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {Literal(node.Str)}.Length; {i}++)")) { string textSpanIndex = textSpanPos > 0 ? $"{i} + {textSpanPos}" : i; @@ -1595,10 +1776,10 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL // of the wrapped loop (1). 
Debug.Assert(node.M < node.N); - string backtrackingLabel = DefineLabel("Backtrack"); - string endLoop = DefineLabel("EndLoop"); - string startingPos = NextLocalName("startingRunTextPos"); - string endingPos = NextLocalName("endingRunTextPos"); + string backtrackingLabel = ReserveName("Backtrack"); + string endLoop = ReserveName("EndLoop"); + string startingPos = ReserveName("startingRunTextPos"); + string endingPos = ReserveName("endingRunTextPos"); // We're about to enter a loop, so ensure our text position is 0. TransferTextSpanPosToRunTextPos(); @@ -1615,7 +1796,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL string? crawlPos = null; if (expressionHasCaptures) { - crawlPos = NextLocalName("crawlPos"); + crawlPos = ReserveName("crawlPos"); writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); } if (node.M > 0) @@ -1628,7 +1809,7 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL // Backtracking section. Subsequent failures will jump to here, at which // point we decrement the matched count as long as it's above the minimum // required, and try again by flowing to everything that comes after this. - MarkLabel(backtrackingLabel); + MarkLabel(backtrackingLabel, emitSemicolon: false); string originalDoneLabel = doneLabel; using (EmitBlock(writer, $"if ({startingPos} >= {endingPos})")) { @@ -1704,8 +1885,8 @@ void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true) string? maxIterations = null; if (node.N != int.MaxValue) { - iterationCount = NextLocalName("i"); - maxIterations = NextLocalName("maxIterations"); + iterationCount = ReserveName("i"); + maxIterations = ReserveName("maxIterations"); writer.WriteLine($"int {iterationCount} = 0;"); writer.WriteLine($"int {maxIterations} = {node.N - node.M};"); } @@ -1714,24 +1895,24 @@ void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true) string? 
crawlPos = null; if (expressionHasCaptures) { - crawlPos = NextLocalName("crawlPos"); + crawlPos = ReserveName("crawlPos"); writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); } // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which // is also incremented each time we match another character in the loop. - string nextPos = NextLocalName("nextPos"); + string nextPos = ReserveName("nextPos"); writer.WriteLine($"int {nextPos} = runtextpos;"); // Skip the backtracking section for the initial subsequent matching. We've already matched the // minimum number of iterations, which means we can successfully match with zero additional iterations. - string endLoopLabel = DefineLabel("endLoop"); + string endLoopLabel = ReserveName("endLoop"); writer.WriteLine($"goto {endLoopLabel};"); writer.WriteLine(); // Backtracking section. Subsequent failures will jump to here. - string backtrackingLabel = DefineLabel("Backtrack"); - MarkLabel(backtrackingLabel); + string backtrackingLabel = ReserveName("Backtrack"); + MarkLabel(backtrackingLabel, emitSemicolon: false); // Uncapture any captures if the expression has any. It's possible the captures it has // are before this node, in which case this is wasted effort, but still functionally correct. @@ -1774,7 +1955,7 @@ void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true) doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes writer.WriteLine(); - MarkLabel(endLoopLabel, addEmptyStatement: true); + MarkLabel(endLoopLabel); // We explicitly do not reset doneLabel back to originalDoneLabel. // It's left pointing to the backtracking label for everything subsequent in the expression. 
@@ -1812,7 +1993,7 @@ void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true) { string spanLocal = "slice"; // As this repeater doesn't wrap arbitrary node emits, this shouldn't conflict with anything writer.WriteLine($"global::System.ReadOnlySpan {spanLocal} = {textSpanLocal}.Slice({textSpanPos}, {iterations});"); - string i = NextLocalName("i"); + string i = ReserveName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {spanLocal}.Length; {i}++)")) { EmitTimeoutCheck(writer, hasTimeout); @@ -1848,7 +2029,7 @@ void EmitNodeRepeater(RegexNode node) // Ensure textSpanPos is 0 prior to emitting the child. TransferTextSpanPosToRunTextPos(); - string i = NextLocalName("i"); + string i = ReserveName("i"); using (EmitBlock(writer, $"for (int {i} = 0; {i} < {iterations}; {i}++)")) { EmitTimeoutCheck(writer, hasTimeout); @@ -1882,7 +2063,7 @@ void EmitSingleCharAtomicLoop(RegexNode node, bool emitLengthChecksIfRequired = Span setChars = stackalloc char[5]; // 5 is max optimized by IndexOfAny today int numSetChars = 0; - string iterationLocal = NextLocalName("i"); + string iterationLocal = ReserveName("i"); if (node.IsNotoneFamily && maxIterations == int.MaxValue && (!IsCaseInsensitive(node))) @@ -2018,8 +2199,86 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) } } + void EmitLoop(RegexNode node) + { + // If the loop is atomic, emit it as such and avoid all backtracking. + if (node.Next is { Type: RegexNode.Atomic }) + { + EmitAtomicNodeLoop(node); + return; + } + + // If the loop is actually a repeater, similarly emit it as such and avoid all backtracking. + if (node.M == node.N) + { + EmitNodeRepeater(node); + return; + } + + // Emit backtracking around an atomic loop, but tracking the starting position of each iteration + // along the way so that we can backtrack through each position. 
+ + Debug.Assert(node.M < node.N); + string backtrackingLabel = ReserveName("Backtrack"); + string endLoop = ReserveName("EndLoop"); + string startingTrackPos = ReserveName("startingTrackPos"); + string endingTrackPos = ReserveName("endingTrackPos"); + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // Grab the current position, then emit the loop as atomic, except keeping track of the position + // before each iteration match, which enables us to then apply the backtracking. + writer.WriteLine($"int {startingTrackPos} = runstackpos;"); + EmitAtomicNodeLoop(node, trackStartingPositions: true); + TransferTextSpanPosToRunTextPos(); + writer.WriteLine($"int {endingTrackPos} = runstackpos;"); + string? crawlPos = null; + if (expressionHasCaptures) + { + crawlPos = ReserveName("crawlPos"); + writer.WriteLine($"int {crawlPos} = base.Crawlpos();"); + } + if (node.M > 0) + { + writer.WriteLine($"{startingTrackPos} += {node.M};"); + } + writer.WriteLine($"goto {endLoop};"); + writer.WriteLine(); + + // Backtracking section. Subsequent failures will jump to here, at which + // point we decrement the matched count as long as it's above the minimum + // required, and try again by flowing to everything that comes after this. + MarkLabel(backtrackingLabel, emitSemicolon: false); + string originalDoneLabel = doneLabel; + using (EmitBlock(writer, $"if ({startingTrackPos} >= {endingTrackPos})")) + { + writer.WriteLine($"goto {originalDoneLabel};"); + } + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + if (expressionHasCaptures) + { + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. 
+ EmitUncaptureUntil(crawlPos); + } + + writer.WriteLine($"runtextpos = base.runstack![--{endingTrackPos}];"); + + LoadTextSpanLocal(writer); + writer.WriteLine(); + + MarkLabel(endLoop); + + // We explicitly do not reset doneLabel back to originalDoneLabel. + // It's left pointing to the backtracking label for everything subsequent in the expression. + } + // Emits the code to handle a non-backtracking, variable-length loop around another node. - void EmitAtomicNodeLoop(RegexNode node) + // If trackStartingPositions is true, it will also handle emitting code to use runstack[runstackpos] + // to store the starting positions of each iteration. + void EmitAtomicNodeLoop(RegexNode node, bool trackStartingPositions = false) { Debug.Assert(node.Type == RegexNode.Loop, $"Unexpected type: {node.Type}"); Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}"); @@ -2035,7 +2294,7 @@ void EmitAtomicNodeLoop(RegexNode node) } string originalDoneLabel = doneLabel; - string atomicNodeLabel = DefineLabel("NoMatch"); + string atomicNodeLabel = ReserveName("NoMatch"); doneLabel = atomicNodeLabel; // We might loop any number of times. In order to ensure this loop @@ -2046,39 +2305,51 @@ void EmitAtomicNodeLoop(RegexNode node) TransferTextSpanPosToRunTextPos(); // int i = 0; - string iterationLocal = NextLocalName("iter"); + string iterationLocal = ReserveName("iter"); writer.WriteLine($"int {iterationLocal} = 0;"); using (EmitBlock(writer, maxIterations == int.MaxValue ? "while (true)" : $"while ({iterationLocal} < {maxIterations})")) { EmitTimeoutCheck(writer, hasTimeout); - string successfulIterationLabel = DefineLabel("Match"); + string successfulIterationLabel = ReserveName("Match"); // Iteration body string prevDone = doneLabel; - string iterationLabel = DefineLabel("NoMatch"); + string iterationLabel = ReserveName("NoMatch"); doneLabel = iterationLabel; // Save off runtextpos. 
- string startingRunTextPosLocal = NextLocalName("startingRunTextPos"); + string startingRunTextPosLocal = ReserveName("startingRunTextPos"); writer.WriteLine($"int {startingRunTextPosLocal} = runtextpos;"); + if (trackStartingPositions) + { + // Track the starting position of each loop iteration to enable backtracking. + writer.WriteLine(); + using (EmitBlock(writer, "if (runstackpos == base.runstack!.Length)")) + { + writer.WriteLine("global::System.Array.Resize(ref base.runstack, base.runstack.Length * 2);"); + } + writer.WriteLine("base.runstack[runstackpos++] = runtextpos;"); + writer.WriteLine(); + } + // Emit the child. Debug.Assert(textSpanPos == 0); EmitNode(node.Child(0)); TransferTextSpanPosToRunTextPos(); // ensure textSpanPos remains 0 - Goto(successfulIterationLabel); // iteration succeeded + writer.WriteLine($"goto {successfulIterationLabel};"); // iteration succeeded // If the generated code gets here, the iteration failed. // Reset state, branch to done. - MarkLabel(iterationLabel); + MarkLabel(iterationLabel, emitSemicolon: false); Debug.Assert(doneLabel == iterationLabel); doneLabel = prevDone; // reset done label writer.WriteLine($"runtextpos = {startingRunTextPosLocal};"); - Goto(doneLabel); + writer.WriteLine($"goto {doneLabel};"); // Successful iteration. - MarkLabel(successfulIterationLabel); + MarkLabel(successfulIterationLabel, emitSemicolon: false); writer.WriteLine($"{iterationLocal}++;"); } @@ -2095,11 +2366,6 @@ void EmitAtomicNodeLoop(RegexNode node) writer.WriteLine($"goto {originalDoneLabel};"); } } - else - { - // Labels require a statement after them. 
- writer.WriteLine(";"); - } } } diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 054df1c5c33d9..58d5f5f1717ab 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -74,6 +74,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_stringIndexOfCharInt = typeof(string).GetMethod("IndexOf", new Type[] { typeof(char), typeof(int) })!; private static readonly MethodInfo s_stringLastIndexOfCharIntInt = typeof(string).GetMethod("LastIndexOf", new Type[] { typeof(char), typeof(int), typeof(int) })!; private static readonly MethodInfo s_textInfoToLowerMethod = typeof(TextInfo).GetMethod("ToLower", new Type[] { typeof(char) })!; + private static readonly MethodInfo s_arrayResize = typeof(Array).GetMethod("Resize")!.MakeGenericMethod(typeof(int)); protected ILGenerator? _ilg; @@ -1679,6 +1680,11 @@ private bool TryGenerateSimplifiedGo(RegexNode node) Ldloc(runtextposLocal); Stloc(originalruntextposLocal); + // int runstackpos = 0; + LocalBuilder runstackpos = DeclareInt32(); + Ldc(0); + Stloc(runstackpos); + // The implementation tries to use const indexes into the span wherever possible, which we can do // in all places except for variable-length loops. For everything else, we know at any point in // the regex exactly how far into it we are, and we can use that to index into the span created @@ -1939,20 +1945,217 @@ void EmitAtomicAlternate(RegexNode node) Debug.Assert(textSpanPos == 0); } + // Emits the code to handle a backreference. 
+ void EmitBackreference(RegexNode node) + { + int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); + + TransferTextSpanPosToRunTextPos(); + + Label end = DefineLabel(); + + // if (!base.IsMatched(capnum)) goto (!ecmascript ? doneLabel : end); + Ldthis(); + Ldc(capnum); + Call(s_isMatchedMethod); + BrfalseFar((node.Options & RegexOptions.ECMAScript) == 0 ? doneLabel : end); + + using RentedLocalBuilder matchLength = RentInt32Local(); + using RentedLocalBuilder matchIndex = RentInt32Local(); + using RentedLocalBuilder i = RentInt32Local(); + + // int matchLength = base.MatchLength(capnum); + Ldthis(); + Ldc(capnum); + Call(s_matchLengthMethod); + Stloc(matchLength); + + // if (textSpan.Length < matchLength) goto doneLabel; + Ldloca(textSpanLocal); + Call(s_spanGetLengthMethod); + Ldloc(matchLength); + BltFar(doneLabel); + + // int matchIndex = base.MatchIndex(capnum); + Ldthis(); + Ldc(capnum); + Call(s_matchIndexMethod); + Stloc(matchIndex); + + Label condition = DefineLabel(); + Label body = DefineLabel(); + + // for (int i = 0; ...) + Ldc(0); + Stloc(i); + Br(condition); + + MarkLabel(body); + + // if (runtext[matchIndex + i] != textSpan[i]) goto doneLabel; + Ldloc(runtextLocal); + Ldloc(matchIndex); + Ldloc(i); + Add(); + Call(s_stringGetCharsMethod); + if (IsCaseInsensitive(node)) + { + CallToLower(); + } + Ldloca(textSpanLocal); + Ldloc(i); + Call(s_spanGetItemMethod); + LdindU2(); + if (IsCaseInsensitive(node)) + { + CallToLower(); + } + BneFar(doneLabel); + + // for (...; ...; i++) + Ldloc(i); + Ldc(1); + Add(); + Stloc(i); + + // for (...; i < matchLength; ...) + MarkLabel(condition); + Ldloc(i); + Ldloc(matchLength); + Blt(body); + + // runtextpos += matchLength; + Ldloc(runtextposLocal); + Ldloc(matchLength); + Add(); + Stloc(runtextposLocal); + LoadTextSpanLocal(); + + MarkLabel(end); + } + + // Emits the code for an if(backreference)-then-else conditional. 
+ void EmitBackreferenceConditional(RegexNode node) + { + int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); + int startingTextSpanPos = textSpanPos; + bool hasNo = node.ChildCount() > 1; + + Label no = DefineLabel(); + Label end = DefineLabel(); + + // if (!base.IsMatched(capnum)) goto end/no; + Ldthis(); + Ldc(capnum); + Call(s_isMatchedMethod); + BrfalseFar(hasNo ? no : end); + + // yes branch + EmitNode(node.Child(0)); + TransferTextSpanPosToRunTextPos(); + + if (hasNo) + { + BrFar(end); + + // no branch + MarkLabel(no); + textSpanPos = startingTextSpanPos; + EmitNode(node.Child(1)); + TransferTextSpanPosToRunTextPos(); + } + + MarkLabel(end); + } + + // Emits the code for an if(expression)-then-else conditional. + void EmitExpressionConditional(RegexNode node) + { + // The first child node is the conditional expression. If this matches, then we branch to the "yes" branch. + // If it doesn't match, then we branch to the optional "no" branch if it exists, or simply skip the "yes" + // branch, otherwise. The conditional is treated as a positive lookahead. If it's not already + // such a node, wrap it in one. + RegexNode conditional = node.Child(0); + if (conditional is not { Type: RegexNode.Require }) + { + var newConditional = new RegexNode(RegexNode.Require, conditional.Options); + newConditional.AddChild(conditional); + conditional = newConditional; + } + + // Get the "yes" branch and the optional "no" branch, if it exists. + RegexNode yesBranch = node.Child(1); + RegexNode? noBranch = node.ChildCount() > 2 ? node.Child(2) : null; + + Label end = DefineLabel(); + Label no = DefineLabel(); + + // If the conditional expression has captures, we'll need to uncapture them in the case of no match. + LocalBuilder? 
startingCrawlPos = null; + if ((conditional.Options & RegexNode.HasCapturesFlag) != 0) + { + // int startingCrawlPos = base.Crawlpos(); + startingCrawlPos = DeclareInt32(); + Ldthis(); + Call(s_crawlposMethod); + Stloc(startingCrawlPos); + } + + // Emit the conditional expression. We need to reroute any match failures to either the "no" branch + // if it exists, or to the end of the node (skipping the "yes" branch) if it doesn't. + Label originalDoneLabel = doneLabel; + Label tmpDoneLabel = noBranch is not null ? no : end; + doneLabel = tmpDoneLabel; + EmitPositiveLookaheadAssertion(conditional); + if (doneLabel == tmpDoneLabel) + { + doneLabel = originalDoneLabel; + } + + // If we get to this point of the code, the conditional successfully matched, so run the "yes" branch. + // Since the "yes" branch may have a different execution path than the "no" branch or the lack of + // any branch, we need to store the current textSpanPosition and reset it prior to emitting the code + // for what comes after the "yes" branch, so that everyone is on equal footing. + int startingTextSpanPos = textSpanPos; + EmitNode(yesBranch); + TransferTextSpanPosToRunTextPos(); // ensure all subsequent code sees the same textSpanPos value by setting it to 0 + + // If there's a no branch, we need to emit it, but skipping it from a successful "yes" branch match. + if (noBranch is not null) + { + BrFar(end); + + // Emit the no branch, first uncapturing any captures from the expression condition that failed + // to match and emit the branch. + MarkLabel(no); + if (startingCrawlPos is not null) + { + EmitUncaptureUntil(startingCrawlPos); + } + textSpanPos = startingTextSpanPos; + EmitNode(noBranch); + TransferTextSpanPosToRunTextPos(); // ensure all subsequent code sees the same textSpanPos value by setting it to 0 + } + + MarkLabel(end); + } + // Emits the code for a Capture node. void EmitCapture(RegexNode node, RegexNode? 
subsequent = null) { - Debug.Assert(node.N == -1); LocalBuilder startingRunTextPos = DeclareInt32(); - // Get the capture number. This needs to be kept - // in sync with MapCapNum in RegexWriter. Debug.Assert(node.Type == RegexNode.Capture); - Debug.Assert(node.N == -1, "Currently only support capnum, not uncapnum"); - int capnum = node.M; - if (capnum != -1 && _code!.Caps != null) + int capnum = RegexParser.MapCaptureNumber(node.M, _code!.Caps); + int uncapnum = RegexParser.MapCaptureNumber(node.N, _code.Caps); + + if (uncapnum != -1) { - capnum = (int)_code.Caps[capnum]!; + // if (!IsMatched(uncapnum)) goto doneLabel; + Ldthis(); + Ldc(uncapnum); + Call(s_isMatchedMethod); + BrfalseFar(doneLabel); } // runtextpos += textSpanPos; @@ -1967,13 +2170,27 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null) // runtextpos += textSpanPos; // textSpan = textSpan.Slice(textSpanPos); - // Capture(capnum, startingRunTextPos, runtextpos); TransferTextSpanPosToRunTextPos(); - Ldthis(); - Ldc(capnum); - Ldloc(startingRunTextPos); - Ldloc(runtextposLocal); - Call(s_captureMethod); + + if (uncapnum == -1) + { + // Capture(capnum, startingRunTextPos, runtextpos); + Ldthis(); + Ldc(capnum); + Ldloc(startingRunTextPos); + Ldloc(runtextposLocal); + Call(s_captureMethod); + } + else + { + // TransferCapture(capnum, uncapnum, startingRunTextPos, runtextpos); + Ldthis(); + Ldc(capnum); + Ldc(uncapnum); + Ldloc(startingRunTextPos); + Ldloc(runtextposLocal); + Call(s_transferCaptureMethod); + } } // Emits code to unwind the capture stack until the crawl position specified in the provided local. @@ -2091,7 +2308,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck break; case RegexNode.Loop: - EmitAtomicNodeLoop(node); + EmitLoop(node); break; case RegexNode.Onelazy: @@ -2102,7 +2319,7 @@ void EmitNode(RegexNode node, RegexNode? 
subsequent = null, bool emitLengthCheck break; case RegexNode.Atomic: - EmitNode(node.Child(0), subsequent); + EmitAtomic(node, subsequent); break; case RegexNode.Alternate: @@ -2119,6 +2336,18 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; + case RegexNode.Ref: + EmitBackreference(node); + break; + + case RegexNode.Testref: + EmitBackreferenceConditional(node); + break; + + case RegexNode.Testgroup: + EmitExpressionConditional(node); + break; + case RegexNode.Capture: EmitCapture(node, subsequent); break; @@ -2149,6 +2378,17 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck } } + // Emits the node for an atomic. + void EmitAtomic(RegexNode node, RegexNode? subsequent) + { + // Atomic simply outputs the code for the child, but it ensures that any done label left + // set by the child is reset to what it was prior to the node's processing. That way, + // anything later that tries to jump back won't see labels set inside the atomic. + Label originalDoneLabel = doneLabel; + EmitNode(node.Child(0), subsequent); + doneLabel = originalDoneLabel; + } + // Emits the code to handle updating base.runtextpos to runtextpos in response to // an UpdateBumpalong node. This is used when we want to inform the scan loop that // it should bump from this location rather than from the original location. @@ -3111,12 +3351,116 @@ void EmitAtomicSingleCharZeroOrOne(RegexNode node) MarkLabel(skipUpdatesLabel); } + // Emits the code to handle a backtracking loop + void EmitLoop(RegexNode node) + { + // If the loop is atomic, emit it as such and avoid all backtracking. + if (node.Next is { Type: RegexNode.Atomic }) + { + EmitAtomicNodeLoop(node); + return; + } + + // If the loop is actually a repeater, similarly emit it as such and avoid all backtracking. 
+ if (node.M == node.N) + { + EmitNodeRepeater(node); + return; + } + + // Emit backtracking around an atomic loop, but tracking the starting position of each iteration + // along the way so that we can backtrack through each position. + + Debug.Assert(node.M < node.N); + Label backtrackingLabel = DefineLabel(); + Label endLoop = DefineLabel(); + LocalBuilder startingStackPos = DeclareInt32(); + LocalBuilder endingStackPos = DeclareInt32(); + + // We're about to enter a loop, so ensure our text position is 0. + TransferTextSpanPosToRunTextPos(); + + // Grab the current position, then emit the loop as atomic, except keeping track of the position + // before each iteration match, which enables us to then apply the backtracking. + + // int startingStackPos = runstackpos; + Ldloc(runstackpos); + Stloc(startingStackPos); + + EmitAtomicNodeLoop(node, trackStartingPositions: true); + TransferTextSpanPosToRunTextPos(); + + // int endingStackPos = runstackpos; + Ldloc(runstackpos); + Stloc(endingStackPos); + + LocalBuilder? crawlPos = null; + if (expressionHasCaptures) + { + // int crawlPos = base.Crawlpos(); + crawlPos = DeclareInt32(); + Ldthis(); + Call(s_crawlposMethod); + Stloc(crawlPos); + } + if (node.M > 0) + { + // startingStackPos += node.M; + Ldloc(startingStackPos); + Ldc(node.M); + Add(); + Stloc(startingStackPos); + } + + // goto endLoop; + BrFar(endLoop); + + // Backtracking section. Subsequent failures will jump to here, at which + // point we decrement the matched count as long as it's above the minimum + // required, and try again by flowing to everything that comes after this. 
+ + // Backtrack: + // if (startingStackPos >= endingStackPos) goto doneLabel; + MarkLabel(backtrackingLabel); + Label originalDoneLabel = doneLabel; + Ldloc(startingStackPos); + Ldloc(endingStackPos); + BgeFar(originalDoneLabel); + doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes + + if (expressionHasCaptures) + { + // Uncapture any captures if the expression has any. It's possible the captures it has + // are before this node, in which case this is wasted effort, but still functionally correct. + EmitUncaptureUntil(crawlPos!); + } + + // runtextpos = base.runstack[--endingStackPos]; + Ldthisfld(s_runstackField); + Ldloc(endingStackPos); + Ldc(1); + Sub(); + Stloc(endingStackPos); + Ldloc(endingStackPos); + LdelemI4(); + Stloc(runtextposLocal); + + LoadTextSpanLocal(); + + MarkLabel(endLoop); + + // We explicitly do not reset doneLabel back to originalDoneLabel. + // It's left pointing to the backtracking label for everything subsequent in the expression. + } + // Emits the code to handle a non-backtracking, variable-length loop around another node. - void EmitAtomicNodeLoop(RegexNode node) + // If trackStartingPositions is true, it will also handle emitting code to use runstack[runstackpos] + // to store the starting positions of each iteration. + void EmitAtomicNodeLoop(RegexNode node, bool trackStartingPositions = false) { - Debug.Assert(node.Type == RegexNode.Loop); - Debug.Assert(node.M == node.N || (node.Next != null && (node.Next.Type is RegexNode.Atomic or RegexNode.Capture))); - Debug.Assert(node.M < int.MaxValue); + Debug.Assert(node.Type == RegexNode.Loop, $"Unexpected type: {node.Type}"); + Debug.Assert(node.M < int.MaxValue, $"Unexpected M={node.M}"); + Debug.Assert(node.N >= node.M, $"Unexpected M={node.M}, N={node.N}"); // If this is actually a repeater, emit that instead. 
if (node.M == node.N) @@ -3168,6 +3512,43 @@ void EmitAtomicNodeLoop(RegexNode node) Ldloc(runtextposLocal); Stloc(startingRunTextPosLocal); + if (trackStartingPositions) + { + // Track the starting position of each loop iteration to enable backtracking. + + // if (runstackpos != base.runstack.Length) goto storeLabel; + Ldloc(runstackpos); + Ldthisfld(s_runstackField); + Ldlen(); + Label storeLabel = DefineLabel(); + Bne(storeLabel); + + // Array.Resize(ref base.runstack, base.runstack.Length * 2); + Ldthis(); + _ilg!.Emit(OpCodes.Ldflda, s_runstackField); + Ldthisfld(s_runstackField); + Ldlen(); + _ilg!.Emit(OpCodes.Conv_I4); + Ldc(2); + Mul(); + Call(s_arrayResize); + + // storeLabel: + MarkLabel(storeLabel); + + // base.runstack[runstackpos] = runtextpos; + Ldthisfld(s_runstackField); + Ldloc(runstackpos); + Ldloc(runtextposLocal); + StelemI4(); + + // runstackpos++; + Ldloc(runstackpos); + Ldc(1); + Add(); + Stloc(runstackpos); + } + // Emit the child. Debug.Assert(textSpanPos == 0); EmitNode(node.Child(0)); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index d53ddaf082c2e..d936322c02438 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -240,6 +240,8 @@ private void MakeLoopAtomic() [Conditional("DEBUG")] private void ValidateFinalTreeInvariants() { + Debug.Assert(Type == Capture, "Every generated tree should begin with a capture node"); + var toExamine = new Stack(); toExamine.Push(this); while (toExamine.Count > 0) @@ -306,8 +308,11 @@ private void ValidateFinalTreeInvariants() break; case Testref: + Debug.Assert(childCount is 1 or 2, $"Expected one or two children for {node.TypeName}, got {childCount}"); + break; + case Testgroup: - 
Debug.Assert(childCount >= 1, $"Expected at least one child for {node.TypeName}, got {childCount}."); + Debug.Assert(childCount is 2 or 3, $"Expected two or three children for {node.TypeName}, got {childCount}"); break; case Concatenate: @@ -2229,9 +2234,18 @@ internal bool SupportsSimplifiedCodeGenerationImplementation() case Empty: case Nothing: case UpdateBumpalong: + // Backreferences are supported + case Ref: supported = true; break; + // Conditional backreference tests are also supported, so long as both their yes/no branches are supported. + case Testref: + supported = + Child(0).SupportsSimplifiedCodeGenerationImplementation() && + (childCount == 1 || Child(1).SupportsSimplifiedCodeGenerationImplementation()); + break; + // Single character greedy/lazy loops are supported if either they're actually a repeater // or they're not contained in any construct other than simple nesting (e.g. concat, capture). case Oneloop: @@ -2244,17 +2258,10 @@ internal bool SupportsSimplifiedCodeGenerationImplementation() supported = M == N || AncestorsAllowBacktracking(Next); break; - // Loop repeaters are the same, except their child also needs to be supported. - // We also support such loops being atomic. + // For greedy and lazy loops, they're supported if the node they wrap is supported + // and either the node is actually a repeater, is atomic, or is in the tree in a + // location where backtracking is allowed. case Loop: - supported = - (M == N || (Next != null && Next.Type == Atomic)) && - Child(0).SupportsSimplifiedCodeGenerationImplementation(); - break; - - // Similarly, as long as the wrapped node supports simplified code gen, - // Lazy is supported if it's a repeater or atomic, but also if it's in - // a place where backtracking is allowed (e.g. it's top-level). 
case Lazyloop: supported = (M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) && @@ -2297,11 +2304,10 @@ internal bool SupportsSimplifiedCodeGenerationImplementation() break; case Capture: - // Currently we only support capnums without uncapnums (for balancing groups) - supported = N == -1; + supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); if (supported) { - // And we only support them in certain places in the tree. + // Captures are currently only supported in certain places in the tree. RegexNode? parent = Next; while (parent != null) { @@ -2322,25 +2328,31 @@ internal bool SupportsSimplifiedCodeGenerationImplementation() } } + // If we've found a supported capture, mark all of the nodes in its parent + // hierarchy as containing a capture. if (supported) { - // And we only support them if their children are supported. - supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); - - // If we've found a supported capture, mark all of the nodes in its parent - // hierarchy as containing a capture. 
- if (supported) + parent = this; + while (parent != null && ((parent.Options & HasCapturesFlag) == 0)) { - parent = this; - while (parent != null && ((parent.Options & HasCapturesFlag) == 0)) - { - parent.Options |= HasCapturesFlag; - parent = parent.Next; - } + parent.Options |= HasCapturesFlag; + parent = parent.Next; } } } break; + + case Testgroup: + supported = + Child(0).SupportsSimplifiedCodeGenerationImplementation() && + Child(1).SupportsSimplifiedCodeGenerationImplementation() && + (childCount == 2 || Child(2).SupportsSimplifiedCodeGenerationImplementation()); + break; + + default: + Debug.Fail($"Unknown type: {Type}"); + supported = false; + break; } } #if DEBUG diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index 0bda8a2367ed6..e327952a2560e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -2071,6 +2071,17 @@ private bool IsCaptureSlot(int i) return i >= 0 && i < _capsize; } + /// + /// When generating code on a regex that uses a sparse set + /// of capture slots, we hash them to a dense set of indices + /// for an array of capture slots. Instead of doing the hash + /// at match time, it's done at compile time, here. + /// + internal static int MapCaptureNumber(int capnum, Hashtable? caps) => + capnum == -1 ? -1 : + caps != null ? (int)caps[capnum]! : + capnum; + /// Looks up the slot number for a given name private bool IsCaptureName(string capname) => _capnames != null && _capnames.ContainsKey(capname); @@ -2171,7 +2182,7 @@ private void AddConcatenate(int pos, int cch, bool isReplacement) _concatenation!.AddChild(RegexNode.CreateOneWithCaseConversion(_pattern[pos], isReplacement ? 
_options & ~RegexOptions.IgnoreCase : _options, _culture)); break; - case > 1 when !UseOptionI() || isReplacement: + case > 1 when !UseOptionI() || isReplacement || !RegexCharClass.ParticipatesInCaseConversion(_pattern.AsSpan(pos, cch)): _concatenation!.AddChild(new RegexNode(RegexNode.Multi, _options & ~RegexOptions.IgnoreCase, _pattern.Substring(pos, cch))); break; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs index 93420b2381381..5ef7281b5884d 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexWriter.cs @@ -214,17 +214,6 @@ private int StringCode(string str) return i; } - /// - /// When generating code on a regex that uses a sparse set - /// of capture slots, we hash them to a dense set of indices - /// for an array of capture slots. Instead of doing the hash - /// at match time, it's done at compile time, here. - /// - private int MapCapnum(int capnum) => - capnum == -1 ? -1 : - _caps != null ? (int)_caps[capnum]! : - capnum; - /// /// The main RegexCode generator. 
It does a depth-first walk /// through the tree and calls EmitFragment to emits code before @@ -283,7 +272,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex) Emit(RegexCode.Setjump); _intStack.Append(_emitted.Length); Emit(RegexCode.Lazybranch, 0); - Emit(RegexCode.Testref, MapCapnum(node.M)); + Emit(RegexCode.Testref, RegexParser.MapCaptureNumber(node.M, _caps)); Emit(RegexCode.Forejump); break; } @@ -391,7 +380,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex) break; case RegexNode.Capture | AfterChild: - Emit(RegexCode.Capturemark, MapCapnum(node.M), MapCapnum(node.N)); + Emit(RegexCode.Capturemark, RegexParser.MapCaptureNumber(node.M, _caps), RegexParser.MapCaptureNumber(node.N, _caps)); break; case RegexNode.Require | BeforeChild: @@ -471,7 +460,7 @@ private void EmitFragment(int nodetype, RegexNode node, int curIndex) break; case RegexNode.Ref: - Emit(node.Type | bits, MapCapnum(node.M)); + Emit(node.Type | bits, RegexParser.MapCaptureNumber(node.M, _caps)); break; case RegexNode.Nothing: diff --git a/src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs b/src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs index 65289ac73d121..9fa5749f667d0 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/MonoRegexTests.cs @@ -70,7 +70,10 @@ public static IEnumerable ValidateRegex_MemberData() (string Pattern, RegexOptions Options, string Input, string Expected) testCase = allEngineCases[i]; yield return new object[] { engine, testCase.Pattern, testCase.Options, results[i], testCase.Input, expected }; - yield return new object[] { engine, testCase.Pattern, testCase.Options | RegexOptions.CultureInvariant, results[i], testCase.Input, expected }; + if ((testCase.Options & RegexOptions.IgnoreCase) != 0) + { + yield return new object[] { engine, testCase.Pattern, testCase.Options | 
RegexOptions.CultureInvariant, results[i], testCase.Input, expected }; + } } } diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs index 3204d7a989268..723bb034acc6c 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.Groups.Tests.cs @@ -619,6 +619,15 @@ public static IEnumerable Groups_Basic_TestData() yield return new object[] { engine, null, @"(.*)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages/homepage.aspx", "index" } }; yield return new object[] { engine, null, @"(.*)/(.+)/(.+).aspx", "/pages/homepage.aspx/index.aspx", RegexOptions.None, new string[] { "/pages/homepage.aspx/index.aspx", "/pages", "homepage.aspx", "index" } }; + // Captures inside varying constructs with backtracking needing to uncapture + yield return new object[] { engine, null, @"a(bc)d|abc(e)", "abce", RegexOptions.None, new string[] { "abce", "", "e" } }; // alternation + yield return new object[] { engine, null, @"((ab){2}cd)*", "ababcdababcdababc", RegexOptions.None, new string[] { "ababcdababcd", "ababcd", "ab" } }; // loop + yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "aba", RegexOptions.None, new string[] { "a", "", "" } }; // positive lookahead in a loop + yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "ababa", RegexOptions.None, new string[] { "aba", "ab", "a" } }; // positive lookahead in a loop + yield return new object[] { engine, null, @"(ab(?=(\w)\w))*a", "abababa", RegexOptions.None, new string[] { "ababa", "ab", "a" } }; // positive lookahead in a loop + yield return new object[] { engine, null, @"\w\w(?!(\d)\d)", "aa..", RegexOptions.None, new string[] { "aa", "" } }; // negative lookahead + yield return new object[] { engine, null, @"\w\w(?!(\d)\d)", "aa.3", 
RegexOptions.None, new string[] { "aa", "" } }; // negative lookahead + // Quantifiers yield return new object[] { engine, null, @"a*", "", RegexOptions.None, new string[] { "" } }; yield return new object[] { engine, null, @"a*", "a", RegexOptions.None, new string[] { "a" } }; diff --git a/src/libraries/System.Text.RegularExpressions/tests/Regex.KnownPattern.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/Regex.KnownPattern.Tests.cs index 7aefe06de78b7..b53526645705f 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/Regex.KnownPattern.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/Regex.KnownPattern.Tests.cs @@ -504,8 +504,6 @@ public async Task Docs_GroupingConstructs_NonbacktrackingSubexpressions(RegexEng Regex rBack = await RegexHelpers.GetRegexAsync(engine, @"(\w)\1+.\b"); Regex rNoBack = await RegexHelpers.GetRegexAsync(engine, @"(?>(\w)\1+).\b"); - string[] inputs = { "aaad", "aaaa" }; - Match back, noback; back = rBack.Match("cccd."); @@ -1117,6 +1115,95 @@ public async Task Docs_Anchors_ContiguousMatches(RegexEngine engine) Regex.Replace(Input, Pattern, m => string.Concat(m.Value.Reverse()))); } + // + // Based on examples from https://blog.stevenlevithan.com/archives/balancing-groups + // + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task Blog_Levithan_BalancingGroups_Palindromes(RegexEngine engine) + { + if (RegexHelpers.IsNonBacktracking(engine)) + { + // balancing groups not supported + return; + } + + Regex r = await RegexHelpers.GetRegexAsync(engine, @"(?.)+.?(?<-N>\k)+(?(N)(?!))"); + + // Palindromes + Assert.All(new[] + { + "kayak", + "racecar", + "never odd or even", + "madam im adam" + }, p => Assert.True(r.IsMatch(p))); + + // Non-Palindromes + Assert.All(new[] + { + "canoe", + "raceboat" + }, p => Assert.False(r.IsMatch(p))); + } + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = 
typeof(RegexHelpers))] + public async Task Blog_Levithan_BalancingGroups_MatchingParentheses(RegexEngine engine) + { + if (RegexHelpers.IsNonBacktracking(engine)) + { + // balancing groups not supported + return; + } + + Regex r = await RegexHelpers.GetRegexAsync(engine, @"^\( + (?> + [^()]+ + | + \( (?) + | + \) (?<-Depth>) + )* + (?(Depth)(?!)) + \)$", RegexOptions.IgnorePatternWhitespace); + + Assert.True(r.IsMatch("()")); + Assert.True(r.IsMatch("(a(b c(de(f(g)hijkl))mn))")); + + Assert.False(r.IsMatch("(")); + Assert.False(r.IsMatch(")")); + Assert.False(r.IsMatch("())")); + Assert.False(r.IsMatch("(()")); + Assert.False(r.IsMatch("(ab(cd)ef")); + } + + [Theory] + [MemberData(nameof(RegexHelpers.AvailableEngines_MemberData), MemberType = typeof(RegexHelpers))] + public async Task Blog_Levithan_BalancingGroups_WordLengthIncreases(RegexEngine engine) + { + if (RegexHelpers.IsNonBacktracking(engine)) + { + // balancing groups not supported + return; + } + + Regex r = await RegexHelpers.GetRegexAsync(engine, @"^(?: + (?(A)\s|) + (?) + (?\w)+ (?(B)(?!)) + (?: + \s + (?) + (?\w)+ (?(C)(?!)) + (?) + )? 
+ )+ \b$", RegexOptions.IgnorePatternWhitespace); + + Assert.True(r.IsMatch("a bc def ghij klmni")); + Assert.False(r.IsMatch("a bc def ghi klmn")); + } // // These patterns come from real-world customer usages diff --git a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs index 47ece73defcc7..3c77f7110b073 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/RegexGeneratorHelper.netcoreapp.cs @@ -111,8 +111,8 @@ internal static async Task SourceGenRegexAsync( if (generatorResults.Diagnostics.Length != 0) { throw new ArgumentException( - string.Join(Environment.NewLine, generatorResults.Diagnostics) + Environment.NewLine + - string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString())))); + string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine + + string.Join(Environment.NewLine, generatorResults.Diagnostics)); } // Compile the assembly to a stream @@ -122,8 +122,8 @@ internal static async Task SourceGenRegexAsync( if (!results.Success || results.Diagnostics.Length != 0) { throw new ArgumentException( - string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorResults.Diagnostics)) + Environment.NewLine + - string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString())))); + string.Join(Environment.NewLine, generatorResults.GeneratedTrees.Select(t => NumberLines(t.ToString()))) + Environment.NewLine + + string.Join(Environment.NewLine, results.Diagnostics.Concat(generatorResults.Diagnostics))); } dll.Position = 0;