diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 87a1e7cf21213..7c588f0c3dbd2 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -750,7 +750,7 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, int labelCounter = 0; string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}"; - void MarkLabel(string label) => writer.WriteLine($"{label}:"); + void MarkLabel(string label, bool addEmptyStatement = false) => writer.WriteLine($"{label}:{(addEmptyStatement ? " ;" : "")}"); void Goto(string label) => writer.WriteLine($"goto {label};"); string doneLabel = "NoMatch"; string originalDoneLabel = doneLabel; @@ -772,17 +772,11 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm, // Emit failure writer.WriteLine("// No match"); - MarkLabel(originalDoneLabel); + MarkLabel(originalDoneLabel, !expressionHasCaptures); if (expressionHasCaptures) { EmitUncaptureUntil("0"); } - else - { - // We can't have a label at the end of the method, so explicitly - // add a "return;" if the End label would otherwise be an issue. - writer.WriteLine("return;"); - } return; static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0; @@ -1174,15 +1168,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitAtomicNodeLoop(node); break; + case RegexNode.Onelazy: + case RegexNode.Notonelazy: + case RegexNode.Setlazy: case RegexNode.Lazyloop: - // An atomic lazy loop amounts to doing the minimum amount of work possible. - // That means iterating as little as is required, which means a repeater - // for the min, and if min is 0, doing nothing. - Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic)); - if (node.M > 0) - { - EmitNodeRepeater(node); - } + EmitLazy(node, emitLengthChecksIfRequired); break; case RegexNode.Alternate: @@ -1195,12 +1185,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); break; - case RegexNode.Onelazy: - case RegexNode.Notonelazy: - case RegexNode.Setlazy: - EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); - break; - case RegexNode.Concatenate: EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; @@ -1682,17 +1666,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL // It's left pointing to the backtracking label for everything subsequent in the expression. } - void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true) { + bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily; + // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking, // as the lazy itself failed to match. if (node.M > 0) { - EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + if (isSingleChar) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + } + else + { + EmitNodeRepeater(node); + } } - // If the whole thing was actually that repeater, we're done. - if (node.M == node.N) + // If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic + // lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum. + if (node.M == node.N || node.Next is { Type: RegexNode.Atomic }) { return; } @@ -1762,7 +1756,15 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL // for the next time we backtrack. writer.WriteLine($"runtextpos = {nextPos};"); LoadTextSpanLocal(writer); - EmitSingleChar(node); + if (isSingleChar) + { + EmitSingleChar(node); + } + else + { + writer.WriteLine(); + EmitNode(node.Child(0)); + } TransferTextSpanPosToRunTextPos(); writer.WriteLine($"{nextPos} = runtextpos;"); @@ -1772,7 +1774,7 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes writer.WriteLine(); - MarkLabel(endLoopLabel); + MarkLabel(endLoopLabel, addEmptyStatement: true); // We explicitly do not reset doneLabel back to originalDoneLabel. // It's left pointing to the backtracking label for everything subsequent in the expression. diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index 3a57f1840516c..054df1c5c33d9 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -2094,15 +2094,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitAtomicNodeLoop(node); break; + case RegexNode.Onelazy: + case RegexNode.Notonelazy: + case RegexNode.Setlazy: case RegexNode.Lazyloop: - // An atomic lazy loop amounts to doing the minimum amount of work possible. - // That means iterating as little as is required, which means a repeater - // for the min, and if min is 0, doing nothing. - Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic)); - if (node.M > 0) - { - EmitNodeRepeater(node); - } + EmitLazy(node, emitLengthChecksIfRequired); break; case RegexNode.Atomic: @@ -2119,12 +2115,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired); break; - case RegexNode.Onelazy: - case RegexNode.Notonelazy: - case RegexNode.Setlazy: - EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired); - break; - case RegexNode.Concatenate: EmitConcatenation(node, subsequent, emitLengthChecksIfRequired); break; @@ -2558,17 +2548,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL MarkLabel(endLoop); } - void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true) + void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true) { + bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily; + // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking, // as the lazy itself failed to match. if (node.M > 0) { - EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + if (isSingleChar) + { + EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired); + } + else + { + EmitNodeRepeater(node); + } } - // If the whole thing was actually that repeater, we're done. - if (node.M == node.N) + // If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic + // lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum. + if (node.M == node.N || node.Next is { Type: RegexNode.Atomic }) { return; } @@ -2657,7 +2657,14 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL Ldloc(nextPos); Stloc(runtextposLocal); LoadTextSpanLocal(); - EmitSingleChar(node); + if (isSingleChar) + { + EmitSingleChar(node); + } + else + { + EmitNode(node.Child(0)); + } TransferTextSpanPosToRunTextPos(); Ldloc(runtextposLocal); Stloc(nextPos); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs index 88a2800f8f96b..d53ddaf082c2e 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexNode.cs @@ -407,14 +407,6 @@ internal RegexNode FinalOptimize() } } - // Optimization: Unnecessary root atomic. - // If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing - // to backtrack into it, so we can remove it. - while (rootNode.Child(0).Type == Atomic) - { - rootNode.ReplaceChild(0, rootNode.Child(0).Child(0)); - } - // Done optimizing. Return the final tree. #if DEBUG rootNode.ValidateFinalTreeInvariants(); @@ -2250,42 +2242,30 @@ internal bool SupportsSimplifiedCodeGenerationImplementation() case Setlazy: Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type."); supported = M == N || AncestorsAllowBacktracking(Next); - static bool AncestorsAllowBacktracking(RegexNode? node) - { - while (node is not null) - { - switch (node.Type) - { - case Concatenate: - case Capture: - case Atomic: - node = node.Next; - break; - - default: - return false; - } - } - - return true; - } break; - // {Lazy}Loop repeaters are the same, except their child also needs to be supported. + // Loop repeaters are the same, except their child also needs to be supported. // We also support such loops being atomic. case Loop: - case Lazyloop: supported = (M == N || (Next != null && Next.Type == Atomic)) && Child(0).SupportsSimplifiedCodeGenerationImplementation(); break; - // We can handle atomic as long as we can handle making its child atomic, or - // its child doesn't have that concept. - case Atomic: + // Similarly, as long as the wrapped node supports simplified code gen, + // Lazy is supported if it's a repeater or atomic, but also if it's in + // a place where backtracking is allowed (e.g. it's top-level). + case Lazyloop: + supported = + (M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) && + Child(0).SupportsSimplifiedCodeGenerationImplementation(); + break; + + // We can handle atomic as long as its child is supported. // Lookahead assertions also only require that the child node be supported. // The RightToLeft check earlier is important to differentiate lookbehind, // which is not supported. + case Atomic: case Require: case Prevent: supported = Child(0).SupportsSimplifiedCodeGenerationImplementation(); @@ -2370,6 +2350,26 @@ static bool AncestorsAllowBacktracking(RegexNode? node) } #endif return supported; + + static bool AncestorsAllowBacktracking(RegexNode? node) + { + while (node is not null) + { + switch (node.Type) + { + case Concatenate: + case Capture: + case Atomic: + node = node.Next; + break; + + default: + return false; + } + } + + return true; + } } /// Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.