Skip to content

Commit

Permalink
Expand One/Notone/Setlazy support to Lazy
Browse files Browse the repository at this point in the history
The logic added in a recent PR to support simplified code generation for a Regex with {One/Notone/Set}lazy nodes is almost sufficient to support Lazy as well.  This fills the gap.  It also deletes an erroneous optimization added in .NET 5 that removed top-level Atomic nodes; the idea behind it was that such nodes are meaningless as, at the top-level, nothing can backtrack in anyway, but it then means that any node which is paying attention to whether its parent is Atomic may no longer find it (and that's needed by Lazy).
  • Loading branch information
stephentoub committed Nov 19, 2021
1 parent 779c498 commit 854795b
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 79 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,7 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm,

int labelCounter = 0;
string DefineLabel(string prefix = "L") => $"{prefix}{labelCounter++}";
void MarkLabel(string label) => writer.WriteLine($"{label}:");
void MarkLabel(string label, bool addEmptyStatement = false) => writer.WriteLine($"{label}:{(addEmptyStatement ? " ;" : "")}");
void Goto(string label) => writer.WriteLine($"goto {label};");
string doneLabel = "NoMatch";
string originalDoneLabel = doneLabel;
Expand All @@ -772,17 +772,11 @@ private static void EmitSimplifiedGo(IndentedTextWriter writer, RegexMethod rm,

// Emit failure
writer.WriteLine("// No match");
MarkLabel(originalDoneLabel);
MarkLabel(originalDoneLabel, !expressionHasCaptures);
if (expressionHasCaptures)
{
EmitUncaptureUntil("0");
}
else
{
// We can't have a label at the end of the method, so explicitly
// add a "return;" if the End label would otherwise be an issue.
writer.WriteLine("return;");
}
return;

static bool IsCaseInsensitive(RegexNode node) => (node.Options & RegexOptions.IgnoreCase) != 0;
Expand Down Expand Up @@ -1174,15 +1168,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitAtomicNodeLoop(node);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
case RegexNode.Lazyloop:
// An atomic lazy loop amounts to doing the minimum amount of work possible.
// That means iterating as little as is required, which means a repeater
// for the min, and if min is 0, doing nothing.
Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic));
if (node.M > 0)
{
EmitNodeRepeater(node);
}
EmitLazy(node, emitLengthChecksIfRequired);
break;

case RegexNode.Alternate:
Expand All @@ -1195,12 +1185,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
break;
Expand Down Expand Up @@ -1682,17 +1666,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
// It's left pointing to the backtracking label for everything subsequent in the expression.
}

void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
{
bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily;

// Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
// as the lazy itself failed to match.
if (node.M > 0)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
if (isSingleChar)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
}
else
{
EmitNodeRepeater(node);
}
}

// If the whole thing was actually that repeater, we're done.
if (node.M == node.N)
// If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic
// lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum.
if (node.M == node.N || node.Next is { Type: RegexNode.Atomic })
{
return;
}
Expand Down Expand Up @@ -1762,7 +1756,15 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
// for the next time we backtrack.
writer.WriteLine($"runtextpos = {nextPos};");
LoadTextSpanLocal(writer);
EmitSingleChar(node);
if (isSingleChar)
{
EmitSingleChar(node);
}
else
{
writer.WriteLine();
EmitNode(node.Child(0));
}
TransferTextSpanPosToRunTextPos();
writer.WriteLine($"{nextPos} = runtextpos;");

Expand All @@ -1772,7 +1774,7 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes

writer.WriteLine();
MarkLabel(endLoopLabel);
MarkLabel(endLoopLabel, addEmptyStatement: true);

// We explicitly do not reset doneLabel back to originalDoneLabel.
// It's left pointing to the backtracking label for everything subsequent in the expression.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2094,15 +2094,11 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitAtomicNodeLoop(node);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
case RegexNode.Lazyloop:
// An atomic lazy loop amounts to doing the minimum amount of work possible.
// That means iterating as little as is required, which means a repeater
// for the min, and if min is 0, doing nothing.
Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic));
if (node.M > 0)
{
EmitNodeRepeater(node);
}
EmitLazy(node, emitLengthChecksIfRequired);
break;

case RegexNode.Atomic:
Expand All @@ -2119,12 +2115,6 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
EmitSingleCharLoop(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
EmitConcatenation(node, subsequent, emitLengthChecksIfRequired);
break;
Expand Down Expand Up @@ -2558,17 +2548,27 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
MarkLabel(endLoop);
}

void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
void EmitLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
{
bool isSingleChar = node.IsOneFamily || node.IsNotoneFamily || node.IsSetFamily;

// Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
// as the lazy itself failed to match.
if (node.M > 0)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
if (isSingleChar)
{
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
}
else
{
EmitNodeRepeater(node);
}
}

// If the whole thing was actually that repeater, we're done.
if (node.M == node.N)
// If the whole thing was actually that repeater, we're done. Similarly, if this is actually an atomic
// lazy loop, nothing will ever backtrack into this node, so we never need to iterate more than the minimum.
if (node.M == node.N || node.Next is { Type: RegexNode.Atomic })
{
return;
}
Expand Down Expand Up @@ -2657,7 +2657,14 @@ void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitL
Ldloc(nextPos);
Stloc(runtextposLocal);
LoadTextSpanLocal();
EmitSingleChar(node);
if (isSingleChar)
{
EmitSingleChar(node);
}
else
{
EmitNode(node.Child(0));
}
TransferTextSpanPosToRunTextPos();
Ldloc(runtextposLocal);
Stloc(nextPos);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -407,14 +407,6 @@ internal RegexNode FinalOptimize()
}
}

// Optimization: Unnecessary root atomic.
// If the root node under the implicit Capture is an Atomic, the Atomic is useless as there's nothing
// to backtrack into it, so we can remove it.
while (rootNode.Child(0).Type == Atomic)
{
rootNode.ReplaceChild(0, rootNode.Child(0).Child(0));
}

// Done optimizing. Return the final tree.
#if DEBUG
rootNode.ValidateFinalTreeInvariants();
Expand Down Expand Up @@ -2250,42 +2242,30 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
case Setlazy:
Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type.");
supported = M == N || AncestorsAllowBacktracking(Next);
static bool AncestorsAllowBacktracking(RegexNode? node)
{
while (node is not null)
{
switch (node.Type)
{
case Concatenate:
case Capture:
case Atomic:
node = node.Next;
break;

default:
return false;
}
}

return true;
}
break;

// {Lazy}Loop repeaters are the same, except their child also needs to be supported.
// Loop repeaters are the same, except their child also needs to be supported.
// We also support such loops being atomic.
case Loop:
case Lazyloop:
supported =
(M == N || (Next != null && Next.Type == Atomic)) &&
Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// We can handle atomic as long as we can handle making its child atomic, or
// its child doesn't have that concept.
case Atomic:
// Similarly, as long as the wrapped node supports simplified code gen,
// Lazy is supported if it's a repeater or atomic, but also if it's in
// a place where backtracking is allowed (e.g. it's top-level).
case Lazyloop:
supported =
(M == N || (Next != null && Next.Type == Atomic) || AncestorsAllowBacktracking(Next)) &&
Child(0).SupportsSimplifiedCodeGenerationImplementation();
break;

// We can handle atomic as long as its child is supported.
// Lookahead assertions also only require that the child node be supported.
// The RightToLeft check earlier is important to differentiate lookbehind,
// which is not supported.
case Atomic:
case Require:
case Prevent:
supported = Child(0).SupportsSimplifiedCodeGenerationImplementation();
Expand Down Expand Up @@ -2370,6 +2350,26 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
}
#endif
return supported;

static bool AncestorsAllowBacktracking(RegexNode? node)
{
while (node is not null)
{
switch (node.Type)
{
case Concatenate:
case Capture:
case Atomic:
node = node.Next;
break;

default:
return false;
}
}

return true;
}
}

/// <summary>Gets whether the node is a Set/Setloop/Setloopatomic/Setlazy node.</summary>
Expand Down

0 comments on commit 854795b

Please sign in to comment.