Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add single char lazy loop support to simplified Regex code gen #61698

Merged
merged 2 commits into from
Nov 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1198,7 +1198,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
Expand Down Expand Up @@ -1615,7 +1615,6 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
string endLoop = DefineLabel("EndLoop");
string startingPos = NextLocalName("startingRunTextPos");
string endingPos = NextLocalName("endingRunTextPos");
string crawlPos = NextLocalName("crawlPos");

// We're about to enter a loop, so ensure our text position is 0.
TransferTextSpanPosToRunTextPos();
Expand All @@ -1629,7 +1628,12 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
EmitSingleCharAtomicLoop(node);
TransferTextSpanPosToRunTextPos();
writer.WriteLine($"int {endingPos} = runtextpos;");
writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
string? crawlPos = null;
if (expressionHasCaptures)
{
crawlPos = NextLocalName("crawlPos");
writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
}
if (node.M > 0)
{
writer.WriteLine($"{startingPos} += {node.M};");
Expand Down Expand Up @@ -1678,6 +1682,102 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
// It's left pointing to the backtracking label for everything subsequent in the expression.
}

// Emits the code to handle a single-character lazy loop (RegexNode.Onelazy, Notonelazy, Setlazy).
// The first node.M iterations are emitted as a fixed repeater. When node.M < node.N, a backtracking
// section is emitted after it: each time the code emitted after this node fails and jumps back here,
// one more character is matched (up to node.N - node.M additional characters) before retrying.
//   node                       - the lazy loop node; M is the minimum and N the maximum iteration count.
//   subsequent                 - not read by this method.
//   emitLengthChecksIfRequired - passed through to EmitSingleCharFixedRepeater for the minimum iterations.
// When node.M < node.N, doneLabel is left set to this loop's backtracking label on return.
void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
    // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
    // as the lazy itself failed to match.
    if (node.M > 0)
    {
        EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
    }

    // If the whole thing was actually that repeater, we're done.
    if (node.M == node.N)
    {
        return;
    }

    Debug.Assert(node.M < node.N);

    // We now need to match one character at a time, each time allowing the remainder of the expression
    // to try to match, and only matching another character if the subsequent expression fails to match.

    // We're about to enter a loop, so ensure our text position is 0.
    TransferTextSpanPosToRunTextPos();

    // If the loop isn't unbounded, track the number of iterations and the max number to allow.
    // The max only counts the optional iterations, as the first node.M were emitted above.
    string? iterationCount = null;
    string? maxIterations = null;
    if (node.N != int.MaxValue)
    {
        iterationCount = NextLocalName("i");
        maxIterations = NextLocalName("maxIterations");
        writer.WriteLine($"int {iterationCount} = 0;");
        writer.WriteLine($"int {maxIterations} = {node.N - node.M};");
    }

    // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
    string? crawlPos = null;
    if (expressionHasCaptures)
    {
        crawlPos = NextLocalName("crawlPos");
        writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
    }

    // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which
    // is also incremented each time we match another character in the loop.
    string nextPos = NextLocalName("nextPos");
    writer.WriteLine($"int {nextPos} = runtextpos;");

    // Skip the backtracking section for the initial subsequent matching. We've already matched the
    // minimum number of iterations, which means we can successfully match with zero additional iterations.
    string endLoopLabel = DefineLabel("endLoop");
    writer.WriteLine($"goto {endLoopLabel};");
    writer.WriteLine();

    // Backtracking section. Subsequent failures will jump to here.
    string backtrackingLabel = DefineLabel("Backtrack");
    MarkLabel(backtrackingLabel);

    // Uncapture any captures if the expression has any. It's possible the captures it has
    // are before this node, in which case this is wasted effort, but still functionally correct.
    // crawlPos is non-null exactly when expressionHasCaptures was true above; testing the local
    // directly lets the compiler see that the argument is not null.
    if (crawlPos is not null)
    {
        EmitUncaptureUntil(crawlPos);
    }

    // If there's a max number of iterations, see if we've exceeded the maximum number of characters
    // to match. If we haven't, increment the iteration count.
    if (maxIterations is not null)
    {
        // doneLabel still refers to the label that was current on entry to this method,
        // so exhausting the iterations fails the loop as a whole.
        using (EmitBlock(writer, $"if ({iterationCount} >= {maxIterations})"))
        {
            writer.WriteLine($"goto {doneLabel};");
        }
        writer.WriteLine($"{iterationCount}++;");
    }

    // Now match the next character in the lazy loop. We need to reset the runtextpos to the position
    // just after the last character in this loop was matched, and we need to store the resulting position
    // for the next time we backtrack.
    writer.WriteLine($"runtextpos = {nextPos};");
    LoadTextSpanLocal(writer);
    EmitSingleChar(node);
    TransferTextSpanPosToRunTextPos();
    writer.WriteLine($"{nextPos} = runtextpos;");

    // Update the done label for everything that comes after this node. This is done after we emit the single char
    // matching, as that failing indicates the loop itself has failed to match.
    doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes

    writer.WriteLine();
    MarkLabel(endLoopLabel);

    // We explicitly do not restore doneLabel to the value it had on entry.
    // It's left pointing to the backtracking label for everything subsequent in the expression.
}

// Emits the code to handle a loop (repeater) with a fixed number of iterations.
// RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2122,7 +2122,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
Expand Down Expand Up @@ -2558,6 +2558,121 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
MarkLabel(endLoop);
}

// Emits the IL to handle a single-character lazy loop (RegexNode.Onelazy, Notonelazy, Setlazy).
// The first node.M iterations are emitted as a fixed repeater. When node.M < node.N, a backtracking
// section is emitted after it: each time the code emitted after this node fails and branches back here,
// one more character is matched (up to node.N - node.M additional characters) before retrying.
//   node                       - the lazy loop node; M is the minimum and N the maximum iteration count.
//   subsequent                 - not read by this method.
//   emitLengthChecksIfRequired - passed through to EmitSingleCharFixedRepeater for the minimum iterations.
// When node.M < node.N, doneLabel is left set to this loop's backtracking label on return.
void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
    // Emit the min iterations as a repeater. Any failures here don't necessitate backtracking,
    // as the lazy itself failed to match.
    if (node.M > 0)
    {
        EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
    }

    // If the whole thing was actually that repeater, we're done.
    if (node.M == node.N)
    {
        return;
    }

    Debug.Assert(node.M < node.N);

    // We now need to match one character at a time, each time allowing the remainder of the expression
    // to try to match, and only matching another character if the subsequent expression fails to match.

    // We're about to enter a loop, so ensure our text position is 0.
    TransferTextSpanPosToRunTextPos();

    // If the loop isn't unbounded, track the number of iterations and the max number to allow.
    // The max only counts the optional iterations, as the first node.M were emitted above.
    LocalBuilder? iterationCount = null;
    LocalBuilder? maxIterations = null;
    if (node.N != int.MaxValue)
    {
        // int iterationCount = 0;
        // int maxIterations = node.N - node.M;
        iterationCount = DeclareInt32();
        maxIterations = DeclareInt32();
        Ldc(0);
        Stloc(iterationCount);
        Ldc(node.N - node.M);
        Stloc(maxIterations);
    }

    // Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
    LocalBuilder? crawlPos = null;
    if (expressionHasCaptures)
    {
        // int crawlPos = base.Crawlpos();
        crawlPos = DeclareInt32();
        Ldthis();
        Call(s_crawlposMethod);
        Stloc(crawlPos);
    }

    // Track the current runtextpos. Each time we backtrack, we'll reset to the stored position, which
    // is also incremented each time we match another character in the loop.
    // int nextPos = runtextpos;
    LocalBuilder nextPos = DeclareInt32();
    Ldloc(runtextposLocal);
    Stloc(nextPos);

    // Skip the backtracking section for the initial subsequent matching. We've already matched the
    // minimum number of iterations, which means we can successfully match with zero additional iterations.
    // goto endLoopLabel;
    Label endLoopLabel = DefineLabel();
    BrFar(endLoopLabel);

    // Backtracking section. Subsequent failures will jump to here.
    Label backtrackingLabel = DefineLabel();
    MarkLabel(backtrackingLabel);

    // Uncapture any captures if the expression has any. It's possible the captures it has
    // are before this node, in which case this is wasted effort, but still functionally correct.
    // crawlPos is non-null exactly when expressionHasCaptures was true above; testing the local
    // directly narrows its type without a null-forgiving operator.
    if (crawlPos is not null)
    {
        EmitUncaptureUntil(crawlPos);
    }

    // If there's a max number of iterations, see if we've exceeded the maximum number of characters
    // to match. If we haven't, increment the iteration count.
    // Both locals are declared together above, so they are either both null or both non-null.
    if (iterationCount is not null && maxIterations is not null)
    {
        // if (iterationCount >= maxIterations) goto doneLabel;
        // doneLabel still refers to the label that was current on entry to this method,
        // so exhausting the iterations fails the loop as a whole.
        Ldloc(iterationCount);
        Ldloc(maxIterations);
        BgeFar(doneLabel);

        // iterationCount++;
        Ldloc(iterationCount);
        Ldc(1);
        Add();
        Stloc(iterationCount);
    }

    // Now match the next character in the lazy loop. We need to reset the runtextpos to the position
    // just after the last character in this loop was matched, and we need to store the resulting position
    // for the next time we backtrack.

    // runtextpos = nextPos;
    // MatchSingleChar();
    // nextPos = runtextpos;
    Ldloc(nextPos);
    Stloc(runtextposLocal);
    LoadTextSpanLocal();
    EmitSingleChar(node);
    TransferTextSpanPosToRunTextPos();
    Ldloc(runtextposLocal);
    Stloc(nextPos);

    // Update the done label for everything that comes after this node. This is done after we emit the single char
    // matching, as that failing indicates the loop itself has failed to match.
    doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes

    MarkLabel(endLoopLabel);

    // We explicitly do not restore doneLabel to the value it had on entry.
    // It's left pointing to the backtracking label for everything subsequent in the expression.
}

// Emits the code to handle a loop (repeater) with a fixed number of iterations.
// RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,17 +216,21 @@ private void MakeLoopAtomic()
{
switch (Type)
{
case Oneloop:
Type = Oneloopatomic;
case Oneloop or Notoneloop or Setloop:
// For loops, we simply change the Type to the atomic variant.
// Atomic greedy loops should consume as many values as they can.
Type += Oneloopatomic - Oneloop;
break;
case Notoneloop:
Type = Notoneloopatomic;

case Onelazy or Notonelazy or Setlazy:
// For lazy, we not only change the Type, we also lower the max number of iterations
// to the minimum number of iterations, as they should end up matching as little as possible.
Type += Oneloopatomic - Onelazy;
N = M;
break;

default:
#if DEBUG
Debug.Assert(Type == Setloop, $"Unexpected type: {TypeName}");
#endif
Type = Setloopatomic;
Debug.Fail($"Unexpected type: {Type}");
break;
}
}
Expand Down Expand Up @@ -445,11 +449,15 @@ private void EliminateEndingBacktracking()
{
switch (node.Type)
{
// {One/Notone/Set}loops can be upgraded to {One/Notone/Set}loopatomic nodes,
// e.g. [abc]* => (?>[abc]*)
// {One/Notone/Set}loops can be upgraded to {One/Notone/Set}loopatomic nodes, e.g. [abc]* => (?>[abc]*).
// And {One/Notone/Set}lazys can similarly be upgraded to be atomic, which really makes them into repeaters
// or even empty nodes.
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
node.MakeLoopAtomic();
break;

Expand Down Expand Up @@ -642,11 +650,14 @@ private RegexNode ReduceAtomic()
case Setloopatomic:
return child;

// If an atomic subexpression contains only a {one/notone/set}loop,
// If an atomic subexpression contains only a {one/notone/set}{loop/lazy},
// change it to be a {one/notone/set}loopatomic and remove the atomic node.
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
child.MakeLoopAtomic();
return child;

Expand Down Expand Up @@ -2229,11 +2240,14 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
supported = true;
break;

// Single character greedy loops are supported if they're either they're actually a repeater
// Single character greedy/lazy loops are supported if either they're actually a repeater
// or they're not contained in any construct other than simple nesting (e.g. concat, capture).
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type.");
supported = M == N || AncestorsAllowBacktracking(Next);
static bool AncestorsAllowBacktracking(RegexNode? node)
Expand All @@ -2257,12 +2271,6 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
}
break;

case Onelazy:
case Notonelazy:
case Setlazy:
supported = M == N || (Next != null && Next.Type == Atomic);
break;

// {Lazy}Loop repeaters are the same, except their child also needs to be supported.
// We also support such loops being atomic.
case Loop:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,6 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
{
throw new Xunit.Sdk.EqualException(result2, result1);
}

Assert.NotEqual(GetRegexCodes(new Regex(pattern1, RegexOptions.RightToLeft)), GetRegexCodes(new Regex(pattern2)));
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
}

[Theory]
Expand Down Expand Up @@ -476,9 +474,12 @@ public void PatternsReduceIdentically(string pattern1, string pattern2)
[InlineData("(?:ab|cd|ae)f", "(?>ab|cd|ae)f")]
public void PatternsReduceDifferently(string pattern1, string pattern2)
{
var r1 = new Regex(pattern1);
var r2 = new Regex(pattern2);
Assert.NotEqual(GetRegexCodes(r1), GetRegexCodes(r2));
string result1 = GetRegexCodes(new Regex(pattern1));
string result2 = GetRegexCodes(new Regex(pattern2));
if (result1 == result2)
{
throw new Xunit.Sdk.EqualException(result2, result1);
}
}

[Theory]
Expand Down