Skip to content

Commit

Permalink
Add single char lazy loop support to simplified code gen
Browse files Browse the repository at this point in the history
  • Loading branch information
stephentoub committed Nov 18, 2021
1 parent ae570bc commit b051c2c
Show file tree
Hide file tree
Showing 3 changed files with 223 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1198,7 +1198,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
Expand Down Expand Up @@ -1615,7 +1615,6 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
string endLoop = DefineLabel("EndLoop");
string startingPos = NextLocalName("startingRunTextPos");
string endingPos = NextLocalName("endingRunTextPos");
string crawlPos = NextLocalName("crawlPos");

// We're about to enter a loop, so ensure our text position is 0.
TransferTextSpanPosToRunTextPos();
Expand All @@ -1629,7 +1628,12 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
EmitSingleCharAtomicLoop(node);
TransferTextSpanPosToRunTextPos();
writer.WriteLine($"int {endingPos} = runtextpos;");
writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
string? crawlPos = null;
if (expressionHasCaptures)
{
crawlPos = NextLocalName("crawlPos");
writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
}
if (node.M > 0)
{
writer.WriteLine($"{startingPos} += {node.M};");
Expand Down Expand Up @@ -1678,6 +1682,102 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
// It's left pointing to the backtracking label for everything subsequent in the expression.
}

// Emits the code to handle a lazy loop over a single character (Onelazy, Notonelazy, Setlazy).
// The minimum node.M iterations are emitted as a fixed repeater.  Any further iterations, up to
// node.N, are matched one at a time from a backtracking section that later code jumps to on failure.
//   node: the lazy single-character loop node; node.M / node.N are the min / max iteration counts.
//   subsequent: not consulted by this implementation.
//   emitLengthChecksIfRequired: forwarded to EmitSingleCharFixedRepeater for the minimum iterations.
// Side effect: when node.M < node.N, doneLabel is left set to this loop's backtracking label.
void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
    // Emit the min iterations as a repeater.  Any failures here don't necessitate backtracking,
    // as the lazy itself failed to match.
    if (node.M > 0)
    {
        EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
    }

    // If the whole thing was actually that repeater, we're done.
    if (node.M == node.N)
    {
        return;
    }

    Debug.Assert(node.M < node.N);

    // We now need to match one character at a time, each time allowing the remainder of the expression
    // to try to match, and only matching another character if the subsequent expression fails to match.

    // We're about to enter a loop, so ensure our text position is 0.
    TransferTextSpanPosToRunTextPos();

    // If the loop isn't unbounded, track the number of iterations and the max number to allow.
    // Both names are assigned together, so testing either one for null is sufficient later on.
    string? iterationCount = null;
    string? maxIterations = null;
    if (node.N != int.MaxValue)
    {
        iterationCount = NextLocalName("i");
        maxIterations = NextLocalName("maxIterations");
        writer.WriteLine($"int {iterationCount} = 0;");
        writer.WriteLine($"int {maxIterations} = {node.N - node.M};");
    }

    // Track the current crawl position.  Upon backtracking, we'll unwind any captures beyond this point.
    // crawlPos is non-null exactly when the expression has captures.
    string? crawlPos = null;
    if (expressionHasCaptures)
    {
        crawlPos = NextLocalName("crawlPos");
        writer.WriteLine($"int {crawlPos} = base.Crawlpos();");
    }

    // Track the current runtextpos.  Each time we backtrack, we'll reset to the stored position, which
    // is also incremented each time we match another character in the loop.
    string nextPos = NextLocalName("nextPos");
    writer.WriteLine($"int {nextPos} = runtextpos;");

    // Skip the backtracking section for the initial subsequent matching.  We've already matched the
    // minimum number of iterations, which means we can successfully match with zero additional iterations.
    string endLoopLabel = DefineLabel("endLoop");
    writer.WriteLine($"goto {endLoopLabel};");
    writer.WriteLine();

    // Backtracking section.  Subsequent failures will jump to here.
    string backtrackingLabel = DefineLabel("Backtrack");
    MarkLabel(backtrackingLabel);

    // Uncapture any captures if the expression has any.  It's possible the captures it has
    // are before this node, in which case this is wasted effort, but still functionally correct.
    if (crawlPos is not null)
    {
        EmitUncaptureUntil(crawlPos);
    }

    // If there's a max number of iterations, see if we've already matched the maximum number of
    // additional characters.  If we have, the loop fails; otherwise, increment the iteration count.
    if (maxIterations is not null)
    {
        using (EmitBlock(writer, $"if ({iterationCount} >= {maxIterations})"))
        {
            // doneLabel still holds the label that was current on entry to this method.
            writer.WriteLine($"goto {doneLabel};");
        }
        writer.WriteLine($"{iterationCount}++;");
    }

    // Now match the next character in the lazy loop.  We need to reset the runtextpos to the position
    // just after the last character in this loop was matched, and we need to store the resulting position
    // for the next time we backtrack.
    writer.WriteLine($"runtextpos = {nextPos};");
    LoadTextSpanLocal(writer);
    EmitSingleChar(node);
    TransferTextSpanPosToRunTextPos();
    writer.WriteLine($"{nextPos} = runtextpos;");

    // Update the done label for everything that comes after this node.  This is done after we emit the single char
    // matching, as that failing indicates the loop itself has failed to match.
    doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes

    writer.WriteLine();
    MarkLabel(endLoopLabel);

    // We explicitly do not restore doneLabel to the value it had on entry.
    // It's left pointing to the backtracking label for everything subsequent in the expression.
}

// Emits the code to handle a loop (repeater) with a fixed number of iterations.
// RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthCheck = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2122,7 +2122,7 @@ void EmitNode(RegexNode node, RegexNode? subsequent = null, bool emitLengthCheck
case RegexNode.Onelazy:
case RegexNode.Notonelazy:
case RegexNode.Setlazy:
EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
EmitSingleCharLazy(node, subsequent, emitLengthChecksIfRequired);
break;

case RegexNode.Concatenate:
Expand Down Expand Up @@ -2558,6 +2558,121 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
MarkLabel(endLoop);
}

// Emits the IL to handle a lazy loop over a single character (Onelazy, Notonelazy, Setlazy).
// The minimum node.M iterations are emitted as a fixed repeater.  Any further iterations, up to
// node.N, are matched one at a time from a backtracking section that later code branches to on failure.
//   node: the lazy single-character loop node; node.M / node.N are the min / max iteration counts.
//   subsequent: not consulted by this implementation.
//   emitLengthChecksIfRequired: forwarded to EmitSingleCharFixedRepeater for the minimum iterations.
// Side effect: when node.M < node.N, doneLabel is left set to this loop's backtracking label.
void EmitSingleCharLazy(RegexNode node, RegexNode? subsequent = null, bool emitLengthChecksIfRequired = true)
{
    // Emit the min iterations as a repeater.  Any failures here don't necessitate backtracking,
    // as the lazy itself failed to match.
    if (node.M > 0)
    {
        EmitSingleCharFixedRepeater(node, emitLengthChecksIfRequired);
    }

    // If the whole thing was actually that repeater, we're done.
    if (node.M == node.N)
    {
        return;
    }

    Debug.Assert(node.M < node.N);

    // We now need to match one character at a time, each time allowing the remainder of the expression
    // to try to match, and only matching another character if the subsequent expression fails to match.

    // We're about to enter a loop, so ensure our text position is 0.
    TransferTextSpanPosToRunTextPos();

    // If the loop isn't unbounded, track the number of iterations and the max number to allow.
    // Both locals are declared together, so they are either both null or both non-null.
    LocalBuilder? iterationCount = null;
    LocalBuilder? maxIterations = null;
    if (node.N != int.MaxValue)
    {
        // int iterationCount = 0;
        // int maxIterations = node.N - node.M;
        iterationCount = DeclareInt32();
        maxIterations = DeclareInt32();
        Ldc(0);
        Stloc(iterationCount);
        Ldc(node.N - node.M);
        Stloc(maxIterations);
    }

    // Track the current crawl position.  Upon backtracking, we'll unwind any captures beyond this point.
    // crawlPos is non-null exactly when the expression has captures.
    LocalBuilder? crawlPos = null;
    if (expressionHasCaptures)
    {
        // int crawlPos = base.Crawlpos();
        crawlPos = DeclareInt32();
        Ldthis();
        Call(s_crawlposMethod);
        Stloc(crawlPos);
    }

    // Track the current runtextpos.  Each time we backtrack, we'll reset to the stored position, which
    // is also incremented each time we match another character in the loop.
    // int nextPos = runtextpos;
    LocalBuilder nextPos = DeclareInt32();
    Ldloc(runtextposLocal);
    Stloc(nextPos);

    // Skip the backtracking section for the initial subsequent matching.  We've already matched the
    // minimum number of iterations, which means we can successfully match with zero additional iterations.
    // goto endLoopLabel;
    Label endLoopLabel = DefineLabel();
    BrFar(endLoopLabel);

    // Backtracking section.  Subsequent failures will jump to here.
    Label backtrackingLabel = DefineLabel();
    MarkLabel(backtrackingLabel);

    // Uncapture any captures if the expression has any.  It's possible the captures it has
    // are before this node, in which case this is wasted effort, but still functionally correct.
    if (crawlPos is not null)
    {
        EmitUncaptureUntil(crawlPos);
    }

    // If there's a max number of iterations, see if we've already matched the maximum number of
    // additional characters.  If we have, the loop fails; otherwise, increment the iteration count.
    if (iterationCount is not null && maxIterations is not null)
    {
        // if (iterationCount >= maxIterations) goto doneLabel;
        // doneLabel still holds the label that was current on entry to this method.
        Ldloc(iterationCount);
        Ldloc(maxIterations);
        BgeFar(doneLabel);

        // iterationCount++;
        Ldloc(iterationCount);
        Ldc(1);
        Add();
        Stloc(iterationCount);
    }

    // Now match the next character in the lazy loop.  We need to reset the runtextpos to the position
    // just after the last character in this loop was matched, and we need to store the resulting position
    // for the next time we backtrack.

    // runtextpos = nextPos;
    // MatchSingleChar();
    // nextPos = runtextpos;
    Ldloc(nextPos);
    Stloc(runtextposLocal);
    LoadTextSpanLocal();
    EmitSingleChar(node);
    TransferTextSpanPosToRunTextPos();
    Ldloc(runtextposLocal);
    Stloc(nextPos);

    // Update the done label for everything that comes after this node.  This is done after we emit the single char
    // matching, as that failing indicates the loop itself has failed to match.
    doneLabel = backtrackingLabel; // leave set to the backtracking label for all subsequent nodes

    MarkLabel(endLoopLabel);

    // We explicitly do not restore doneLabel to the value it had on entry.
    // It's left pointing to the backtracking label for everything subsequent in the expression.
}

// Emits the code to handle a loop (repeater) with a fixed number of iterations.
// RegexNode.M is used for the number of iterations; RegexNode.N is ignored.
void EmitSingleCharFixedRepeater(RegexNode node, bool emitLengthChecksIfRequired = true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2240,11 +2240,14 @@ internal bool SupportsSimplifiedCodeGenerationImplementation()
supported = true;
break;

// Single character greedy loops are supported if they're either they're actually a repeater
// Single character greedy/lazy loops are supported if either they're actually a repeater
// or they're not contained in any construct other than simple nesting (e.g. concat, capture).
case Oneloop:
case Notoneloop:
case Setloop:
case Onelazy:
case Notonelazy:
case Setlazy:
Debug.Assert(Next == null || Next.Type != Atomic, "Loop should have been transformed into an atomic type.");
supported = M == N || AncestorsAllowBacktracking(Next);
static bool AncestorsAllowBacktracking(RegexNode? node)
Expand All @@ -2268,12 +2271,6 @@ static bool AncestorsAllowBacktracking(RegexNode? node)
}
break;

case Onelazy:
case Notonelazy:
case Setlazy:
supported = M == N || (Next != null && Next.Type == Atomic);
break;

// {Lazy}Loop repeaters are the same, except their child also needs to be supported.
// We also support such loops being atomic.
case Loop:
Expand Down

0 comments on commit b051c2c

Please sign in to comment.