Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable regex to use IndexOf(..., OrdinalIgnoreCase) for prefix searching #85438

Merged
merged 3 commits into from
May 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,7 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w
switch (regexTree.FindOptimizations.FindMode)
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOf_LeftToRight();
break;
Expand Down Expand Up @@ -948,14 +949,20 @@ void EmitIndexOf_LeftToRight()
{
RegexFindOptimizations opts = regexTree.FindOptimizations;

string substring = "";
string offset = "";
string offsetDescription = "at the beginning of the pattern";
string substring = "", stringComparison = "", offset = "", offsetDescription = "";

switch (opts.FindMode)
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
substring = regexTree.FindOptimizations.LeadingPrefix;
offsetDescription = "at the beginning of the pattern";
Debug.Assert(!string.IsNullOrEmpty(substring));
break;

case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
substring = regexTree.FindOptimizations.LeadingPrefix;
stringComparison = ", StringComparison.OrdinalIgnoreCase";
offsetDescription = "ordinal case-insensitive at the beginning of the pattern";
Debug.Assert(!string.IsNullOrEmpty(substring));
break;

Expand All @@ -976,7 +983,7 @@ void EmitIndexOf_LeftToRight()

writer.WriteLine($"// The pattern has the literal {Literal(substring)} {offsetDescription}. Find the next occurrence.");
writer.WriteLine($"// If it can't be found, there's no match.");
writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)});");
writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)}{stringComparison});");
using (EmitBlock(writer, "if (i >= 0)"))
{
writer.WriteLine("base.runtextpos = pos + i;");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ internal abstract class RegexCompiler
private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!;
private static readonly MethodInfo s_spanIndexOfChar = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfSpanStringComparison = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<char>), typeof(ReadOnlySpan<char>), typeof(StringComparison) })!;
stephentoub marked this conversation as resolved.
Show resolved Hide resolved
private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char));
Expand Down Expand Up @@ -456,6 +457,7 @@ protected void EmitTryFindNextPossibleStartingPosition()
switch (_regexTree.FindOptimizations.FindMode)
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOf_LeftToRight();
break;
Expand Down Expand Up @@ -745,25 +747,33 @@ bool EmitAnchors()
void EmitIndexOf_LeftToRight()
{
RegexFindOptimizations opts = _regexTree.FindOptimizations;
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);

using RentedLocalBuilder i = RentInt32Local();

// int i = inputSpan.Slice(pos).IndexOf(prefix);
Ldloca(inputSpan);
Ldloc(pos);
if (opts.FindMode == FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
opts.FixedDistanceLiteral is { Distance: > 0 } literal)
{
Ldc(literal.Distance);
Add();
}
Call(s_spanSliceIntMethod);
Ldstr(opts.FindMode == FindNextStartingPositionMode.LeadingString_LeftToRight ?
Ldstr(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
opts.LeadingPrefix :
opts.FixedDistanceLiteral.String!);
Call(s_stringAsSpanMethod);
Call(s_spanIndexOfSpan);
if (opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight)
{
Ldc((int)StringComparison.OrdinalIgnoreCase);
Call(s_spanIndexOfSpanStringComparison);
}
else
{
Call(s_spanIndexOfSpan);
}
Stloc(i);

// if (i < 0) goto ReturnFalse;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
}

// If there's a leading substring, just use IndexOf and inherit all of its optimizations.
string prefix = RegexPrefixAnalyzer.FindPrefix(root);
string? prefix = RegexPrefixAnalyzer.FindPrefix(root);
if (prefix.Length > 1)
{
LeadingPrefix = prefix;
Expand Down Expand Up @@ -126,6 +126,16 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
return;
}

// We're now left-to-right only.

prefix = RegexPrefixAnalyzer.FindPrefixOrdinalCaseInsensitive(root);
if (prefix is { Length: > 1 })
{
LeadingPrefix = prefix;
FindMode = FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight;
return;
}

// We're now left-to-right only and looking for sets.

// Build up a list of all of the sets that are a fixed distance from the start of the expression.
Expand Down Expand Up @@ -547,6 +557,21 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
return false;
}

// There's a case-insensitive prefix. Search for it with ordinal case-insensitive IndexOf.

case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
{
int i = textSpan.Slice(pos).IndexOf(LeadingPrefix.AsSpan(), StringComparison.OrdinalIgnoreCase);
if (i >= 0)
{
pos += i;
return true;
}

pos = textSpan.Length;
return false;
}

// There's a set at the beginning of the pattern. Search for it.

case FindNextStartingPositionMode.LeadingSet_LeftToRight:
Expand Down Expand Up @@ -776,6 +801,8 @@ internal enum FindNextStartingPositionMode
LeadingString_LeftToRight,
/// <summary>A multi-character substring at the beginning of the right-to-left pattern.</summary>
LeadingString_RightToLeft,
/// <summary>A multi-character ordinal case-insensitive substring at the beginning of the pattern.</summary>
LeadingString_OrdinalIgnoreCase_LeftToRight,

/// <summary>A set starting the pattern.</summary>
LeadingSet_LeftToRight,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,31 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
}
}

/// <summary>
/// Walks down the left-most spine of the tree rooted at <paramref name="node"/> until it reaches a
/// concatenation, then asks that concatenation for its leading ordinal case-insensitive string.
/// </summary>
/// <param name="node">The node at which to begin the descent.</param>
/// <returns>
/// The string produced by <c>TryGetOrdinalCaseInsensitiveString</c> for the first concatenation reached,
/// or <see langword="null"/> if the descent hits a node kind it cannot look through before finding one.
/// </returns>
public static string? FindPrefixOrdinalCaseInsensitive(RegexNode node)
{
    RegexNode current = node;

    // Keep descending through wrapper nodes until a concatenation is found.
    while (current.Kind is not RegexNodeKind.Concatenate)
    {
        // Atomic groups and captures are always looked through; loops only when
        // their minimum iteration count (M) is greater than zero.
        bool isTransparentWrapper = current.Kind is RegexNodeKind.Atomic or RegexNodeKind.Capture;
        bool isLoopWithPositiveMinimum = current.Kind is RegexNodeKind.Loop or RegexNodeKind.Lazyloop && current.M > 0;

        if (!isTransparentWrapper && !isLoopWithPositiveMinimum)
        {
            // Any other node kind ends the search without a prefix.
            return null;
        }

        current = current.Child(0);
    }

    // The boolean result is intentionally ignored: the out string is returned as-is.
    current.TryGetOrdinalCaseInsensitiveString(0, current.ChildCount(), out _, out string? caseInsensitivePrefix);
    return caseInsensitivePrefix;
}

/// <summary>Finds sets at fixed offsets from the beginning of the pattern.</summary>
/// <param name="root">The RegexNode tree root.</param>
/// <param name="thorough">true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete.</param>
Expand Down