Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable Regex to use SearchValues<string> in compiled / source generator for IgnoreCase multi-strings #98791

Merged
merged 5 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,11 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w
EmitIndexOfString_RightToLeft();
break;

case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
EmitIndexOfStrings_LeftToRight();
break;

case FindNextStartingPositionMode.LeadingSet_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
EmitFixedSet_LeftToRight();
Expand Down Expand Up @@ -1041,6 +1046,37 @@ UnicodeCategory.NonSpacingMark or
}
}

// Emits a left-to-right search for any one of multiple leading prefixes. The comparison is
// ordinal or ordinal-ignore-case, depending on which LeadingStrings find mode was selected.
void EmitIndexOfStrings_LeftToRight()
{
    RegexFindOptimizations findOptimizations = regexTree.FindOptimizations;
    Debug.Assert(findOptimizations.FindMode is
        FindNextStartingPositionMode.LeadingStrings_LeftToRight or
        FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);

    // Render every prefix as a C# literal so the list can be spliced into the emitted initializer.
    string prefixes = string.Join(", ", findOptimizations.LeadingPrefixes.Select(leadingPrefix => Literal(leadingPrefix)));

    // The find mode alone decides whether the emitted SearchValues ignores case.
    bool ignoreCase = findOptimizations.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight;
    StringComparison stringComparison = ignoreCase ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal;

    // The field name is derived from both the comparison kind and the prefix list.
    string fieldName = GetSHA256FieldName($"s_indexOfAnyStrings_{stringComparison}_", prefixes);

    // Register the helper field only once per distinct field name.
    if (!requiredHelpers.ContainsKey(fieldName))
    {
        // The emitted initializer explicitly uses an array in case the prefix list is large.
        string summaryLine = "/// <summary>Supports searching for the specified strings.</summary>";
        string fieldLine = $"internal static readonly SearchValues<string> {fieldName} = SearchValues.Create([{prefixes}], StringComparison.{stringComparison});";
        requiredHelpers.Add(fieldName, [summaryLine, fieldLine]);
    }

    // Emit the search itself: on a hit, record the position and report success.
    // On a miss, emitted control flow simply falls out of the block.
    writer.WriteLine("// The pattern has multiple strings that could begin the match. Search for any of them.");
    writer.WriteLine("// If none can be found, there's no match.");
    writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOfAny({HelpersTypeName}.{fieldName});");
    using (EmitBlock(writer, "if (i >= 0)"))
    {
        writer.WriteLine("base.runtextpos = pos + i;");
        writer.WriteLine("return true;");
    }
}

// Emits a case-sensitive right-to-left search for a substring.
void EmitIndexOfString_RightToLeft()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1054,6 +1054,21 @@ public static bool IsAscii(ReadOnlySpan<char> s)
#endif
}

/// <summary>
/// Determines whether the set description string represents exactly two distinct ASCII letters
/// that differ only by case, i.e. that are equal to each other under OrdinalIgnoreCase rules.
/// </summary>
/// <param name="set">The set description string to examine.</param>
/// <param name="twoChars">
/// Scratch buffer of at least two elements that is handed to <c>GetSetChars</c>; its first two
/// elements are inspected after that call.
/// </param>
/// <returns><see langword="true"/> if the set is a non-negated upper/lower pair of one ASCII letter; otherwise, <see langword="false"/>.</returns>
public static bool SetContainsAsciiOrdinalIgnoreCaseCharacter(string set, Span<char> twoChars)
{
    Debug.Assert(twoChars.Length >= 2);

    // Negated sets, and sets that don't enumerate to exactly two characters, can't qualify.
    if (IsNegated(set) || GetSetChars(set, twoChars) != 2)
    {
        return false;
    }

    char first = twoChars[0];
    char second = twoChars[1];

    bool bothAscii = first < 128 && second < 128;

    // ASCII upper- and lowercase letters differ only in the 0x20 bit, so OR-ing it in
    // maps both casings of a letter to the same value.
    return
        bothAscii &&
        first != second &&
        char.IsLetter(first) &&
        char.IsLetter(second) &&
        (first | 0x20) == (second | 0x20);
}

/// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
/// <remarks>This may enumerate negated characters if the set is negated. This will return false if the set has subtraction.</remarks>
private static bool CanEasilyEnumerateSetContents(string set) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,8 @@ protected void EmitTryFindNextPossibleStartingPosition()
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOfString_LeftToRight();
break;
Expand Down Expand Up @@ -745,15 +747,19 @@ bool EmitAnchors()
return false;
}

// Emits a case-sensitive left-to-right search for a substring.
// Emits a case-sensitive left-to-right search for a substring or substrings.
void EmitIndexOfString_LeftToRight()
{
RegexFindOptimizations opts = _regexTree.FindOptimizations;
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or
FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or
FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
FindNextStartingPositionMode.LeadingStrings_LeftToRight or
FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);

using RentedLocalBuilder i = RentInt32Local();

// int i = inputSpan.Slice(pos).IndexOf(prefix);
// int i = inputSpan.Slice(pos)...
Ldloca(inputSpan);
Ldloc(pos);
if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
Expand All @@ -763,11 +769,21 @@ void EmitIndexOfString_LeftToRight()
Add();
}
Call(s_spanSliceIntMethod);
string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
opts.LeadingPrefix :
opts.FixedDistanceLiteral.String!;
LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
Call(s_spanIndexOfAnySearchValuesString);

// ...IndexOfAny(searchValues);
if (opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight)
{
LoadSearchValues(opts.LeadingPrefixes, opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
Call(s_spanIndexOfAnySearchValuesString);
}
else
{
string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
opts.LeadingPrefix :
opts.FixedDistanceLiteral.String!;
LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
Call(s_spanIndexOfAnySearchValuesString);
}
Stloc(i);

// if (i < 0) goto ReturnFalse;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,28 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
return;
}

// We're now left-to-right only and looking for sets.
// We're now left-to-right only and looking for multiple prefixes and/or sets.

// If there are multiple leading strings, we can search for any of them.
if (compiled)
{
if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: true) is { Length: > 1 } caseInsensitivePrefixes)
{
LeadingPrefixes = caseInsensitivePrefixes;
FindMode = FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight;
return;
}

// TODO: While some benchmarks benefit from this significantly, others regressed a bit (in particular those with few
// matches). Before enabling this, we need to investigate the performance impact on real-world scenarios,
// and see if there are ways to reduce the impact.
//if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: false) is { Length: > 1 } caseSensitivePrefixes)
//{
// LeadingPrefixes = caseSensitivePrefixes;
// FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
// return;
//}
}

// Build up a list of all of the sets that are a fixed distance from the start of the expression.
List<FixedDistanceSet>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter);
Expand Down Expand Up @@ -244,6 +265,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
/// <summary>Gets the leading prefix. May be an empty string.</summary>
public string LeadingPrefix { get; } = string.Empty;

/// <summary>Gets the leading prefixes, any one of which may begin a match. Defaults to an empty array.</summary>
public string[] LeadingPrefixes { get; } = Array.Empty<string>();

/// <summary>When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.</summary>
public (char Char, string? String, int Distance) FixedDistanceLiteral { get; }

Expand Down Expand Up @@ -767,10 +791,16 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
return false;
}

// Not supported in the interpreter, but we could end up here for patterns so complex the compiler gave up on them.

case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
return true;

// Nothing special to look for. Just return true indicating this is a valid position to try to match.

default:
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch, $"Unexpected FindMode {FindMode}");
return true;
}
}
Expand Down Expand Up @@ -810,6 +840,11 @@ internal enum FindNextStartingPositionMode
/// <summary>A multi-character ordinal case-insensitive substring at the beginning of the pattern.</summary>
LeadingString_OrdinalIgnoreCase_LeftToRight,

/// <summary>Multiple leading ordinal (case-sensitive) prefix strings, any one of which may begin the match.</summary>
LeadingStrings_LeftToRight,
/// <summary>Multiple leading ordinal case-insensitive prefix strings, any one of which may begin the match.</summary>
LeadingStrings_OrdinalIgnoreCase_LeftToRight,

/// <summary>A set starting the pattern.</summary>
LeadingSet_LeftToRight,
/// <summary>A set starting the right-to-left pattern.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2561,14 +2561,7 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
{
// In particular we want to look for sets that contain only the upper and lowercase variant
// of the same ASCII letter.
if (RegexCharClass.IsNegated(child.Str!) ||
RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 ||
twoChars[0] >= 128 ||
twoChars[1] >= 128 ||
twoChars[0] == twoChars[1] ||
!char.IsLetter(twoChars[0]) ||
!char.IsLetter(twoChars[1]) ||
((twoChars[0] | 0x20) != (twoChars[1] | 0x20)))
if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(child.Str!, twoChars))
{
break;
}
Expand Down
Loading
Loading