Skip to content

Commit

Permalink
Enable Regex to use SearchValues<string> in compiled / source generator for IgnoreCase multi-strings (#98791)
Browse files Browse the repository at this point in the history

* Enable Regex to use SearchValues<string> in compiled / source generator TryFindNextStartingPosition

The analyzer determines a set of prefixes that can start any match, and then uses SearchValues with IndexOfAny to find the next one from that set. It's currently only enabled for case-insensitive; we need to do some more perf validation before enabling for case-sensitive.

* Address PR feedback

* Fix unit test
  • Loading branch information
stephentoub authored Feb 23, 2024
1 parent 6f8d3e3 commit 99b7601
Show file tree
Hide file tree
Showing 7 changed files with 473 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,11 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w
EmitIndexOfString_RightToLeft();
break;

case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
EmitIndexOfStrings_LeftToRight();
break;

case FindNextStartingPositionMode.LeadingSet_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight:
EmitFixedSet_LeftToRight();
Expand Down Expand Up @@ -1041,6 +1046,37 @@ UnicodeCategory.NonSpacingMark or
}
}

// Emits a left-to-right search for any one of multiple leading prefixes. The search is
// ordinal or ordinal case-insensitive, as dictated by the find mode.
void EmitIndexOfStrings_LeftToRight()
{
    RegexFindOptimizations findOptimizations = regexTree.FindOptimizations;
    FindNextStartingPositionMode mode = findOptimizations.FindMode;
    Debug.Assert(mode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);

    // Pick the comparison that corresponds to the find mode.
    StringComparison comparison = StringComparison.Ordinal;
    if (mode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight)
    {
        comparison = StringComparison.OrdinalIgnoreCase;
    }

    // Render every prefix as a literal and combine them into a comma-separated list. The list is
    // used both as the hash input for the helper field's name and as the field's initializer contents.
    string prefixList = string.Join(", ", findOptimizations.LeadingPrefixes.Select(p => Literal(p)));
    string helperField = GetSHA256FieldName($"s_indexOfAnyStrings_{comparison}_", prefixList);

    // Register the SearchValues<string> helper field only if it hasn't been registered already.
    if (!requiredHelpers.ContainsKey(helperField))
    {
        requiredHelpers.Add(helperField,
        [
            "/// <summary>Supports searching for the specified strings.</summary>",
            $"internal static readonly SearchValues<string> {helperField} = SearchValues.Create([{prefixList}], StringComparison.{comparison});", // explicitly using an array in case prefixes is large
        ]);
    }

    // Emit the search itself: find the first occurrence of any prefix at or after pos.
    writer.WriteLine("// The pattern has multiple strings that could begin the match. Search for any of them.");
    writer.WriteLine("// If none can be found, there's no match.");
    writer.WriteLine($"int i = inputSpan.Slice(pos).IndexOfAny({HelpersTypeName}.{helperField});");
    using (EmitBlock(writer, "if (i >= 0)"))
    {
        writer.WriteLine("base.runtextpos = pos + i;");
        writer.WriteLine("return true;");
    }
}

// Emits a case-sensitive right-to-left search for a substring.
void EmitIndexOfString_RightToLeft()
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1054,6 +1054,21 @@ public static bool IsAscii(ReadOnlySpan<char> s)
#endif
}

/// <summary>
/// Determines whether the set description string represents exactly two distinct ASCII letters
/// that are the upper- and lowercase forms of one another.
/// </summary>
/// <param name="set">The set description string to examine.</param>
/// <param name="twoChars">
/// A buffer of at least two elements. It is handed to <c>GetSetChars</c> and then read back as the set's characters.
/// </param>
/// <returns><see langword="true"/> if the set is a non-negated pair of ASCII letters differing only by case; otherwise, <see langword="false"/>.</returns>
public static bool SetContainsAsciiOrdinalIgnoreCaseCharacter(string set, Span<char> twoChars)
{
    Debug.Assert(twoChars.Length >= 2);

    // The set must be non-negated and must enumerate to exactly two characters.
    if (IsNegated(set) || GetSetChars(set, twoChars) != 2)
    {
        return false;
    }

    char first = twoChars[0];
    char second = twoChars[1];

    // Both must be ASCII, distinct, letters, and equal once the 0x20 casing bit is set on each.
    return
        first < 128 &&
        second < 128 &&
        first != second &&
        char.IsLetter(first) &&
        char.IsLetter(second) &&
        (first | 0x20) == (second | 0x20);
}

/// <summary>Gets whether we can iterate through the set list pairs in order to completely enumerate the set's contents.</summary>
/// <remarks>This may enumerate negated characters if the set is negated. This will return false if the set has subtraction.</remarks>
private static bool CanEasilyEnumerateSetContents(string set) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,8 @@ protected void EmitTryFindNextPossibleStartingPosition()
{
case FindNextStartingPositionMode.LeadingString_LeftToRight:
case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
case FindNextStartingPositionMode.FixedDistanceString_LeftToRight:
EmitIndexOfString_LeftToRight();
break;
Expand Down Expand Up @@ -745,15 +747,19 @@ bool EmitAnchors()
return false;
}

// Emits a case-sensitive left-to-right search for a substring.
// Emits a left-to-right search for a substring or substrings (ordinal or ordinal case-insensitive, depending on the find mode).
void EmitIndexOfString_LeftToRight()
{
RegexFindOptimizations opts = _regexTree.FindOptimizations;
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight);
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or
FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or
FindNextStartingPositionMode.FixedDistanceString_LeftToRight or
FindNextStartingPositionMode.LeadingStrings_LeftToRight or
FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight);

using RentedLocalBuilder i = RentInt32Local();

// int i = inputSpan.Slice(pos).IndexOf(prefix);
// int i = inputSpan.Slice(pos)...
Ldloca(inputSpan);
Ldloc(pos);
if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight &&
Expand All @@ -763,11 +769,21 @@ void EmitIndexOfString_LeftToRight()
Add();
}
Call(s_spanSliceIntMethod);
string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
opts.LeadingPrefix :
opts.FixedDistanceLiteral.String!;
LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
Call(s_spanIndexOfAnySearchValuesString);

// ...IndexOfAny(searchValues);
if (opts.FindMode is FindNextStartingPositionMode.LeadingStrings_LeftToRight or FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight)
{
LoadSearchValues(opts.LeadingPrefixes, opts.FindMode is FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
Call(s_spanIndexOfAnySearchValuesString);
}
else
{
string literalString = opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ?
opts.LeadingPrefix :
opts.FixedDistanceLiteral.String!;
LoadSearchValues([literalString], opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? StringComparison.OrdinalIgnoreCase : StringComparison.Ordinal);
Call(s_spanIndexOfAnySearchValuesString);
}
Stloc(i);

// if (i < 0) goto ReturnFalse;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,28 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
return;
}

// We're now left-to-right only and looking for sets.
// We're now left-to-right only and looking for multiple prefixes and/or sets.

// If there are multiple leading strings, we can search for any of them.
if (compiled)
{
if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: true) is { Length: > 1 } caseInsensitivePrefixes)
{
LeadingPrefixes = caseInsensitivePrefixes;
FindMode = FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight;
return;
}

// TODO: While some benchmarks benefit from this significantly, others regressed a bit (in particular those with few
// matches). Before enabling this, we need to investigate the performance impact on real-world scenarios,
// and see if there are ways to reduce the impact.
//if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: false) is { Length: > 1 } caseSensitivePrefixes)
//{
// LeadingPrefixes = caseSensitivePrefixes;
// FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight;
// return;
//}
}

// Build up a list of all of the sets that are a fixed distance from the start of the expression.
List<FixedDistanceSet>? fixedDistanceSets = RegexPrefixAnalyzer.FindFixedDistanceSets(root, thorough: !interpreter);
Expand Down Expand Up @@ -244,6 +265,9 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options)
/// <summary>Gets the leading prefix. May be an empty string.</summary>
public string LeadingPrefix { get; } = string.Empty;

/// <summary>Gets the leading prefixes searched for by the LeadingStrings find modes. May be an empty array.</summary>
public string[] LeadingPrefixes { get; } = Array.Empty<string>();

/// <summary>When in fixed distance literal mode, gets the literal and how far it is from the start of the pattern.</summary>
public (char Char, string? String, int Distance) FixedDistanceLiteral { get; }

Expand Down Expand Up @@ -767,10 +791,16 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
return false;
}

// Not supported in the interpreter, but we could end up here for patterns so complex the compiler gave up on them.

case FindNextStartingPositionMode.LeadingStrings_LeftToRight:
case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight:
return true;

// Nothing special to look for. Just return true indicating this is a valid position to try to match.

default:
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch);
Debug.Assert(FindMode == FindNextStartingPositionMode.NoSearch, $"Unexpected FindMode {FindMode}");
return true;
}
}
Expand Down Expand Up @@ -810,6 +840,11 @@ internal enum FindNextStartingPositionMode
/// <summary>A multi-character ordinal case-insensitive substring at the beginning of the pattern.</summary>
LeadingString_OrdinalIgnoreCase_LeftToRight,

/// <summary>Multiple leading prefix strings.</summary>
LeadingStrings_LeftToRight,
/// <summary>Multiple leading ordinal case-insensitive prefix strings.</summary>
LeadingStrings_OrdinalIgnoreCase_LeftToRight,

/// <summary>A set starting the pattern.</summary>
LeadingSet_LeftToRight,
/// <summary>A set starting the right-to-left pattern.</summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2561,14 +2561,7 @@ public bool TryGetOrdinalCaseInsensitiveString(int childIndex, int exclusiveChil
{
// In particular we want to look for sets that contain only the upper and lowercase variant
// of the same ASCII letter.
if (RegexCharClass.IsNegated(child.Str!) ||
RegexCharClass.GetSetChars(child.Str!, twoChars) != 2 ||
twoChars[0] >= 128 ||
twoChars[1] >= 128 ||
twoChars[0] == twoChars[1] ||
!char.IsLetter(twoChars[0]) ||
!char.IsLetter(twoChars[1]) ||
((twoChars[0] | 0x20) != (twoChars[1] | 0x20)))
if (!RegexCharClass.SetContainsAsciiOrdinalIgnoreCaseCharacter(child.Str!, twoChars))
{
break;
}
Expand Down
Loading

0 comments on commit 99b7601

Please sign in to comment.