From 1b4f1593164aa4f42b01168f4addd17bef773000 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Wed, 26 Apr 2023 22:09:30 -0400 Subject: [PATCH] Enable regex to use IndexOf(..., OrdinalIgnoreCase) for prefix searching As one of its many ways of finding the next possible match starting location, Regex recognizes a string known to start the expression and uses IndexOf to find it. With this change, it can also do so for OrdinalIgnoreCase. With improvements to IndexOf(..., OrdinalIgnoreCase), this now yields significantly faster searches through longer inputs, in addition to leading to simpler code in source generated regexes. --- .../gen/RegexGenerator.Emitter.cs | 15 +++++++--- .../Text/RegularExpressions/RegexCompiler.cs | 18 +++++++++--- .../RegexFindOptimizations.cs | 29 ++++++++++++++++++- .../RegularExpressions/RegexPrefixAnalyzer.cs | 25 ++++++++++++++++ 4 files changed, 78 insertions(+), 9 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 8cff7411be704..3af622cbc4fa5 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -714,6 +714,7 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w switch (regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingString_LeftToRight: + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: EmitIndexOf_LeftToRight(); break; @@ -948,14 +949,20 @@ void EmitIndexOf_LeftToRight() { RegexFindOptimizations opts = regexTree.FindOptimizations; - string substring = ""; - string offset = ""; - string offsetDescription = "at the beginning of the pattern"; + string substring = "", stringComparison = "", offset = "", offsetDescription = ""; switch 
(opts.FindMode) { case FindNextStartingPositionMode.LeadingString_LeftToRight: substring = regexTree.FindOptimizations.LeadingPrefix; + offsetDescription = "at the beginning of the pattern"; + Debug.Assert(!string.IsNullOrEmpty(substring)); + break; + + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: + substring = regexTree.FindOptimizations.LeadingPrefix; + stringComparison = ", StringComparison.OrdinalIgnoreCase"; + offsetDescription = "ordinal case-insensitive at the beginning of the pattern"; Debug.Assert(!string.IsNullOrEmpty(substring)); break; @@ -976,7 +983,7 @@ void EmitIndexOf_LeftToRight() writer.WriteLine($"// The pattern has the literal {Literal(substring)} {offsetDescription}. Find the next occurrence."); writer.WriteLine($"// If it can't be found, there's no match."); - writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)});"); + writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)}{stringComparison});"); using (EmitBlock(writer, "if (i >= 0)")) { writer.WriteLine("base.runtextpos = pos + i;"); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index f9be28acceeeb..9b8d3ad7a2a97 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -66,6 +66,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!; private static readonly MethodInfo s_spanIndexOfChar = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char));
private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfSpanStringComparison = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<char>), typeof(ReadOnlySpan<char>), typeof(StringComparison) })!; private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -456,6 +457,7 @@ protected void EmitTryFindNextPossibleStartingPosition() switch (_regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingString_LeftToRight: + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: EmitIndexOf_LeftToRight(); break; @@ -745,25 +747,33 @@ bool EmitAnchors() void EmitIndexOf_LeftToRight() { RegexFindOptimizations opts = _regexTree.FindOptimizations; -
Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight); + Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight); using RentedLocalBuilder i = RentInt32Local(); // int i = inputSpan.Slice(pos).IndexOf(prefix); Ldloca(inputSpan); Ldloc(pos); - if (opts.FindMode == FindNextStartingPositionMode.FixedDistanceString_LeftToRight && + if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight && opts.FixedDistanceLiteral is { Distance: > 0 } literal) { Ldc(literal.Distance); Add(); } Call(s_spanSliceIntMethod); - Ldstr(opts.FindMode == FindNextStartingPositionMode.LeadingString_LeftToRight ? + Ldstr(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? 
opts.LeadingPrefix : opts.FixedDistanceLiteral.String!); Call(s_stringAsSpanMethod); - Call(s_spanIndexOfSpan); + if (opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight) + { + Ldc((int)StringComparison.OrdinalIgnoreCase); + Call(s_spanIndexOfSpanStringComparison); + } + else + { + Call(s_spanIndexOfSpan); + } Stloc(i); // if (i < 0) goto ReturnFalse; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index d02c74a70c7b6..113c0749f5af4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -66,7 +66,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) } // If there's a leading substring, just use IndexOf and inherit all of its optimizations. - string prefix = RegexPrefixAnalyzer.FindPrefix(root); + string? prefix = RegexPrefixAnalyzer.FindPrefix(root); if (prefix.Length > 1) { LeadingPrefix = prefix; @@ -126,6 +126,16 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) return; } + // We're now left-to-right only. + + prefix = RegexPrefixAnalyzer.FindPrefixOrdinalCaseInsensitive(root); + if (prefix is { Length: > 1 }) + { + LeadingPrefix = prefix; + FindMode = FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight; + return; + } + // We're now left-to-right only and looking for sets. // Build up a list of all of the sets that are a fixed distance from the start of the expression. @@ -547,6 +557,21 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, return false; } + // There's a case-insensitive prefix. Search for it with ordinal case-insensitive IndexOf. 
+ + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: + { + int i = textSpan.Slice(pos).IndexOf(LeadingPrefix.AsSpan(), StringComparison.OrdinalIgnoreCase); + if (i >= 0) + { + pos += i; + return true; + } + + pos = textSpan.Length; + return false; + } + // There's a set at the beginning of the pattern. Search for it. case FindNextStartingPositionMode.LeadingSet_LeftToRight: @@ -776,6 +801,8 @@ internal enum FindNextStartingPositionMode LeadingString_LeftToRight, /// <summary>A multi-character substring at the beginning of the right-to-left pattern.</summary> LeadingString_RightToLeft, + /// <summary>A multi-character ordinal case-insensitive substring at the beginning of the pattern.</summary> + LeadingString_OrdinalIgnoreCase_LeftToRight, /// <summary>A set starting the pattern.</summary> LeadingSet_LeftToRight, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index ccf33f6fe1987..88553b3b7d113 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -155,6 +155,31 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) } } + /// <summary>Computes the leading ordinal case-insensitive substring in <paramref name="node"/>.</summary> + public static string? FindPrefixOrdinalCaseInsensitive(RegexNode node) + { + while (true) + { + // Search down the left side of the tree looking for a concatenation. If we find one, + // ask it for any ordinal case-insensitive prefix it has. + switch (node.Kind) + { + case RegexNodeKind.Atomic: + case RegexNodeKind.Capture: + case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M > 0: + node = node.Child(0); + continue; + + case RegexNodeKind.Concatenate: + node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string?
caseInsensitiveString); + return caseInsensitiveString; + + default: + return null; + } + } + } + /// Finds sets at fixed-offsets from the beginning of the pattern/ /// The RegexNode tree root. /// true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete.