diff --git a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs index 8cff7411be704..3af622cbc4fa5 100644 --- a/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs +++ b/src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs @@ -714,6 +714,7 @@ private static void EmitTryFindNextPossibleStartingPosition(IndentedTextWriter w switch (regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingString_LeftToRight: + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: EmitIndexOf_LeftToRight(); break; @@ -948,14 +949,20 @@ void EmitIndexOf_LeftToRight() { RegexFindOptimizations opts = regexTree.FindOptimizations; - string substring = ""; - string offset = ""; - string offsetDescription = "at the beginning of the pattern"; + string substring = "", stringComparison = "", offset = "", offsetDescription = ""; switch (opts.FindMode) { case FindNextStartingPositionMode.LeadingString_LeftToRight: substring = regexTree.FindOptimizations.LeadingPrefix; + offsetDescription = "at the beginning of the pattern"; + Debug.Assert(!string.IsNullOrEmpty(substring)); + break; + + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: + substring = regexTree.FindOptimizations.LeadingPrefix; + stringComparison = ", StringComparison.OrdinalIgnoreCase"; + offsetDescription = "ordinal case-insensitive at the beginning of the pattern"; Debug.Assert(!string.IsNullOrEmpty(substring)); break; @@ -976,7 +983,7 @@ void EmitIndexOf_LeftToRight() writer.WriteLine($"// The pattern has the literal {Literal(substring)} {offsetDescription}. 
Find the next occurrence."); writer.WriteLine($"// If it can't be found, there's no match."); - writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)});"); + writer.WriteLine($"int i = inputSpan.Slice(pos{offset}).IndexOf({Literal(substring)}{stringComparison});"); using (EmitBlock(writer, "if (i >= 0)")) { writer.WriteLine("base.runtextpos = pos + i;"); diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs index f9be28acceeeb..9b8d3ad7a2a97 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs @@ -66,6 +66,7 @@ internal abstract class RegexCompiler private static readonly MethodInfo s_spanGetLengthMethod = typeof(ReadOnlySpan<char>).GetMethod("get_Length")!; private static readonly MethodInfo s_spanIndexOfChar = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfSpan = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); + private static readonly MethodInfo s_spanIndexOfSpanStringComparison = typeof(MemoryExtensions).GetMethod("IndexOf", new Type[] { typeof(ReadOnlySpan<char>), typeof(ReadOnlySpan<char>), typeof(StringComparison) })!; private static readonly MethodInfo s_spanIndexOfAnyCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), 
Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnyCharCharChar = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0), Type.MakeGenericMethodParameter(0) })!.MakeGenericMethod(typeof(char)); private static readonly MethodInfo s_spanIndexOfAnySpan = typeof(MemoryExtensions).GetMethod("IndexOfAny", new Type[] { typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)), typeof(ReadOnlySpan<>).MakeGenericType(Type.MakeGenericMethodParameter(0)) })!.MakeGenericMethod(typeof(char)); @@ -456,6 +457,7 @@ protected void EmitTryFindNextPossibleStartingPosition() switch (_regexTree.FindOptimizations.FindMode) { case FindNextStartingPositionMode.LeadingString_LeftToRight: + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: case FindNextStartingPositionMode.FixedDistanceString_LeftToRight: EmitIndexOf_LeftToRight(); break; @@ -745,25 +747,33 @@ bool EmitAnchors() void EmitIndexOf_LeftToRight() { RegexFindOptimizations opts = _regexTree.FindOptimizations; - Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight); + Debug.Assert(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight or FindNextStartingPositionMode.FixedDistanceString_LeftToRight); using RentedLocalBuilder i = RentInt32Local(); // int i = inputSpan.Slice(pos).IndexOf(prefix); Ldloca(inputSpan); Ldloc(pos); - if (opts.FindMode == FindNextStartingPositionMode.FixedDistanceString_LeftToRight && + if (opts.FindMode is FindNextStartingPositionMode.FixedDistanceString_LeftToRight && opts.FixedDistanceLiteral is { Distance: > 0 } 
literal) { Ldc(literal.Distance); Add(); } Call(s_spanSliceIntMethod); - Ldstr(opts.FindMode == FindNextStartingPositionMode.LeadingString_LeftToRight ? + Ldstr(opts.FindMode is FindNextStartingPositionMode.LeadingString_LeftToRight or FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight ? opts.LeadingPrefix : opts.FixedDistanceLiteral.String!); Call(s_stringAsSpanMethod); - Call(s_spanIndexOfSpan); + if (opts.FindMode is FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight) + { + Ldc((int)StringComparison.OrdinalIgnoreCase); + Call(s_spanIndexOfSpanStringComparison); + } + else + { + Call(s_spanIndexOfSpan); + } Stloc(i); // if (i < 0) goto ReturnFalse; diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index d02c74a70c7b6..113c0749f5af4 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -66,7 +66,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) } // If there's a leading substring, just use IndexOf and inherit all of its optimizations. - string prefix = RegexPrefixAnalyzer.FindPrefix(root); + string? prefix = RegexPrefixAnalyzer.FindPrefix(root); if (prefix.Length > 1) { LeadingPrefix = prefix; @@ -126,6 +126,16 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) return; } + // We're now left-to-right only. + + prefix = RegexPrefixAnalyzer.FindPrefixOrdinalCaseInsensitive(root); + if (prefix is { Length: > 1 }) + { + LeadingPrefix = prefix; + FindMode = FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight; + return; + } + // We're now left-to-right only and looking for sets. 
// Build up a list of all of the sets that are a fixed distance from the start of the expression. @@ -547,6 +557,21 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan textSpan, return false; } + // There's a case-insensitive prefix. Search for it with ordinal case-insensitive IndexOf. + + case FindNextStartingPositionMode.LeadingString_OrdinalIgnoreCase_LeftToRight: + { + int i = textSpan.Slice(pos).IndexOf(LeadingPrefix.AsSpan(), StringComparison.OrdinalIgnoreCase); + if (i >= 0) + { + pos += i; + return true; + } + + pos = textSpan.Length; + return false; + } + // There's a set at the beginning of the pattern. Search for it. case FindNextStartingPositionMode.LeadingSet_LeftToRight: @@ -776,6 +801,8 @@ internal enum FindNextStartingPositionMode LeadingString_LeftToRight, /// A multi-character substring at the beginning of the right-to-left pattern. LeadingString_RightToLeft, + /// A multi-character ordinal case-insensitive substring at the beginning of the pattern. + LeadingString_OrdinalIgnoreCase_LeftToRight, /// A set starting the pattern. LeadingSet_LeftToRight, diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs index ccf33f6fe1987..88553b3b7d113 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs @@ -155,6 +155,31 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb) } } + /// <summary>Computes the leading ordinal case-insensitive substring in <paramref name="node"/>.</summary> + public static string? FindPrefixOrdinalCaseInsensitive(RegexNode node) + { + while (true) + { + // Search down the left side of the tree (through Atomic, Capture, and Loop/Lazyloop nodes with M > 0) looking for a concatenation. + // If we find one, ask it for any ordinal case-insensitive prefix it has; any other node kind ends the search with null. 
+ switch (node.Kind) + { + case RegexNodeKind.Atomic: + case RegexNodeKind.Capture: + case RegexNodeKind.Loop or RegexNodeKind.Lazyloop when node.M > 0: + node = node.Child(0); + continue; + + case RegexNodeKind.Concatenate: + node.TryGetOrdinalCaseInsensitiveString(0, node.ChildCount(), out _, out string? caseInsensitiveString); + return caseInsensitiveString; + + default: + return null; + } + } + } + /// Finds sets at fixed-offsets from the beginning of the pattern/ /// The RegexNode tree root. /// true to spend more time finding sets (e.g. through alternations); false to do a faster analysis that's potentially more incomplete.