From 44c519e9b7606cfbe8599700e727c4124dfb0e09 Mon Sep 17 00:00:00 2001 From: Stephen Toub Date: Mon, 17 Jun 2024 14:04:42 -0400 Subject: [PATCH] Extend SearchValues use in Regex to NonBacktracking (#103496) We previously only enabled this for the compiler. Support it with NonBacktracking, too. --- .../RegexFindOptimizations.cs | 49 ++++++++++++++++--- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs index fa90486e7407a..1318769136235 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs @@ -1,6 +1,7 @@ // Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. +using System.Buffers; using System.Collections.Generic; using System.Diagnostics; @@ -83,6 +84,7 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) bool dfa = (options & RegexOptions.NonBacktracking) != 0; bool compiled = (options & RegexOptions.Compiled) != 0 && !dfa; // for now, we never generate code for NonBacktracking, so treat it as non-compiled bool interpreter = !compiled && !dfa; + bool usesRfoTryFind = !compiled; // For interpreter, we want to employ optimizations, but we don't want to make construction significantly // more expensive; someone who wants to pay to do more work can specify Compiled. So for the interpreter @@ -140,12 +142,18 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) // We're now left-to-right only and looking for multiple prefixes and/or sets. // If there are multiple leading strings, we can search for any of them. 
- if (compiled) + if (!interpreter) // this works in the interpreter, but we avoid it due to additional cost during construction { if (RegexPrefixAnalyzer.FindPrefixes(root, ignoreCase: true) is { Length: > 1 } caseInsensitivePrefixes) { LeadingPrefixes = caseInsensitivePrefixes; FindMode = FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight; +#if SYSTEM_TEXT_REGULAREXPRESSIONS + if (usesRfoTryFind) + { + LeadingStrings = SearchValues.Create(LeadingPrefixes, StringComparison.OrdinalIgnoreCase); + } +#endif return; } @@ -156,6 +164,12 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) //{ // LeadingPrefixes = caseSensitivePrefixes; // FindMode = FindNextStartingPositionMode.LeadingStrings_LeftToRight; +#if SYSTEM_TEXT_REGULAREXPRESSIONS + // if (usesRfoTryFind) + // { + // LeadingStrings = SearchValues.Create(LeadingPrefixes, StringComparison.Ordinal); + // } +#endif // return; //} } @@ -275,6 +289,11 @@ public RegexFindOptimizations(RegexNode root, RegexOptions options) /// The case-insensitivity of the 0th entry will always match the mode selected, but subsequent entries may not. public List<FixedDistanceSet>? FixedDistanceSets { get; } +#if SYSTEM_TEXT_REGULAREXPRESSIONS + /// When in leading strings mode, gets the search values to use for searching the input. + public SearchValues<string>? LeadingStrings { get; } +#endif + /// Data about a character class at a fixed offset from the start of any match to a pattern. public struct FixedDistanceSet(char[]? chars, string set, int distance) { @@ -676,6 +695,28 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan, return false; } + // There are multiple possible strings at the beginning. Search for one. 
+ case FindNextStartingPositionMode.LeadingStrings_LeftToRight: + case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight: + { + if (LeadingStrings is not SearchValues<string> searchValues) + { + // This should be exceedingly rare and only happen if a Compiled regex selected this + // option but then failed to compile (e.g. due to too deep stacks) and fell back to the interpreter. + return true; + } + + int i = textSpan.Slice(pos).IndexOfAny(searchValues); + if (i >= 0) + { + pos += i; + return true; + } + + pos = textSpan.Length; + return false; + } + // There are one or more sets at fixed offsets from the start of the pattern. case FindNextStartingPositionMode.FixedDistanceSets_LeftToRight: @@ -800,12 +841,6 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan, return false; } - // Not supported in the interpreter, but we could end up here for patterns so complex the compiler gave up on them. - - case FindNextStartingPositionMode.LeadingStrings_LeftToRight: - case FindNextStartingPositionMode.LeadingStrings_OrdinalIgnoreCase_LeftToRight: - return true; - // Nothing special to look for. Just return true indicating this is a valid position to try to match. default: