diff --git a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs index 0730da675126c..dd5f8af53bb5a 100644 --- a/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs +++ b/src/libraries/System.Text.RegularExpressions/ref/System.Text.RegularExpressions.cs @@ -179,6 +179,12 @@ public static void CompileToAssembly(System.Text.RegularExpressions.RegexCompila public static System.Text.RegularExpressions.Regex.ValueMatchEnumerator EnumerateMatches(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute("Regex")] string pattern) { throw null; } public static System.Text.RegularExpressions.Regex.ValueMatchEnumerator EnumerateMatches(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute("Regex", new object[]{ "options"})] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; } public static System.Text.RegularExpressions.Regex.ValueMatchEnumerator EnumerateMatches(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute("Regex", new object[]{ "options"})] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; } + public System.Text.RegularExpressions.Regex.ValueSplitEnumerator EnumerateSplits(System.ReadOnlySpan input) { throw null; } + public System.Text.RegularExpressions.Regex.ValueSplitEnumerator EnumerateSplits(System.ReadOnlySpan input, int count) { throw null; } + public System.Text.RegularExpressions.Regex.ValueSplitEnumerator EnumerateSplits(System.ReadOnlySpan input, int count, int startat) { throw null; } + public static System.Text.RegularExpressions.Regex.ValueSplitEnumerator EnumerateSplits(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute("Regex")] string pattern) { throw null; } + public static 
System.Text.RegularExpressions.Regex.ValueSplitEnumerator EnumerateSplits(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute("Regex", new object[] { "options" })] string pattern, System.Text.RegularExpressions.RegexOptions options) { throw null; } + public static System.Text.RegularExpressions.Regex.ValueSplitEnumerator EnumerateSplits(System.ReadOnlySpan input, [System.Diagnostics.CodeAnalysis.StringSyntaxAttribute("Regex", new object[] { "options" })] string pattern, System.Text.RegularExpressions.RegexOptions options, System.TimeSpan matchTimeout) { throw null; } public string[] GetGroupNames() { throw null; } public int[] GetGroupNumbers() { throw null; } public string GroupNameFromNumber(int i) { throw null; } @@ -243,6 +249,14 @@ public ref partial struct ValueMatchEnumerator public readonly System.Text.RegularExpressions.Regex.ValueMatchEnumerator GetEnumerator() { throw null; } public bool MoveNext() { throw null; } } + public ref partial struct ValueSplitEnumerator + { + private object _dummy; + private int _dummyPrimitive; + public readonly System.Range Current { get { throw null; } } + public readonly System.Text.RegularExpressions.Regex.ValueSplitEnumerator GetEnumerator() { throw null; } + public bool MoveNext() { throw null; } + } } [System.ObsoleteAttribute("Regex.CompileToAssembly is obsolete and not supported. 
Use the GeneratedRegexAttribute with the regular expression source generator instead.", DiagnosticId = "SYSLIB0036", UrlFormat = "https://aka.ms/dotnet-warnings/{0}")]
     public partial class RegexCompilationInfo
diff --git a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
index 1b77579511ab8..2042b930fdd2c 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
+++ b/src/libraries/System.Text.RegularExpressions/src/System.Text.RegularExpressions.csproj
@@ -27,6 +27,7 @@
+
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.EnumerateSplits.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.EnumerateSplits.cs
new file mode 100644
index 0000000000000..e4dbc232118f8
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Regex.EnumerateSplits.cs
@@ -0,0 +1,276 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Diagnostics.CodeAnalysis;
+using System.Runtime.InteropServices;
+
+namespace System.Text.RegularExpressions
+{
+    public partial class Regex
+    {
+        /// <summary>
+        /// Searches an input span for all occurrences of a regular expression and returns a <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// The behavior of <see cref="EnumerateSplits(ReadOnlySpan{char}, string)"/> is similar to the behavior of <see cref="Split(string, string)"/>, producing the splits
+        /// one at a time as part of iterating through the resulting enumerator rather than all at once as part of a single array. However, there are a few notable differences.
+        /// <see cref="Split(string, string)"/> will include the contents of capture groups in the resulting splits, while <see cref="EnumerateSplits(ReadOnlySpan{char}, string)"/> will not.
+        /// And if <see cref="RegexOptions.RightToLeft"/> is specified, <see cref="Split(string, string)"/> will reverse the order of the resulting splits to be left-to-right, whereas
+        /// <see cref="EnumerateSplits(ReadOnlySpan{char}, string)"/> will yield the splits in the order they're found right-to-left.
+        /// </para>
+        /// <para>
+        /// Each match won't actually happen until <see cref="ValueSplitEnumerator.MoveNext"/> is invoked on the enumerator, with one match being performed per call.
+        /// Since the evaluation of the match happens lazily, any changes to the passed in input in between calls to <see cref="ValueSplitEnumerator.MoveNext"/> may affect the match results;
+        /// such changes should be avoided and are not supported.
+        /// </para>
+        /// </remarks>
+        /// <param name="input">The span to search for a match.</param>
+        /// <param name="pattern">The regular expression pattern to match.</param>
+        /// <returns>A <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
+        /// <exception cref="RegexParseException">A regular expression parsing error occurred.</exception>
+        public static ValueSplitEnumerator EnumerateSplits(ReadOnlySpan<char> input, [StringSyntax(StringSyntaxAttribute.Regex)] string pattern) =>
+            RegexCache.GetOrAdd(pattern).EnumerateSplits(input);
+
+        /// <summary>
+        /// Searches an input span for all occurrences of a regular expression and returns a <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// The behavior of <see cref="EnumerateSplits(ReadOnlySpan{char}, string, RegexOptions)"/> is similar to the behavior of <see cref="Split(string, string, RegexOptions)"/>, producing the splits
+        /// one at a time as part of iterating through the resulting enumerator rather than all at once as part of a single array. However, there are a few notable differences.
+        /// <see cref="Split(string, string, RegexOptions)"/> will include the contents of capture groups in the resulting splits, while <see cref="EnumerateSplits(ReadOnlySpan{char}, string, RegexOptions)"/> will not.
+        /// And if <see cref="RegexOptions.RightToLeft"/> is specified, <see cref="Split(string, string, RegexOptions)"/> will reverse the order of the resulting splits to be left-to-right, whereas
+        /// <see cref="EnumerateSplits(ReadOnlySpan{char}, string, RegexOptions)"/> will yield the splits in the order they're found right-to-left.
+        /// </para>
+        /// <para>
+        /// Each match won't actually happen until <see cref="ValueSplitEnumerator.MoveNext"/> is invoked on the enumerator, with one match being performed per call.
+        /// Since the evaluation of the match happens lazily, any changes to the passed in input in between calls to <see cref="ValueSplitEnumerator.MoveNext"/> may affect the match results;
+        /// such changes should be avoided and are not supported.
+        /// </para>
+        /// </remarks>
+        /// <param name="input">The span to search for a match.</param>
+        /// <param name="pattern">The regular expression pattern to match.</param>
+        /// <param name="options">A bitwise combination of the enumeration values that specify options for matching.</param>
+        /// <returns>A <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="options"/> is not a valid bitwise combination of RegexOptions values.</exception>
+        /// <exception cref="RegexParseException">A regular expression parsing error occurred.</exception>
+        public static ValueSplitEnumerator EnumerateSplits(ReadOnlySpan<char> input, [StringSyntax(StringSyntaxAttribute.Regex, nameof(options))] string pattern, RegexOptions options) =>
+            RegexCache.GetOrAdd(pattern, options, s_defaultMatchTimeout).EnumerateSplits(input);
+
+        /// <summary>
+        /// Searches an input span for all occurrences of a regular expression and returns a <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// The behavior of <see cref="EnumerateSplits(ReadOnlySpan{char}, string, RegexOptions, TimeSpan)"/> is similar to the behavior of <see cref="Split(string, string, RegexOptions, TimeSpan)"/>, producing the splits
+        /// one at a time as part of iterating through the resulting enumerator rather than all at once as part of a single array. However, there are a few notable differences.
+        /// <see cref="Split(string, string, RegexOptions, TimeSpan)"/> will include the contents of capture groups in the resulting splits, while <see cref="EnumerateSplits(ReadOnlySpan{char}, string, RegexOptions, TimeSpan)"/> will not.
+        /// And if <see cref="RegexOptions.RightToLeft"/> is specified, <see cref="Split(string, string, RegexOptions, TimeSpan)"/> will reverse the order of the resulting splits to be left-to-right, whereas
+        /// <see cref="EnumerateSplits(ReadOnlySpan{char}, string, RegexOptions, TimeSpan)"/> will yield the splits in the order they're found right-to-left.
+        /// </para>
+        /// <para>
+        /// Each match won't actually happen until <see cref="ValueSplitEnumerator.MoveNext"/> is invoked on the enumerator, with one match being performed per call.
+        /// Since the evaluation of the match happens lazily, any changes to the passed in input in between calls to <see cref="ValueSplitEnumerator.MoveNext"/> may affect the match results;
+        /// such changes should be avoided and are not supported.
+        /// </para>
+        /// </remarks>
+        /// <param name="input">The span to search for a match.</param>
+        /// <param name="pattern">The regular expression pattern to match.</param>
+        /// <param name="options">A bitwise combination of the enumeration values that specify options for matching.</param>
+        /// <param name="matchTimeout">A time-out interval, or <see cref="InfiniteMatchTimeout"/> to indicate that the method should not time out.</param>
+        /// <returns>A <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="pattern"/> is null.</exception>
+        /// <exception cref="ArgumentOutOfRangeException"><paramref name="options"/> is not a valid bitwise combination of RegexOptions values, or <paramref name="matchTimeout"/> is negative, zero, or greater than approximately 24 days.</exception>
+        /// <exception cref="RegexParseException">A regular expression parsing error occurred.</exception>
+        public static ValueSplitEnumerator EnumerateSplits(ReadOnlySpan<char> input, [StringSyntax(StringSyntaxAttribute.Regex, nameof(options))] string pattern, RegexOptions options, TimeSpan matchTimeout) =>
+            RegexCache.GetOrAdd(pattern, options, matchTimeout).EnumerateSplits(input);
+
+        /// <summary>
+        /// Searches an input span for all occurrences of a regular expression and returns a <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// The behavior of <see cref="EnumerateSplits(ReadOnlySpan{char})"/> is similar to the behavior of <see cref="Split(string)"/>, producing the splits
+        /// one at a time as part of iterating through the resulting enumerator rather than all at once as part of a single array. However, there are a few notable differences.
+        /// <see cref="Split(string)"/> will include the contents of capture groups in the resulting splits, while <see cref="EnumerateSplits(ReadOnlySpan{char})"/> will not.
+        /// And if <see cref="RegexOptions.RightToLeft"/> is specified, <see cref="Split(string)"/> will reverse the order of the resulting splits to be left-to-right, whereas
+        /// <see cref="EnumerateSplits(ReadOnlySpan{char})"/> will yield the splits in the order they're found right-to-left.
+        /// </para>
+        /// <para>
+        /// Each match won't actually happen until <see cref="ValueSplitEnumerator.MoveNext"/> is invoked on the enumerator, with one match being performed per call.
+        /// Since the evaluation of the match happens lazily, any changes to the passed in input in between calls to <see cref="ValueSplitEnumerator.MoveNext"/> may affect the match results;
+        /// such changes should be avoided and are not supported.
+        /// </para>
+        /// </remarks>
+        /// <param name="input">The span to search for a match.</param>
+        /// <returns>A <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.</returns>
+        public ValueSplitEnumerator EnumerateSplits(ReadOnlySpan<char> input) =>
+            EnumerateSplits(input, count: 0);
+
+        /// <summary>
+        /// Searches an input span for all occurrences of a regular expression and returns a <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// The behavior of <see cref="EnumerateSplits(ReadOnlySpan{char}, int)"/> is similar to the behavior of <see cref="Split(string, int)"/>, producing the splits
+        /// one at a time as part of iterating through the resulting enumerator rather than all at once as part of a single array. However, there are a few notable differences.
+        /// <see cref="Split(string, int)"/> will include the contents of capture groups in the resulting splits, while <see cref="EnumerateSplits(ReadOnlySpan{char}, int)"/> will not.
+        /// And if <see cref="RegexOptions.RightToLeft"/> is specified, <see cref="Split(string, int)"/> will reverse the order of the resulting splits to be left-to-right, whereas
+        /// <see cref="EnumerateSplits(ReadOnlySpan{char}, int)"/> will yield the splits in the order they're found right-to-left.
+        /// </para>
+        /// <para>
+        /// Each match won't actually happen until <see cref="ValueSplitEnumerator.MoveNext"/> is invoked on the enumerator, with one match being performed per call.
+        /// Since the evaluation of the match happens lazily, any changes to the passed in input in between calls to <see cref="ValueSplitEnumerator.MoveNext"/> may affect the match results;
+        /// such changes should be avoided and are not supported.
+        /// </para>
+        /// </remarks>
+        /// <param name="input">The span to search for a match.</param>
+        /// <param name="count">The maximum number of times the split can occur. If 0, all splits are available.</param>
+        /// <returns>A <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.</returns>
+        public ValueSplitEnumerator EnumerateSplits(ReadOnlySpan<char> input, int count) =>
+            EnumerateSplits(input, count, startat: RightToLeft ? input.Length : 0);
+
+        /// <summary>
+        /// Searches an input span for all occurrences of a regular expression and returns a <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.
+        /// </summary>
+        /// <remarks>
+        /// <para>
+        /// The behavior of <see cref="EnumerateSplits(ReadOnlySpan{char}, int, int)"/> is similar to the behavior of <see cref="Split(string, int, int)"/>, producing the splits
+        /// one at a time as part of iterating through the resulting enumerator rather than all at once as part of a single array. However, there are a few notable differences.
+        /// <see cref="Split(string, int, int)"/> will include the contents of capture groups in the resulting splits, while <see cref="EnumerateSplits(ReadOnlySpan{char}, int, int)"/> will not.
+        /// And if <see cref="RegexOptions.RightToLeft"/> is specified, <see cref="Split(string, int, int)"/> will reverse the order of the resulting splits to be left-to-right, whereas
+        /// <see cref="EnumerateSplits(ReadOnlySpan{char}, int, int)"/> will yield the splits in the order they're found right-to-left.
+        /// </para>
+        /// <para>
+        /// Each match won't actually happen until <see cref="ValueSplitEnumerator.MoveNext"/> is invoked on the enumerator, with one match being performed per call.
+        /// Since the evaluation of the match happens lazily, any changes to the passed in input in between calls to <see cref="ValueSplitEnumerator.MoveNext"/> may affect the match results;
+        /// such changes should be avoided and are not supported.
+        /// </para>
+        /// </remarks>
+        /// <param name="input">The span to search for a match.</param>
+        /// <param name="count">The maximum number of times the split can occur. If 0, all splits are available.</param>
+        /// <param name="startat">The zero-based character position at which to start the search.</param>
+        /// <returns>A <see cref="ValueSplitEnumerator"/> to iterate over the splits around matches.</returns>
+        public ValueSplitEnumerator EnumerateSplits(ReadOnlySpan<char> input, int count, int startat)
+        {
+            if (count < 0) // 0 is allowed and means "no limit"; only negative counts are rejected.
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.count, ExceptionResource.CountTooSmall);
+            }
+
+            if ((uint)startat > (uint)input.Length) // One unsigned compare rejects both negative values and values past the end of the input.
+            {
+                ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.startat, ExceptionResource.BeginIndexNotNegative);
+            }
+
+            return new ValueSplitEnumerator(this, input, count, startat, RightToLeft);
+        }
+
+        /// <summary>
+        /// Represents an enumerator containing the set of splits around successful matches found by iteratively applying a regular expression pattern to the input span.
+        /// </summary>
+        [StructLayout(LayoutKind.Auto)]
+        public ref struct ValueSplitEnumerator
+        {
+            private readonly Regex _regex; // Regex used to perform each match.
+            private readonly ReadOnlySpan<char> _input; // Span being split.
+
+            private int _startAt; // Position handed to the next match attempt.
+            private (int Index, int Length) _lastMatch; // Bounds of the previous match; Length stays -1 until a match has been recorded.
+            private Range _currentSplit; // Value returned by Current.
+            private int _remainingCount; // Number of splits that may still be produced.
+
+            /// <summary>
+            /// Creates an instance of the <see cref="ValueSplitEnumerator"/> for the passed in <paramref name="regex"/> which iterates over <paramref name="input"/>.
+            /// </summary>
+            /// <param name="regex">The <see cref="Regex"/> to use for finding matches.</param>
+            /// <param name="input">The input span to iterate over.</param>
+            /// <param name="count">The maximum number of times the split can occur.</param>
+            /// <param name="startAt">The position from where the engine should start looking for matches.</param>
+            /// <param name="rtl">Whether the engine is matching from right to left.</param>
+            internal ValueSplitEnumerator(Regex regex, ReadOnlySpan<char> input, int count, int startAt, bool rtl)
+            {
+                _regex = regex;
+                _input = input;
+                _startAt = startAt;
+                _lastMatch = (rtl ? input.Length : 0, -1); // Seed at the edge the first split is measured from; the -1 length means nothing has matched yet.
+                _remainingCount = count != 0 ? count : int.MaxValue; // Maintain same behavior as Split(..., count: 0, ...), which treats it as effectively infinite.
+            }
+
+            /// <summary>Provides an enumerator that iterates through the splits in the input span.</summary>
+            /// <returns>A copy of this enumerator.</returns>
+            public readonly ValueSplitEnumerator GetEnumerator() => this;
+
+            /// <summary>Advances the enumerator to the next split.</summary>
+            /// <returns>
+            /// <see langword="true"/> if the enumerator was successfully advanced to the next element; <see langword="false"/> if the enumerator cannot find additional matches.
+            /// </returns>
+            public bool MoveNext()
+            {
+                // If we've already found all the splits, we're done.
+                if (_remainingCount == 0)
+                {
+                    return false;
+                }
+
+                if (_remainingCount == 1)
+                {
+                    // If we've reached the last split, include everything that remains. _lastMatch.Length is still -1 when no match has been recorded yet (count == 1), so clamp it to keep the range start from going negative.
+                    _currentSplit = !_regex.RightToLeft ? (_lastMatch.Index + Math.Max(_lastMatch.Length, 0)).._input.Length : 0.._lastMatch.Index;
+                }
+                else
+                {
+                    // Perform the next match.
+                    (bool Success, int Index, int Length, int TextPosition) match = _regex.RunSingleMatch(RegexRunnerMode.BoundsRequired, _lastMatch.Length, _input, _startAt);
+
+                    // If the match was successful, update the current result to be the input between the last match and this one.
+                    // Otherwise, update the current result to be the input between the last match and the end of the input.
+                    if (!_regex.RightToLeft)
+                    {
+                        int start = _lastMatch.Index + Math.Max(_lastMatch.Length, 0);
+                        if (match.Success)
+                        {
+                            _currentSplit = start..match.Index;
+                            _lastMatch = (match.Index, match.Length);
+                        }
+                        else
+                        {
+                            _currentSplit = start.._input.Length;
+                            _remainingCount = 1; // No further matches: this is the final split (the decrement below takes the count to 0).
+                        }
+                    }
+                    else
+                    {
+                        if (match.Success)
+                        {
+                            int start = _lastMatch.Index;
+                            _currentSplit = (match.Index + match.Length)..start;
+                            _lastMatch = (match.Index, match.Length);
+                        }
+                        else
+                        {
+                            _currentSplit = 0.._lastMatch.Index;
+                            _remainingCount = 1; // No further matches: this is the final split (the decrement below takes the count to 0).
+                        }
+                    }
+
+                    // Update the position from which to perform the next match.
+                    _startAt = match.TextPosition;
+                }
+
+                // Decrement the remaining count now that we're successfully yielding the next split.
+                _remainingCount--;
+                return true;
+            }
+
+            /// <summary>
+            /// Gets the element at the current position of the enumerator.
+            /// </summary>
+            /// <remarks>This getter does not throw: before the first call to <see cref="MoveNext"/> it returns the default <see cref="Range"/>, and once enumeration has finished it keeps returning the last split produced.</remarks>
+            public readonly Range Current => _currentSplit;
+        }
+    }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.EnumerateSplits.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.EnumerateSplits.Tests.cs
new file mode 100644
index 0000000000000..47a8a9aacc14e
--- /dev/null
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.EnumerateSplits.Tests.cs
@@ -0,0 +1,75 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+using System.Collections.Generic;
+using System.Threading.Tasks;
+using Xunit;
+
+namespace System.Text.RegularExpressions.Tests
+{
+    public class RegexEnumerateSplitTests
+    {
+        [Theory]
+        [MemberData(nameof(RegexSplitTests.Split_TestData), MemberType = typeof(RegexSplitTests))]
+        public async Task Split(RegexEngine engine, string pattern, string input, RegexOptions options, int count, int start, string[] _)
+        {
+            options |= RegexOptions.ExplicitCapture; // EnumerateSplits does not include the contents of capture groups, so avoid them when possible in the test patterns.
+
+            bool isDefaultStart = RegexHelpers.IsDefaultStart(input, options, start);
+            bool isDefaultCount = RegexHelpers.IsDefaultCount(input, options, count);
+
+            Regex r = await RegexHelpers.GetRegexAsync(engine, pattern, options);
+            if (r.GetGroupNames().Length != 1)
+            {
+                // EnumerateSplits does not include the contents of capture groups.
+                return;
+            }
+
+            if (isDefaultStart && isDefaultCount)
+            {
+                Validate(options, input, r.Split(input), r.EnumerateSplits(input));
+                Validate(options, input, Regex.Split(input, pattern, options | RegexHelpers.OptionsFromEngine(engine)), Regex.EnumerateSplits(input, pattern, options | RegexHelpers.OptionsFromEngine(engine)));
+            }
+
+            if (isDefaultStart)
+            {
+                Validate(options, input, r.Split(input, count), r.EnumerateSplits(input, count));
+            }
+
+            Validate(options, input, r.Split(input, count, start), r.EnumerateSplits(input, count, start));
+
+            static void Validate(RegexOptions options, string input, string[] expected, Regex.ValueSplitEnumerator enumerator)
+            {
+                var actual = new List<string>();
+                while (enumerator.MoveNext())
+                {
+                    actual.Add(input[enumerator.Current]);
+                }
+
+                if ((options & RegexOptions.RightToLeft) != 0)
+                {
+                    actual.Reverse(); // The enumerator yields right-to-left splits in the order found; flip them to compare against the array Split returned.
+                }
+
+                Assert.Equal(expected, actual.ToArray());
+            }
+        }
+
+        [Fact]
+        public void Split_Invalid()
+        {
+            // pattern is null
+            AssertExtensions.Throws<ArgumentNullException>("pattern", () => Regex.EnumerateSplits("input", null));
+            AssertExtensions.Throws<ArgumentNullException>("pattern", () => Regex.EnumerateSplits("input", null, RegexOptions.None));
+            AssertExtensions.Throws<ArgumentNullException>("pattern", () => Regex.EnumerateSplits("input", null, RegexOptions.None, TimeSpan.FromMilliseconds(1)));
+
+            // count is invalid
+            AssertExtensions.Throws<ArgumentOutOfRangeException>("count", () => new Regex("pattern").EnumerateSplits("input", -1));
+            AssertExtensions.Throws<ArgumentOutOfRangeException>("count", () => new Regex("pattern").EnumerateSplits("input", -1, 0));
+
+            // startat is invalid
+            AssertExtensions.Throws<ArgumentOutOfRangeException>("startat", () => new Regex("pattern").EnumerateSplits("input", 0, -1));
+            AssertExtensions.Throws<ArgumentOutOfRangeException>("startat", () => new Regex("pattern").EnumerateSplits("input", 0, 6));
+        }
+    }
+}
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Split.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Split.Tests.cs
index 8e821af1da3c6..01a99ec691fa7 100644
--- 
a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Split.Tests.cs +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Split.Tests.cs @@ -2,6 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. using System.Collections.Generic; +using System.Linq; using System.Threading.Tasks; using Xunit; @@ -43,6 +44,8 @@ public static IEnumerable Split_TestData() yield return new object[] { engine, "a(?.)c(.)e", "123abcde456aBCDe789", RegexOptions.None, 19, 0, new string[] { "123", "d", "b", "456aBCDe789" } }; yield return new object[] { engine, "a(?.)c(.)e", "123abcde456aBCDe789", RegexOptions.IgnoreCase, 19, 0, new string[] { "123", "d", "b", "456", "D", "B", "789" } }; + yield return new object[] { engine, "a", string.Concat(Enumerable.Repeat("ab", 1000)), RegexOptions.None, 0, 0, new[] { "" }.Concat(Enumerable.Repeat("b", 1000)).ToArray() }; + if (!RegexHelpers.IsNonBacktracking(engine)) { // RightToLeft diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj index 1edf95f93769f..dbab47f63d097 100644 --- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj +++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/System.Text.RegularExpressions.Tests.csproj @@ -56,6 +56,7 @@ +