diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs
index dc23cd9af0749d..485fdd42bb9209 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs
@@ -51,7 +51,7 @@ internal int FixedLength
/// If true then the state is a dead-end, rejects all inputs.
internal bool IsNothing => Node.IsNothing;
- /// If true then state starts with a ^ or $ or \A or \z or \Z
+ /// If true then state starts with a ^ or $ or \Z
internal bool StartsWithLineAnchor => Node._info.StartsWithLineAnchor;
///
@@ -134,7 +134,9 @@ internal DfaMatchingState Next(TSet minterm)
// nextCharKind will be the PrevCharKind of the target state
// use an existing state instead if one exists already
// otherwise create a new new id for it
- list.Add((Node._builder.CreateState(node, nextCharKind, capturing: true), effects));
+ DfaMatchingState state = Node._builder.CreateState(node, nextCharKind, capturing: true);
+ if (!state.IsDeadend)
+ list.Add((state, effects));
}
return list;
}
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs
index dadd291ecd6436..cfd1048d109405 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DgmlWriter.cs
@@ -191,8 +191,8 @@ internal DfaExplorer(SymbolicRegexMatcher srm, bool nfa, bool addDotStar,
{
_builder = srm._builder;
uint startId = reverse ?
- (srm._reversePattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0) :
- (srm._pattern._info.StartsWithLineAnchor ? CharKind.BeginningEnd : 0);
+ (srm._reversePattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0) :
+ (srm._pattern._info.StartsWithSomeAnchor ? CharKind.BeginningEnd : 0);
// Create the initial state
_initialState = _builder.CreateState(
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
index 17377d10a9a7bc..651902547e1a34 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs
@@ -24,6 +24,7 @@ internal sealed class SymbolicRegexBuilder where TSet : IComparable
internal readonly SymbolicRegexNode _nothing;
internal readonly SymbolicRegexNode _anyChar;
internal readonly SymbolicRegexNode _anyStar;
+ internal readonly SymbolicRegexNode _anyStarLazy;
private SymbolicRegexNode? _epsilon;
internal SymbolicRegexNode Epsilon => _epsilon ??= SymbolicRegexNode.CreateEpsilon(this);
@@ -173,6 +174,7 @@ internal SymbolicRegexBuilder(ISolver solver, CharSetSolver charSetSolver)
_nothing = SymbolicRegexNode.CreateFalse(this);
_anyChar = SymbolicRegexNode.CreateTrue(this);
_anyStar = SymbolicRegexNode.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: false);
+ _anyStarLazy = SymbolicRegexNode.CreateLoop(this, _anyChar, 0, int.MaxValue, isLazy: true);
// --- initialize singletonCache ---
_singletonCache[_solver.Empty] = _nothing;
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
index 89fc8e5bf551a9..cc5b5af109e320 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexInfo.cs
@@ -11,17 +11,14 @@ namespace System.Text.RegularExpressions.Symbolic
private const uint IsLazyMask = 4;
private const uint CanBeNullableMask = 8;
private const uint ContainsSomeAnchorMask = 16;
- private const uint ContainsLineAnchorMask = 32;
- private const uint ContainsSomeCharacterMask = 64;
- private const uint StartsWithBoundaryAnchorMask = 128;
+ private const uint StartsWithSomeAnchorMask = 32;
private readonly uint _info;
private SymbolicRegexInfo(uint i) => _info = i;
- internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false, bool startsWithLineAnchor = false,
- bool startsWithBoundaryAnchor = false, bool containsSomeAnchor = false,
- bool containsLineAnchor = false, bool containsSomeCharacter = false, bool isLazy = true)
+ internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool canBeNullable = false,
+ bool startsWithLineAnchor = false, bool startsWithSomeAnchor = false, bool containsSomeAnchor = false, bool isLazy = true)
{
uint i = 0;
@@ -35,31 +32,21 @@ internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool can
}
}
- if (startsWithLineAnchor || containsLineAnchor || startsWithBoundaryAnchor || containsSomeAnchor)
+ if (containsSomeAnchor || startsWithLineAnchor || startsWithSomeAnchor)
{
i |= ContainsSomeAnchorMask;
- if (startsWithLineAnchor || containsLineAnchor)
+ if (startsWithLineAnchor)
{
- i |= ContainsLineAnchorMask;
-
- if (startsWithLineAnchor)
- {
- i |= StartsWithLineAnchorMask;
- }
+ i |= StartsWithLineAnchorMask;
}
- if (startsWithBoundaryAnchor)
+ if (startsWithLineAnchor || startsWithSomeAnchor)
{
- i |= StartsWithBoundaryAnchorMask;
+ i |= StartsWithSomeAnchorMask;
}
}
- if (containsSomeCharacter)
- {
- i |= ContainsSomeCharacterMask;
- }
-
if (isLazy)
{
i |= IsLazyMask;
@@ -72,18 +59,12 @@ internal static SymbolicRegexInfo Create(bool isAlwaysNullable = false, bool can
public bool CanBeNullable => (_info & CanBeNullableMask) != 0;
- public bool StartsWithSomeAnchor => (_info & (StartsWithLineAnchorMask | StartsWithBoundaryAnchorMask)) != 0;
-
public bool StartsWithLineAnchor => (_info & StartsWithLineAnchorMask) != 0;
- public bool StartsWithBoundaryAnchor => (_info & StartsWithBoundaryAnchorMask) != 0;
+ public bool StartsWithSomeAnchor => (_info & StartsWithSomeAnchorMask) != 0;
public bool ContainsSomeAnchor => (_info & ContainsSomeAnchorMask) != 0;
- public bool ContainsLineAnchor => (_info & ContainsLineAnchorMask) != 0;
-
- public bool ContainsSomeCharacter => (_info & ContainsSomeCharacterMask) != 0;
-
public bool IsLazy => (_info & IsLazyMask) != 0;
public static SymbolicRegexInfo Or(params SymbolicRegexInfo[] infos)
@@ -121,20 +102,14 @@ public static SymbolicRegexInfo And(params SymbolicRegexInfo[] infos)
return new SymbolicRegexInfo(i);
}
- public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info)
- {
- bool isNullable = left_info.IsNullable && right_info.IsNullable;
- bool canBeNullable = left_info.CanBeNullable && right_info.CanBeNullable;
- bool isLazy = left_info.IsLazy && right_info.IsLazy;
-
- bool startsWithLineAnchor = left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor);
- bool startsWithBoundaryAnchor = left_info.StartsWithBoundaryAnchor || (left_info.CanBeNullable && right_info.StartsWithBoundaryAnchor);
- bool containsSomeAnchor = left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor;
- bool containsLineAnchor = left_info.ContainsLineAnchor || right_info.ContainsLineAnchor;
- bool containsSomeCharacter = left_info.ContainsSomeCharacter || right_info.ContainsSomeCharacter;
-
- return Create(isNullable, canBeNullable, startsWithLineAnchor, startsWithBoundaryAnchor, containsSomeAnchor, containsLineAnchor, containsSomeCharacter, isLazy);
- }
+ public static SymbolicRegexInfo Concat(SymbolicRegexInfo left_info, SymbolicRegexInfo right_info) =>
+ Create(
+ isAlwaysNullable: left_info.IsNullable && right_info.IsNullable,
+ canBeNullable: left_info.CanBeNullable && right_info.CanBeNullable,
+ startsWithLineAnchor: left_info.StartsWithLineAnchor || (left_info.CanBeNullable && right_info.StartsWithLineAnchor),
+ startsWithSomeAnchor: left_info.StartsWithSomeAnchor || (left_info.CanBeNullable && right_info.StartsWithSomeAnchor),
+ containsSomeAnchor: left_info.ContainsSomeAnchor || right_info.ContainsSomeAnchor,
+ isLazy: left_info.IsLazy && right_info.IsLazy);
public static SymbolicRegexInfo Loop(SymbolicRegexInfo body_info, int lowerBound, bool isLazy)
{
@@ -171,10 +146,7 @@ public static SymbolicRegexInfo Not(SymbolicRegexInfo info) =>
Create(isAlwaysNullable: !info.CanBeNullable,
canBeNullable: !info.IsNullable,
startsWithLineAnchor: info.StartsWithLineAnchor,
- startsWithBoundaryAnchor: info.StartsWithBoundaryAnchor,
containsSomeAnchor: info.ContainsSomeAnchor,
- containsLineAnchor: info.ContainsLineAnchor,
- containsSomeCharacter: info.ContainsSomeCharacter,
isLazy: info.IsLazy);
public override bool Equals(object? obj) => obj is SymbolicRegexInfo i && Equals(i);
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
index d9efcc3d7d08a6..4a454d80c9ba98 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.cs
@@ -210,7 +210,7 @@ private SymbolicRegexMatcher(SymbolicRegexNode rootNode, int captureCount,
// Create the dot-star pattern (a concatenation of any* with the original pattern)
// and all of its initial states.
- _dotStarredPattern = _builder.CreateConcat(_builder._anyStar, _pattern);
+ _dotStarredPattern = _builder.CreateConcat(_builder._anyStarLazy, _pattern);
var dotstarredInitialStates = new DfaMatchingState[statesCount];
for (uint i = 0; i < dotstarredInitialStates.Length; i++)
{
@@ -280,8 +280,9 @@ private bool TryTakeTransition(SymbolicRegexBuilder builder
{
int c = input[i];
+ // Find the minterm, handling the special case for the last \n for states that start with a relevant anchor
int mintermId = c == '\n' && i == input.Length - 1 && TStateHandler.StartsWithLineAnchor(ref state) ?
- builder._minterms!.Length : // mintermId = minterms.Length represents \Z (last \n)
+ builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input
_mintermClassifier.GetMintermID(c);
return TStateHandler.TakeTransition(builder, ref state, mintermId);
@@ -335,29 +336,17 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start
timeoutOccursAt = Environment.TickCount + (int)(_timeout + 0.5);
}
- // If we're starting at the end of the input, we don't need to do any work other than
- // determine whether an empty match is valid, i.e. whether the pattern is "nullable"
- // given the kinds of characters at and just before the end.
- if (startat == input.Length)
- {
- // TODO https://github.com/dotnet/runtime/issues/65606: Handle capture groups.
- uint prevKind = GetCharKind(input, startat - 1);
- uint nextKind = GetCharKind(input, startat);
- return _pattern.IsNullableFor(CharKind.Context(prevKind, nextKind)) ?
- new SymbolicMatch(startat, 0) :
- SymbolicMatch.NoMatch;
- }
-
// Phase 1:
- // Determine whether there is a match by finding the first final state position. This only tells
- // us whether there is a match but needn't give us the longest possible match. This may return -1 as
- // a legitimate value when the initial state is nullable and startat == 0. It returns NoMatchExists (-2)
- // when there is no match. As an example, consider the pattern a{5,10}b* run against an input
- // of aaaaaaaaaaaaaaabbbc: phase 1 will find the position of the first b: aaaaaaaaaaaaaaab.
- int i = FindFinalStatePosition(input, startat, timeoutOccursAt, out int matchStartLowBoundary, out int matchStartLengthMarker, perThreadData);
+ // Determine the end point of the match. The returned index is one-past-the-end index for the characters
+ // in the match. Note that -1 is a valid end point for an empty match at the beginning of the input.
+ // It returns NoMatchExists (-2) when there is no match.
+ // As an example, consider the pattern a{1,3}(b*) run against an input of aacaaaabbbc: phase 1 will find
+ // the position of the last b: aacaaaabbbc. It additionally records the position of the first a after
+ // the c as the low boundary for the starting position.
+ int matchEnd = FindEndPosition(input, startat, timeoutOccursAt, isMatch, out int matchStartLowBoundary, out int matchStartLengthMarker, perThreadData);
// If there wasn't a match, we're done.
- if (i == NoMatchExists)
+ if (matchEnd == NoMatchExists)
{
return SymbolicMatch.NoMatch;
}
@@ -374,101 +363,138 @@ public SymbolicMatch FindMatch(bool isMatch, ReadOnlySpan input, int start
// start position. That tells us the actual starting position of the match. We can skip this phase if we
// recorded a fixed-length marker for the portion of the pattern that matched, as we can then jump that
// exact number of positions backwards. Continuing the previous example, phase 2 will walk backwards from
- // that first b until it finds the 6th a: aaaaaaaaaab.
+ // that last b until it finds the 4th a: aaabbbc.
int matchStart;
if (matchStartLengthMarker >= 0)
{
- matchStart = i - matchStartLengthMarker + 1;
+ matchStart = matchEnd - matchStartLengthMarker;
+ }
+ else if (_fixedMatchLength.HasValue)
+ {
+ matchStart = matchEnd - _fixedMatchLength.GetValueOrDefault();
}
else
{
- Debug.Assert(i >= startat - 1);
- matchStart = i < startat ?
+ Debug.Assert(matchEnd >= startat - 1);
+ matchStart = matchEnd < startat ?
startat :
- FindStartPosition(input, i, matchStartLowBoundary, perThreadData);
+ FindStartPosition(input, matchEnd, matchStartLowBoundary, perThreadData);
}
// Phase 3:
- // Match again, this time from the computed start position, to find the latest end position. That start
- // and end then represent the bounds of the match. If the pattern has subcaptures (captures other than
- // the top-level capture for the whole match), we need to do more work to compute their exact bounds, so we
- // take a faster path if captures aren't required. Further, if captures aren't needed, and if any possible
- // match of the whole pattern is a fixed length, we can skip this phase as well, just using that fixed-length
- // to compute the ending position based on the starting position. Continuing the previous example, phase 3
- // will walk forwards from the 6th a until it finds the end of the match: aaaaaaaaaabbb.
+ // If there are no subcaptures, the matching process is done. For patterns with subcaptures (captures other
+ // than the top-level capture for the whole match), we need to do an additional pass to find their bounds.
+ // Continuing the previous example, phase 3 will be executed for the characters inside the match, aaabbbc,
+ // and will associate the one capture (b*) with its match: bbb.
if (!HasSubcaptures)
{
- if (_fixedMatchLength.HasValue)
- {
- return new SymbolicMatch(matchStart, _fixedMatchLength.GetValueOrDefault());
- }
-
- int matchEnd = FindEndPosition(input, matchStart, perThreadData);
- return new SymbolicMatch(matchStart, matchEnd + 1 - matchStart);
+ return new SymbolicMatch(matchStart, matchEnd - matchStart);
}
else
{
- int matchEnd = FindEndPositionCapturing(input, matchStart, out Registers endRegisters, perThreadData);
- return new SymbolicMatch(matchStart, matchEnd + 1 - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
+ Registers endRegisters = FindSubcaptures(input, matchStart, matchEnd, perThreadData);
+ return new SymbolicMatch(matchStart, matchEnd - matchStart, endRegisters.CaptureStarts, endRegisters.CaptureEnds);
}
}
- /// Phase 3 of matching. From a found starting position, find the ending position of the match using the original pattern.
- ///
- /// The ending position is known to exist; this function just needs to determine exactly what it is.
- /// We need to find the longest possible match and thus the latest valid ending position.
- ///
+ /// Performs the initial Phase 1 match to find the end position of the match, or first final state if this is an isMatch call.
/// The input text.
- /// The starting position of the match.
+ /// The starting position in the input.
+ /// The time at which timeout occurs, if timeouts are being checked.
+ /// Whether this is an isMatch call.
+ /// The last position at which the initial state of the dot-starred pattern was visited before the end position was found.
+ /// Length of the match if there's a match; otherwise, -1.
/// Per thread data reused between calls.
- /// The found ending position of the match.
- private int FindEndPosition(ReadOnlySpan input, int i, PerThreadData perThreadData)
+ ///
+ /// A one-past-the-end index into input for the preferred match, or first final state position if isMatch is true, or NoMatchExists if no match exists.
+ ///
+ private int FindEndPosition(ReadOnlySpan input, int i, int timeoutOccursAt, bool isMatch, out int initialStateIndex, out int matchLength, PerThreadData perThreadData)
{
- // Get the starting state based on the current context.
- DfaMatchingState dfaStartState = _initialStates[GetCharKind(input, i - 1)];
-
- // If the starting state is nullable (accepts the empty string), then it's a valid
- // match and we need to record the position as a possible end, but keep going looking
- // for a better one.
- int end = input.Length; // invalid sentinel value
- if (dfaStartState.IsNullable(GetCharKind(input, i)))
- {
- // Empty match exists because the initial state is accepting.
- end = i - 1;
- }
+ int endPosition = NoMatchExists;
- if ((uint)i < (uint)input.Length)
+ matchLength = -1;
+ initialStateIndex = i;
+ int initialStateIndexCandidate = i;
+
+ var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]);
+ SymbolicRegexBuilder builder = _pattern._builder;
+
+ while (true)
{
- // Iterate from the starting state until we've found the best ending state.
- SymbolicRegexBuilder builder = dfaStartState.Node._builder;
- var currentState = new CurrentState(dfaStartState);
- while (true)
+ if (currentState.DfaState is { IsInitialState: true })
+ {
+ if (_findOpts is RegexFindOptimizations findOpts)
+ {
+ // Find the first position i that matches with some likely character.
+ if (!findOpts.TryFindNextStartingPosition(input, ref i, 0))
+ {
+ // no match was found
+ break;
+ }
+ }
+
+ initialStateIndexCandidate = i;
+
+ // Update the starting state based on where TryFindNextStartingPosition moved us to.
+ // As with the initial starting state, if it's a dead end, no match exists.
+ currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]);
+ }
+
+ // Now run the DFA or NFA traversal from the current point using the current state. If timeouts are being checked,
+ // we need to pop out of the inner loop every now and then to do the timeout check in this outer loop.
+ const int CharsPerTimeoutCheck = 1_000;
+ ReadOnlySpan inputForInnerLoop = _checkTimeout && input.Length - i > CharsPerTimeoutCheck ?
+ input.Slice(0, i + CharsPerTimeoutCheck) :
+ input;
+
+ int newEndPosition;
+ int findResult = currentState.NfaState is not null ?
+ FindEndPositionDeltas(builder, inputForInnerLoop, isMatch, ref i, ref currentState, ref matchLength, out newEndPosition) :
+ FindEndPositionDeltas(builder, inputForInnerLoop, isMatch, ref i, ref currentState, ref matchLength, out newEndPosition);
+
+ // If a new end position was found, commit to the matching initial state index
+ if (newEndPosition != -1)
{
- // Run the DFA or NFA traversal backwards from the current point using the current state.
- bool done = currentState.NfaState is not null ?
- FindEndPositionDeltas(builder, input, ref i, ref currentState, ref end) :
- FindEndPositionDeltas(builder, input, ref i, ref currentState, ref end);
+ endPosition = newEndPosition;
+ initialStateIndex = initialStateIndexCandidate;
+ }
- // If we successfully found the ending position, we're done.
- if (done || (uint)i >= (uint)input.Length)
+ // If we reached a deadend state, or a nullable state on an isMatch call, we're done.
+ if (findResult > 0)
+ {
+ break;
+ }
+
+ // The search did not finish, so we either hit an initial state (in which case we want to loop around to apply our initial
+ // state processing logic and optimizations), or failed to transition (which should only happen if we were in DFA mode and
+ // need to switch over to NFA mode). If we exited because we hit an initial state, find result will be 0, otherwise -1.
+ if (findResult < 0)
+ {
+ if (i >= input.Length)
{
+ // We ran out of input.
break;
}
- // We exited out of the inner processing loop, but we didn't hit a dead end or run out
- // of input, and that should only happen if we failed to transition from one state to
- // the next, which should only happen if we were in DFA mode and we tried to create
- // a new state and exceeded the graph size. Upgrade to NFA mode and continue;
- Debug.Assert(currentState.DfaState is not null);
- NfaMatchingState nfaState = perThreadData.NfaState;
- nfaState.InitializeFrom(currentState.DfaState);
- currentState = new CurrentState(nfaState);
+ if (i < inputForInnerLoop.Length)
+ {
+ // We failed to transition. Upgrade to NFA mode.
+ Debug.Assert(i < inputForInnerLoop.Length);
+ Debug.Assert(currentState.DfaState is not null);
+ NfaMatchingState nfaState = perThreadData.NfaState;
+ nfaState.InitializeFrom(currentState.DfaState);
+ currentState = new CurrentState(nfaState);
+ }
+ }
+
+ // Check for a timeout before continuing.
+ if (_checkTimeout)
+ {
+ DoCheckTimeout(timeoutOccursAt);
}
}
- // Return the found ending position.
- Debug.Assert(end < input.Length, "Expected to find an ending position but didn't");
- return end;
+ return endPosition;
}
///
@@ -476,158 +502,68 @@ private int FindEndPosition(ReadOnlySpan input, int i, PerThreadData perTh
/// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state,
/// lazily building out the graph as needed.
///
- private bool FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, ref int i, ref CurrentState currentState, ref int endingIndex)
+ ///
+ /// The TStateHandler supplies the actual transitioning logic, controlling whether processing is
+ /// performed in DFA mode or in NFA mode. However, it expects currentState to be configured to match,
+ /// so for example if TStateHandler is a DFA state handler, it expects the currentState's DfaState
+ /// to be non-null and its NfaState to be null; vice versa for
+ /// an NFA state handler.
+ ///
+ ///
+ /// A positive value if iteration completed because it reached a deadend state or nullable state and the call is an isMatch.
+ /// 0 if iteration completed because we reached an initial state.
+ /// A negative value if iteration completed because we ran out of input or we failed to transition.
+ ///
+ private int FindEndPositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, bool isMatch, ref int i, ref CurrentState currentState, ref int matchLength, out int endPosition)
where TStateHandler : struct, IStateHandler
{
- // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
+ // To avoid frequent reads/writes to ref and out values, make and operate on local copies, which we then copy back once before returning.
int pos = i;
CurrentState state = currentState;
-
- // Repeatedly read the next character from the input and use it to transition the current state to the next.
- // We're looking for the furthest final state we can find.
- while ((uint)pos < (uint)input.Length && TryTakeTransition(builder, input, pos, ref state))
+ int endPos = -1;
+ try
{
- if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos + 1)))
- {
- // If the new state accepts the empty string, we found an ending state. Record the position.
- endingIndex = pos;
- }
- else if (TStateHandler.IsDeadend(ref state))
+ // Loop through each character in the input, transitioning from state to state for each.
+ while (true)
{
- // If the new state is a dead end, the match ended the last time endingIndex was updated.
- currentState = state;
- i = pos;
- return true;
- }
-
- // We successfully transitioned to the next state and consumed the current character,
- // so move along to the next.
- pos++;
- }
-
- // We either ran out of input, in which case we successfully recorded an ending index,
- // or we failed to transition to the next state due to the graph becoming too large.
- currentState = state;
- i = pos;
- return false;
- }
-
- /// Find match end position using the original pattern, end position is known to exist. This version also produces captures.
- /// input span
- /// inclusive start position
- /// out parameter for the final register values, which indicate capture starts and ends
- /// Per thread data reused between calls.
- /// the match end position
- private int FindEndPositionCapturing(ReadOnlySpan input, int i, out Registers resultRegisters, PerThreadData perThreadData)
- {
- int i_end = input.Length;
- Registers endRegisters = default;
- DfaMatchingState? endState = null;
+ // If the state is nullable for the next character, meaning it accepts the empty string,
+ // we found a potential end state.
+ if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos)))
+ {
+ // Check whether there's a fixed-length marker for the current state. If there is, we can
+ // use that length to optimize subsequent matching phases.
+ matchLength = TStateHandler.FixedLength(ref state);
+ endPos = pos;
+ // If this is an isMatch call we are done, since a match is now known to exist.
+ if (isMatch)
+ return 1;
+ }
- // Pick the correct start state based on previous character kind.
- DfaMatchingState initialState = _initialStates[GetCharKind(input, i - 1)];
+ // If the state is a dead end, such that we can't transition anywhere else, end the search.
+ if (TStateHandler.IsDeadend(ref state))
+ return 1;
- Registers initialRegisters = perThreadData.InitialRegisters;
+ // If there is more input available try to transition with the next character.
+ if ((uint)pos >= (uint)input.Length || !TryTakeTransition(builder, input, pos, ref state))
+ return -1;
- // Initialize registers with -1, which means "not seen yet"
- Array.Fill(initialRegisters.CaptureStarts, -1);
- Array.Fill(initialRegisters.CaptureEnds, -1);
+ // We successfully transitioned, so update our current input index to match.
+ pos++;
- if (initialState.IsNullable(GetCharKind(input, i)))
- {
- // Empty match exists because the initial state is accepting.
- i_end = i - 1;
- endRegisters.Assign(initialRegisters);
- endState = initialState;
+ // Now that currentState and our position are coherent, check if currentState represents an initial state.
+ // If it does, we exit out in order to allow our find optimizations to kick in to hopefully more quickly
+ // find the next possible starting location.
+ if (TStateHandler.IsInitialState(ref state))
+ return 0;
+ }
}
-
- // Use two maps from state IDs to register values for the current and next set of states.
- // Note that these maps use insertion order, which is used to maintain priorities between states in a way
- // that matches the order the backtracking engines visit paths.
- Debug.Assert(perThreadData.Current is not null && perThreadData.Next is not null);
- SparseIntMap current = perThreadData.Current, next = perThreadData.Next;
- current.Clear();
- next.Clear();
- current.Add(initialState.Id, initialRegisters);
-
- SymbolicRegexBuilder builder = _builder;
-
- while ((uint)i < (uint)input.Length)
+ finally
{
- Debug.Assert(next.Count == 0);
-
- int c = input[i];
- int normalMintermId = _mintermClassifier.GetMintermID(c);
-
- foreach ((int sourceId, Registers sourceRegisters) in current.Values)
- {
- Debug.Assert(builder._capturingStateArray is not null);
- DfaMatchingState sourceState = builder._capturingStateArray[sourceId];
-
- // Find the minterm, handling the special case for the last \n
- int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ?
- builder._minterms!.Length :
- normalMintermId; // mintermId = minterms.Length represents \Z (last \n)
- TSet minterm = builder.GetMinterm(mintermId);
-
- // Get or create the transitions
- int offset = (sourceId << builder._mintermsLog) | mintermId;
- Debug.Assert(builder._capturingDelta is not null);
- List<(DfaMatchingState, DerivativeEffect[])>? transitions =
- builder._capturingDelta[offset] ??
- CreateNewCapturingTransitions(sourceState, minterm, offset);
-
- // Take the transitions in their prioritized order
- for (int j = 0; j < transitions.Count; ++j)
- {
- (DfaMatchingState targetState, DerivativeEffect[] effects) = transitions[j];
- if (targetState.IsDeadend)
- continue;
-
- // Try to add the state and handle the case where it didn't exist before. If the state already
- // exists, then the transition can be safely ignored, as the existing state was generated by a
- // higher priority transition.
- if (next.Add(targetState.Id, out int index))
- {
- // Avoid copying the registers on the last transition from this state, reusing the registers instead
- Registers newRegisters = j != transitions.Count - 1 ? sourceRegisters.Clone() : sourceRegisters;
- newRegisters.ApplyEffects(effects, i);
- next.Update(index, targetState.Id, newRegisters);
- if (targetState.IsNullable(GetCharKind(input, i + 1)))
- {
- // Accepting state has been reached. Record the position.
- i_end = i;
- endRegisters.Assign(newRegisters);
- endState = targetState;
- // No lower priority transitions from this or other source states are taken because the
- // backtracking engines would return the match ending here.
- goto BreakNullable;
- }
- }
- }
- }
-
- BreakNullable:
- if (next.Count == 0)
- {
- // If all states died out some nullable state must have been seen before
- break;
- }
-
- // Swap the state sets and prepare for the next character
- SparseIntMap tmp = current;
- current = next;
- next = tmp;
- next.Clear();
- i++;
+ // Write back the local copies of the ref and out values.
+ currentState = state;
+ i = pos;
+ endPosition = endPos;
}
-
- Debug.Assert(i_end != input.Length && endState is not null);
- // Apply effects for finishing at the stored end state
- endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos),
- CharKind.Context(endState.PrevCharKind, GetCharKind(input, i_end + 1)), (Registers: endRegisters, Pos: i_end + 1));
- resultRegisters = endRegisters;
- return i_end;
}
///
@@ -639,30 +575,24 @@ private int FindEndPositionCapturing(ReadOnlySpan input, int i, out Regist
/// We need to find the earliest (lowest index) starting position that's not earlier than .
///
/// The input text.
- /// The ending position to walk backwards from. points at the last character of the match.
+ /// The ending position to walk backwards from. i points one past the last character of the match.
/// The initial starting location discovered in phase 1, a point we must not walk earlier than.
/// Per thread data reused between calls.
/// The found starting position for the match.
private int FindStartPosition(ReadOnlySpan input, int i, int matchStartBoundary, PerThreadData perThreadData)
{
Debug.Assert(i >= 0, $"{nameof(i)} == {i}");
- Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary < input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}");
+ Debug.Assert(matchStartBoundary >= 0 && matchStartBoundary <= input.Length, $"{nameof(matchStartBoundary)} == {matchStartBoundary}");
Debug.Assert(i >= matchStartBoundary, $"Expected {i} >= {matchStartBoundary}.");
// Get the starting state for the reverse pattern. This depends on previous character (which, because we're
- // going backwards, is character number i + 1).
- var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i + 1)]);
+ // going backwards, is character number i).
+ var currentState = new CurrentState(_reverseInitialStates[GetCharKind(input, i)]);
- // If the initial state is nullable, meaning it accepts the empty string, then we've already discovered
- // a valid starting position, and we just need to keep looking for an earlier one in case there is one.
int lastStart = -1; // invalid sentinel value
- if (currentState.DfaState!.IsNullable(GetCharKind(input, i)))
- {
- lastStart = i + 1;
- }
// Walk backwards to the furthest accepting state of the reverse pattern but no earlier than matchStartBoundary.
- SymbolicRegexBuilder builder = currentState.DfaState.Node._builder;
+ SymbolicRegexBuilder builder = currentState.DfaState!.Node._builder;
while (true)
{
// Run the DFA or NFA traversal backwards from the current point using the current state.
@@ -701,232 +631,145 @@ private bool FindStartPositionDeltas(SymbolicRegexBuilder b
// To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
int pos = i;
CurrentState state = currentState;
-
- // Loop backwards through each character in the input, transitioning from state to state for each.
- while (TryTakeTransition(builder, input, pos, ref state))
+ try
{
- // We successfully transitioned. If the new state is a dead end, we're done, as we must have already seen
- // and recorded a larger lastStart value that was the earliest valid starting position.
- if (TStateHandler.IsDeadend(ref state))
- {
- Debug.Assert(lastStart != -1);
- currentState = state;
- i = pos;
- return true;
- }
-
- // If the new state accepts the empty string, we found a valid starting position. Record it and keep going,
- // since we're looking for the earliest one to occur within bounds.
- if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos - 1)))
+ // Loop backwards through each character in the input, transitioning from state to state for each.
+ while (true)
{
- lastStart = pos;
- }
+ // If the state accepts the empty string, we found a valid starting position. Record it and keep going,
+ // since we're looking for the earliest one to occur within bounds.
+ if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos - 1)))
+ lastStart = pos;
+
+ // If we have reached the start threshold or if the state is a dead end, bail; we should have already
+ // found a valid starting location.
+ if (pos <= startThreshold || TStateHandler.IsDeadend(ref state))
+ {
+ Debug.Assert(lastStart != -1);
+ return true;
+ }
- // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input.
- pos--;
+ // Try to transition with the next character, the one before the current position.
+ if (!TryTakeTransition(builder, input, pos - 1, ref state))
+ // Return false to indicate the search didn't finish.
+ return false;
- // If doing so now puts us below the start threshold, bail; we should have already found a valid starting location.
- if (pos < startThreshold)
- {
- Debug.Assert(lastStart != -1);
- currentState = state;
- i = pos;
- return true;
+ // Since we successfully transitioned, update our current index to match the fact that we consumed the previous character in the input.
+ pos--;
}
}
-
- // Unable to transition further.
- currentState = state;
- i = pos;
- return false;
+ finally
+ {
+ // Write back the local copies of the ref values.
+ currentState = state;
+ i = pos;
+ }
}
- /// Performs the initial Phase 1 match to find the first final state encountered.
- /// The input text.
- /// The starting position in .
- /// The time at which timeout occurs, if timeouts are being checked.
- /// The last position the initial state of was visited.
- /// Length of the match if there's a match; otherwise, -1.
+
+ /// <summary>Run the pattern on a match to record the capture starts and ends.</summary>
+ /// <param name="input">input span</param>
+ /// <param name="i">inclusive start position</param>
+ /// <param name="iEnd">exclusive end position</param>
/// Per thread data reused between calls.
- /// The index into input that matches the final state, or NoMatchExists if no match exists. It returns -1 when i=0 and the initial state is nullable.
- private int FindFinalStatePosition(ReadOnlySpan input, int i, int timeoutOccursAt, out int initialStateIndex, out int matchLength, PerThreadData perThreadData)
+ /// <returns>the final register values, which indicate capture starts and ends</returns>
+ private Registers FindSubcaptures(ReadOnlySpan input, int i, int iEnd, PerThreadData perThreadData)
{
- matchLength = -1;
- initialStateIndex = i;
+ // Pick the correct start state based on previous character kind.
+ DfaMatchingState initialState = _initialStates[GetCharKind(input, i - 1)];
- // Start with the start state of the dot-star pattern, which in general depends on the previous character kind in the input in order to handle anchors.
- // If the starting state is a dead end, then no match exists.
- var currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]);
- if (currentState.DfaState!.IsNothing)
- {
- // This can happen, for example, when the original regex starts with a beginning anchor but the previous char kind is not Beginning.
- return NoMatchExists;
- }
+ Registers initialRegisters = perThreadData.InitialRegisters;
- // If the starting state accepts the empty string in this context (factoring in anchors), we're done.
- if (currentState.DfaState.IsNullable(GetCharKind(input, i)))
- {
- // The initial state is nullable in this context so at least an empty match exists.
- // The last position of the match is i - 1 because the match is empty.
- // This value is -1 if i == 0.
- return i - 1;
- }
+ // Initialize registers with -1, which means "not seen yet"
+ Array.Fill(initialRegisters.CaptureStarts, -1);
+ Array.Fill(initialRegisters.CaptureEnds, -1);
- // Otherwise, start searching from the current position until the end of the input.
- if ((uint)i < (uint)input.Length)
- {
- SymbolicRegexBuilder builder = currentState.DfaState.Node._builder;
- while (true)
- {
- // If we're at an initial state, try to search ahead for the next possible match location
- // using any find optimizations that may have previously been computed.
- if (currentState.DfaState is { IsInitialState: true })
- {
- // i is the most recent position in the input when the dot-star pattern is in the initial state
- initialStateIndex = i;
+ // Use two maps from state IDs to register values for the current and next set of states.
+ // Note that these maps use insertion order, which is used to maintain priorities between states in a way
+ // that matches the order the backtracking engines visit paths.
+ Debug.Assert(perThreadData.Current is not null && perThreadData.Next is not null);
+ SparseIntMap current = perThreadData.Current, next = perThreadData.Next;
+ current.Clear();
+ next.Clear();
+ current.Add(initialState.Id, initialRegisters);
- if (_findOpts is RegexFindOptimizations findOpts)
- {
- // Find the first position i that matches with some likely character.
- if (!findOpts.TryFindNextStartingPosition(input, ref i, 0))
- {
- // no match was found
- return NoMatchExists;
- }
+ SymbolicRegexBuilder builder = _builder;
- initialStateIndex = i;
+ while ((uint)i < (uint)iEnd)
+ {
+ Debug.Assert(next.Count == 0);
- // Update the starting state based on where TryFindNextStartingPosition moved us to.
- // As with the initial starting state, if it's a dead end, no match exists.
- currentState = new CurrentState(_dotstarredInitialStates[GetCharKind(input, i - 1)]);
- if (currentState.DfaState!.IsNothing)
- {
- return NoMatchExists;
- }
- }
- }
+ // Read the next character and find its minterm
+ int c = input[i];
+ int normalMintermId = _mintermClassifier.GetMintermID(c);
- // Now run the DFA or NFA traversal from the current point using the current state. If timeouts are being checked,
- // we need to pop out of the inner loop every now and then to do the timeout check in this outer loop.
- const int CharsPerTimeoutCheck = 10_000;
- ReadOnlySpan inputForInnerLoop = _checkTimeout && input.Length - i > CharsPerTimeoutCheck ?
- input.Slice(0, i + CharsPerTimeoutCheck) :
- input;
+ foreach ((int sourceId, Registers sourceRegisters) in current.Values)
+ {
+ Debug.Assert(builder._capturingStateArray is not null);
+ DfaMatchingState sourceState = builder._capturingStateArray[sourceId];
- int finalStatePosition;
- int findResult = currentState.NfaState is not null ?
- FindFinalStatePositionDeltas(builder, inputForInnerLoop, ref i, ref currentState, ref matchLength, out finalStatePosition) :
- FindFinalStatePositionDeltas(builder, inputForInnerLoop, ref i, ref currentState, ref matchLength, out finalStatePosition);
+ // Handle the special case for the last \n for states that start with a relevant anchor
+ int mintermId = c == '\n' && i == input.Length - 1 && sourceState.StartsWithLineAnchor ?
+ builder._minterms!.Length : // mintermId = minterms.Length represents an \n at the very end of input
+ normalMintermId;
+ TSet minterm = builder.GetMinterm(mintermId);
- // If we reached a final or deadend state, we're done.
- if (findResult > 0)
- {
- return finalStatePosition;
- }
+ // Get or create the transitions
+ int offset = (sourceId << builder._mintermsLog) | mintermId;
+ Debug.Assert(builder._capturingDelta is not null);
+ List<(DfaMatchingState, DerivativeEffect[])>? transitions =
+ builder._capturingDelta[offset] ??
+ CreateNewCapturingTransitions(sourceState, minterm, offset);
- // We're not at an end state, so we either ran out of input (in which case no match exists), hit an initial state (in which case
- // we want to loop around to apply our initial state processing logic and optimizations), or failed to transition (which should
- // only happen if we were in DFA mode and need to switch over to NFA mode). If we exited because we hit an initial state,
- // find result will be 0, otherwise negative.
- if (findResult < 0)
+ // Take the transitions in their prioritized order
+ for (int j = 0; j < transitions.Count; ++j)
{
- if (i >= input.Length)
- {
- // We ran out of input. No match.
- break;
- }
+ (DfaMatchingState targetState, DerivativeEffect[] effects) = transitions[j];
+ Debug.Assert(!targetState.IsDeadend, "Transitions should not include dead ends.");
- if (i < inputForInnerLoop.Length)
+ // Try to add the state and handle the case where it didn't exist before. If the state already
+ // exists, then the transition can be safely ignored, as the existing state was generated by a
+ // higher priority transition.
+ if (next.Add(targetState.Id, out int index))
{
- // We failed to transition. Upgrade to DFA mode.
- Debug.Assert(currentState.DfaState is not null);
- NfaMatchingState nfaState = perThreadData.NfaState;
- nfaState.InitializeFrom(currentState.DfaState);
- currentState = new CurrentState(nfaState);
+ // Avoid copying the registers on the last transition from this state, reusing the registers instead
+ Registers newRegisters = j != transitions.Count - 1 ? sourceRegisters.Clone() : sourceRegisters;
+ newRegisters.ApplyEffects(effects, i);
+ next.Update(index, targetState.Id, newRegisters);
+ if (targetState.IsNullable(GetCharKind(input, i + 1)))
+ {
+ // No lower priority transitions from this or other source states are taken because the
+ // backtracking engines would return the match ending here.
+ goto BreakNullable;
+ }
}
}
-
- // Check for a timeout before continuing.
- if (_checkTimeout)
- {
- DoCheckTimeout(timeoutOccursAt);
- }
}
- }
-
- // No match was found.
- return NoMatchExists;
- }
- ///
- /// Workhorse inner loop for . Consumes the character by character,
- /// starting at , for each character transitioning from one state in the DFA or NFA graph to the next state,
- /// lazily building out the graph as needed.
- ///
- ///
- /// The supplies the actual transitioning logic, controlling whether processing is
- /// performed in DFA mode or in NFA mode. However, it expects to be configured to match,
- /// so for example if is a , it expects the 's
- /// to be non-null and its to be null; vice versa for
- /// .
- ///
- ///
- /// A positive value if iteration completed because it reached a nullable or deadend state.
- /// 0 if iteration completed because we reached an initial state.
- /// A negative value if iteration completed because we ran out of input or we failed to transition.
- ///
- private int FindFinalStatePositionDeltas(SymbolicRegexBuilder builder, ReadOnlySpan input, ref int i, ref CurrentState currentState, ref int matchLength, out int finalStatePosition)
- where TStateHandler : struct, IStateHandler
- {
- // To avoid frequent reads/writes to ref values, make and operate on local copies, which we then copy back once before returning.
- int pos = i;
- CurrentState state = currentState;
+ BreakNullable:
+ // Swap the state sets and prepare for the next character
+ SparseIntMap tmp = current;
+ current = next;
+ next = tmp;
+ next.Clear();
+ i++;
+ }
- // Loop through each character in the input, transitioning from state to state for each.
- while ((uint)pos < (uint)input.Length && TryTakeTransition(builder, input, pos, ref state))
+ Debug.Assert(current.Count > 0);
+ Debug.Assert(_builder._capturingStateArray is not null);
+ foreach (var (endStateId, endRegisters) in current.Values)
{
- // We successfully transitioned for the character at index i. If the new state is nullable for
- // the next character, meaning it accepts the empty string, we found a final state and are done!
- if (TStateHandler.IsNullable(ref state, GetCharKind(input, pos + 1)))
+ DfaMatchingState endState = _builder._capturingStateArray[endStateId];
+ if (endState.IsNullable(GetCharKind(input, iEnd)))
{
- // Check whether there's a fixed-length marker for the current state. If there is, we can
- // use that length to optimize subsequent matching phases.
- matchLength = TStateHandler.FixedLength(ref state);
- currentState = state;
- i = pos;
- finalStatePosition = pos;
- return 1;
- }
-
- // If the new state is a dead end, such that we didn't match and we can't transition anywhere
- // else, then no match exists.
- if (TStateHandler.IsDeadend(ref state))
- {
- currentState = state;
- i = pos;
- finalStatePosition = NoMatchExists;
- return 1;
- }
-
- // We successfully transitioned, so update our current input index to match.
- pos++;
-
- // Now that currentState and our position are coherent, check if currentState represents an initial state.
- // If it does, we exit out in order to allow our find optimizations to kick in to hopefully more quickly
- // find the next possible starting location.
- if (TStateHandler.IsInitialState(ref state))
- {
- currentState = state;
- i = pos;
- finalStatePosition = 0;
- return 0;
+ // Apply effects for finishing at the stored end state
+ endState.Node.ApplyEffects((effect, args) => args.Registers.ApplyEffect(effect, args.Pos),
+ CharKind.Context(endState.PrevCharKind, GetCharKind(input, iEnd)), (Registers: endRegisters, Pos: iEnd));
+ return endRegisters;
}
}
-
- currentState = state;
- i = pos;
- finalStatePosition = 0;
- return -1;
+ Debug.Fail("No nullable state found in the set of end states");
+ return default;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
index 0a1668f5348c1b..dc62260b9e8ec2 100644
--- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
+++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexNode.cs
@@ -382,7 +382,7 @@ internal static SymbolicRegexNode CreateFalse(SymbolicRegexBuilder b
Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Empty, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode CreateTrue(SymbolicRegexBuilder builder) =>
- Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create(containsSomeCharacter: true));
+ Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, builder._solver.Full, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode CreateFixedLengthMarker(SymbolicRegexBuilder builder, int length) =>
Create(builder, SymbolicRegexNodeKind.FixedLengthMarker, null, null, length, -1, default, null, SymbolicRegexInfo.Create(isAlwaysNullable: true));
@@ -399,19 +399,22 @@ internal static SymbolicRegexNode CreateBeginEndAnchor(SymbolicRegexBuilde
SymbolicRegexNodeKind.BeginningAnchor or SymbolicRegexNodeKind.EndAnchor or
SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor);
- return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithLineAnchor: true, canBeNullable: true));
+ return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true,
+ startsWithLineAnchor: kind is
+ SymbolicRegexNodeKind.EndAnchorZ or SymbolicRegexNodeKind.EndAnchorZReverse or
+ SymbolicRegexNodeKind.EOLAnchor or SymbolicRegexNodeKind.BOLAnchor));
}
internal static SymbolicRegexNode CreateBoundaryAnchor(SymbolicRegexBuilder builder, SymbolicRegexNodeKind kind)
{
Debug.Assert(kind is SymbolicRegexNodeKind.BoundaryAnchor or SymbolicRegexNodeKind.NonBoundaryAnchor);
- return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithBoundaryAnchor: true, canBeNullable: true));
+ return Create(builder, kind, null, null, -1, -1, default, null, SymbolicRegexInfo.Create(startsWithSomeAnchor: true, canBeNullable: true));
}
#endregion
internal static SymbolicRegexNode CreateSingleton(SymbolicRegexBuilder builder, TSet set) =>
- Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create(containsSomeCharacter: !set.Equals(builder._solver.Empty)));
+ Create(builder, SymbolicRegexNodeKind.Singleton, null, null, -1, -1, set, null, SymbolicRegexInfo.Create());
internal static SymbolicRegexNode CreateLoop(SymbolicRegexBuilder builder, SymbolicRegexNode body, int lower, int upper, bool isLazy)
{
@@ -589,40 +592,6 @@ internal static SymbolicRegexNode OrderedOr(SymbolicRegexBuilder bui
Debug.Assert(left._kind != SymbolicRegexNodeKind.OrderedOr);
Debug.Assert(deduplicated);
- // Apply the counter subsumption/combining optimization if possible
- (SymbolicRegexNode loop, SymbolicRegexNode rest) = left.FirstCounterInfo();
- if (loop != builder._nothing)
- {
- Debug.Assert(loop._kind == SymbolicRegexNodeKind.Loop && loop._left is not null);
- (SymbolicRegexNode otherLoop, SymbolicRegexNode otherRest) = right.FirstCounterInfo();
- if (otherLoop != builder._nothing && rest == otherRest)
- {
- // Found two adjacent counters with the same continuation, check that the loops are equivalent apart from bounds
- // and that the bounds form a contiguous interval. Two integer intervals [x1,x2] and [y1,y2] overlap when
- // x1 <= y2 and y1 <= x2. The union of intervals that just touch is still contiguous, e.g. [2,5] and [6,10] make
- // [2,10], so the lower bounds are decremented by 1 in the check.
- Debug.Assert(otherLoop._kind == SymbolicRegexNodeKind.Loop && otherLoop._left is not null);
- if (loop._left == otherLoop._left && loop.IsLazy == otherLoop.IsLazy &&
- loop._lower - 1 <= otherLoop._upper && otherLoop._lower - 1 <= loop._upper)
- {
- // Loops are equivalent apart from bounds, and the union of the bounds is a contiguous interval
- // Build a new counter for the union of the ranges
- SymbolicRegexNode newCounter = CreateConcat(builder, CreateLoop(builder, loop._left,
- Math.Min(loop._lower, otherLoop._lower), Math.Max(loop._upper, otherLoop._upper), loop.IsLazy), rest);
- if (right._kind == SymbolicRegexNodeKind.OrderedOr)
- {
- // The right counter came from an or, so include the rest of that or
- Debug.Assert(right._right is not null);
- return OrderedOr(builder, newCounter, right._right, deduplicated: true);
- }
- else
- {
- return newCounter;
- }
- }
- }
- }
-
// Counter optimization did not apply, just build the or
return Create(builder, SymbolicRegexNodeKind.OrderedOr, left, right, -1, -1, default, null, SymbolicRegexInfo.Or(left._info, right._info));
}
@@ -1052,6 +1021,8 @@ internal SymbolicRegexNode CreateDerivative(TSet elem, uint context)
private void AddTransitions(TSet elem, uint context, List<(SymbolicRegexNode, DerivativeEffect[])> transitions,
List> continuation, Stack? effects, bool simulateBacktracking)
{
+ Debug.Assert(!_builder._solver.IsEmpty(elem), "False element or minterm should not make it into derivative construction.");
+
// Helper function for concatenating a head node and a list of continuation nodes. The continuation nodes
// are added in reverse order and the function below uses the list as a stack, so the nodes added to the
// stack first end up at the tail of the concatenation.
diff --git a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
index 99f458ad5b20e5..6e939ec1e50fd6 100644
--- a/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
+++ b/src/libraries/System.Text.RegularExpressions/tests/FunctionalTests/Regex.Match.Tests.cs
@@ -754,6 +754,9 @@ public static IEnumerable