Skip to content

Commit

Permalink
Some more cleanup to regex NonBacktracking (#104766)
Browse files Browse the repository at this point in the history
* Rent object[] rather than (uint,uint)[][] from the ArrayPool

* Remove unnecessary TInputReader generic from functions

* Add more comments and do some renames

* Remove unused TFindOptimizationsHandler from FindEndPositionDeltasNFA

* Fix a stray input reader

* Some more renames

* Avoid duplicated reads of input character and nullability info

* Remove initialStateId from TryFindNextStartingPosition and make initial accelerators more similar

* Remove unused initialStatePos / initialStatePosCandidate

It's only ever written and not actually used for anything.

* Remove unnecessary generic args and remove resulting dead code

Multiple XxDfa / XxNfa methods took a TStateHandler, but it was only ever DfaStateHandler for XxDfa or NfaStateHandler for XxNfa. We can just use the types directly in those methods, rather than generically parameterizing. Doing that revealed all but one of the members of IStateHandler weren't needed on the interface. And removing those revealed a bunch of dead code on DfaStateHandler/NfaStateHandler, as well as arguments to some methods that weren't used, all of which were removed.

* Put GetStateFlags back in IStateHandler and use it to avoid duplication at call sites

* Put out argument last in TryCreateNewTransition

* Store state to local in FindStartPositionDeltasDFA

* Merge IAcceleratedStateHandler into IInitialStateHandler

* Remove MintermClassifier.IntLookup
  • Loading branch information
stephentoub committed Jul 12, 2024
1 parent 9b09bcf commit b54bfdd
Show file tree
Hide file tree
Showing 5 changed files with 263 additions and 340 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,11 @@ public MintermClassifier(BDD[] minterms)
// in order to size the lookup array to minimize steady-state memory consumption of the potentially
// large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
// consumption to a minimum; doing so accommodates up to 255 minterms, which is the vast majority case.
// However, when there are more than 255 minterms, we need to use int[] _intLookup.
(uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);
// However, when there are more than 255 minterms, we need to use int[] _intLookup. We rent an object[]
// rather than a (uint,uint)[][] to avoid the extra type pressure on the ArrayPool (object[]s are common,
// (uint,uint)[][]s much less so).
object[] arrayPoolArray = ArrayPool<object>.Shared.Rent(minterms.Length);
Span<object> charRangesPerMinterm = arrayPoolArray.AsSpan(0, minterms.Length);

int maxChar = -1;
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
Expand All @@ -70,17 +73,17 @@ public MintermClassifier(BDD[] minterms)
}

// Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
Array.Clear(charRangesPerMinterm, 0, minterms.Length);
ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);
charRangesPerMinterm.Clear();
ArrayPool<object>.Shared.Return(arrayPoolArray);

// Creates the lookup array.
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
// Creates the lookup array. charRangesPerMinterm needs to have already been populated with (uint, uint)[] instances.
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<object> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
{
T[] lookup = new T[_maxChar + 1];
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
{
// Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
foreach ((uint start, uint end) in ((uint, uint)[])charRangesPerMinterm[mintermId])
{
lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
}
Expand All @@ -101,7 +104,9 @@ public int GetMintermID(int c)
}
else
{
int[] lookup = _intLookup!;
Debug.Assert(_intLookup is not null);

int[] lookup = _intLookup;
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
}
}
Expand All @@ -111,12 +116,6 @@ public int GetMintermID(int c)
/// </summary>
public byte[]? ByteLookup => _lookup;

/// <summary>
/// Gets a mapping from char to minterm for the rare case when there are &gt;= 255 minterms.
/// Null in the common case where there are fewer than 255 minterms.
/// </summary>
public int[]? IntLookup => _intLookup;

/// <summary>
/// Maximum ordinal character for a non-0 minterm, used to conserve memory
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ private static void ArrayResizeAndVolatilePublish<T>(ref T[] array, int newSize)
/// Pre-computed hot-loop version of nullability check
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool IsNullableWithContext(int stateId, int mintermId) =>
(_nullabilityArray[stateId] & (1 << (int)GetPositionKind(mintermId))) > 0;
private bool IsNullableWithContext(byte stateNullability, int mintermId) =>
(stateNullability & (1 << (int)GetPositionKind(mintermId))) != 0;

/// <summary>Returns the span from <see cref="_dfaDelta"/> that may contain transitions for the given state</summary>
private Span<int> GetDeltasFor(MatchingState<TSet> state)
Expand Down Expand Up @@ -355,9 +355,7 @@ private int GetCoreStateId(int nfaStateId)

/// <summary>Gets or creates a new DFA transition.</summary>
/// <remarks>This function locks the matcher for safe concurrent use of the <see cref="_builder"/></remarks>
private bool TryCreateNewTransition(
MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out MatchingState<TSet>? nextState,
long timeoutOccursAt = 0)
private bool TryCreateNewTransition(MatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, long timeoutOccursAt, [NotNullWhen(true)] out MatchingState<TSet>? nextState)
{
Debug.Assert(offset < _dfaDelta.Length);
lock (this)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,22 @@ public override void Explore(bool includeDotStarred, bool includeReverse, bool i
{
// Don't dequeue yet, because a transition might fail
MatchingState<TSet> state = toExplore.Peek();

// Include the special minterm for the last end-of-line if the state is sensitive to it
int maxMinterm = state.StartsWithLineAnchor ? _minterms!.Length : _minterms!.Length - 1;

// Explore successor states for each minterm
for (int mintermId = 0; mintermId <= maxMinterm; ++mintermId)
{
int offset = DeltaOffset(state.Id, mintermId);
if (!TryCreateNewTransition(state, mintermId, offset, true, out MatchingState<TSet>? nextState))
if (!TryCreateNewTransition(state, mintermId, offset, true, 0, out MatchingState<TSet>? nextState))
{
goto DfaLimitReached;
}

EnqueueIfUnseen(nextState, seen, toExplore);
}

// Safe to dequeue now that the state has been completely handled
toExplore.Dequeue();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public override IEnumerable<string> SampleMatches(int k, int randomseed)
NfaMatchingState states = new();
// Here one could also consider previous characters for example for \b, \B, and ^ anchors
// and initialize inputSoFar accordingly
states.InitializeFrom(this, _initialStates[GetCharKind<FullInputReader>([], -1)]);
states.InitializeFrom(this, _initialStates[GetCharKind([], -1)]);
CurrentState statesWrapper = new(states);

// Used for end suffixes
Expand Down
Loading

0 comments on commit b54bfdd

Please sign in to comment.