Review and clean up some code

Simplification, style consistency, dead code deletion, some bounds-check removal, etc.
dotnet · Jul 10, 2024 · 0a0f409 · 0a0f409
1 parent 0abaabe
commit 0a0f409
Show file tree

Hide file tree

Showing 10 changed files with 410 additions and 476 deletions.
diff --git a/...ies/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs b/...ies/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs
@@ -5,7 +5,6 @@
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 
 #pragma warning disable CS8500 // takes address of managed type
 

diff --git a/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs b/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversal.cs
@@ -1,15 +1,39 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-namespace System.Text.RegularExpressions.Symbolic;
+using System.Diagnostics;
 
-internal readonly struct MatchReversal<TSet>(
-    MatchReversalKind kind,
-    int fixedLength,
-    MatchingState<TSet>? adjustedStartState = null)
-    where TSet : IComparable<TSet>, IEquatable<TSet>
+namespace System.Text.RegularExpressions.Symbolic
 {
-    internal MatchReversalKind Kind { get; } = kind;
-    internal int FixedLength { get; } = fixedLength;
-    internal MatchingState<TSet>? AdjustedStartState { get; } = adjustedStartState;
+    /// <summary>Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed.</summary>
+    internal readonly struct MatchReversalInfo<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
+    {
+        /// <summary>Initializes the match reversal details.</summary>
+        internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
+        {
+            Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength);
+            Debug.Assert(fixedLength >= 0);
+            Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength));
+
+            Kind = kind;
+            FixedLength = fixedLength;
+            AdjustedStartState = adjustedStartState;
+        }
+
+        /// <summary>Gets the kind of the match reversal processing required.</summary>
+        internal MatchReversalKind Kind { get; }
+
+        /// <summary>Gets the fixed length of the match, if one is known.</summary>
+        /// <remarks>
+        /// For <see cref="MatchReversalKind.MatchStart"/>, this is ignored.
+        /// For <see cref="MatchReversalKind.FixedLength"/>, this is the full length of the match. The beginning may be found simply
+        /// by subtracting this length from the end.
+        /// For <see cref="MatchReversalKind.PartialFixedLength"/>, this is the length of fixed portion of the match.
+        /// </remarks>
+        internal int FixedLength { get; }
+
+        /// <summary>Gets the adjusted start state to use for partial fixed-length matches.</summary>
+        /// <remarks>This will be non-null iff <see cref="Kind"/> is <see cref="MatchReversalKind.PartialFixedLength"/>.</remarks>
+        internal MatchingState<TSet>? AdjustedStartState { get; }
+    }
 }
diff --git a/....Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs b/....Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchReversalKind.cs
@@ -1,14 +1,26 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-namespace System.Text.RegularExpressions.Symbolic;
-
-internal enum MatchReversalKind
+namespace System.Text.RegularExpressions.Symbolic
 {
-    /// <summary>The most generic option, run the regex backwards to find beginning of match</summary>
-    MatchStart,
-    /// <summary>Part of the reversal is fixed length and can be skipped</summary>
-    PartialFixedLength,
-    /// <summary>The entire pattern is fixed length, reversal not necessary</summary>
-    FixedLength
+    /// <summary>Specifies the kind of a <see cref="MatchReversalInfo{TSet}"/>.</summary>
+    internal enum MatchReversalKind
+    {
+        /// <summary>The regex should be run in reverse to find beginning of the match.</summary>
+        MatchStart,
+
+        /// <summary>The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match.</summary>
+        /// <remarks>
+        /// Reverse execution is not necessary for a subset of the match.
+        /// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the fixed portion.
+        /// </remarks>
+        PartialFixedLength,
+
+        /// <summary>The entire pattern is of a fixed length.</summary>
+        /// <remarks>
+        /// Reverse execution is not necessary to find the beginning of the match.
+        /// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the match.
+        /// </remarks>
+        FixedLength
+    }
 }
diff --git a/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs b/...stem.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs
@@ -17,8 +17,6 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
             NullabilityInfo = BuildNullabilityInfo();
         }
 
-        internal int NullabilityInfo { get; }
-
         /// <summary>The regular expression that labels this state and gives it its semantics.</summary>
         internal SymbolicRegexNode<TSet> Node { get; }
 
@@ -98,15 +96,31 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
             return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
         }
 
-        /// <summary>
-        /// Cached nullability check with encoded bits
-        /// </summary>
+        /// <summary>Determines whether the node is nullable for the given context.</summary>
+        /// <remarks>
+        /// This is functionally equivalent to <see cref="SymbolicRegexNode{TSet}.IsNullableFor(uint)"/>, but using cached
+        /// answers stored in <see cref="NullabilityInfo"/>.
+        /// </remarks>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         internal bool IsNullableFor(uint nextCharKind)
         {
-            return ((1 << (int)nextCharKind) & NullabilityInfo) != 0;
+            Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount);
+            return (NullabilityInfo & (1 << (int)nextCharKind)) != 0;
         }
 
+        /// <summary>Gets the nullability info for the matching state.</summary>
+        /// <remarks>
+        /// <list>
+        /// <item>00000 -> node cannot be nullable</item>
+        /// <item>00001 -> nullable for General</item>
+        /// <item>00010 -> nullable for BeginningEnd</item>
+        /// <item>00100 -> nullable for NewLine</item>
+        /// <item>01000 -> nullable for NewLineS</item>
+        /// <item>10000 -> nullable for WordLetter</item>
+        /// </list>
+        /// </remarks>
+        internal int NullabilityInfo { get; }
+
         /// <summary>
         /// Builds a <see cref="StateFlags"/> with the relevant flags set.
         /// </summary>
@@ -138,24 +152,16 @@ internal StateFlags BuildStateFlags(bool isInitial)
             return info;
         }
 
-        /// <summary>
-        /// Builds the nullability information for the matching state.
-        /// Nullability for each context is encoded in a bit
-        /// 0 means node cannot be nullable
-        /// 00001 -> nullable for General
-        /// 00010 -> nullable for BeginningEnd
-        /// 00100 -> nullable for NewLine
-        /// 01000 -> nullable for NewLineS
-        /// 10000 -> nullable for WordLetter
-        /// </summary>
-        internal byte BuildNullabilityInfo()
+        /// <summary>Builds the nullability information for the matching state.</summary>
+        /// <remarks>Nullability for each context is encoded in a bit. See <see cref="NullabilityInfo"/>.</remarks>
+        private byte BuildNullabilityInfo()
         {
             byte nullabilityInfo = 0;
             if (Node.CanBeNullable)
             {
-                for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
+                for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++)
                 {
-                    nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 1 << (int)ck : 0);
+                    nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0);
                 }
             }
 

diff --git a/....Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs b/....Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MintermClassifier.cs
@@ -1,7 +1,9 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+using System.Buffers;
 using System.Diagnostics;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 
 namespace System.Text.RegularExpressions.Symbolic
@@ -20,12 +22,12 @@ namespace System.Text.RegularExpressions.Symbolic
     /// </remarks>
     internal sealed class MintermClassifier
     {
-        /// <summary>An array used to map characters to minterms</summary>
+        /// <summary>Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms.</summary>
+        /// <remarks>_lookup[char] provides the minterm ID. If char &gt;= _lookup.Length, its minterm is 0.</remarks>
         private readonly byte[]? _lookup;
 
-        /// <summary>
-        /// Fallback lookup if over 255 minterms. This is rarely used.
-        /// </summary>
+        /// <summary>Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used.</summary>
+        /// <remarks>_intLookup[char] provides the minterm ID. If char &gt;= _intLookup.Length, its minterm is 0.</remarks>
         private readonly int[]? _intLookup;
 
         /// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
@@ -37,61 +39,64 @@ public MintermClassifier(BDD[] minterms)
             if (minterms.Length == 1)
             {
                 // With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
-                _lookup = Array.Empty<byte>();
+                _lookup = [];
                 return;
             }
 
-            int _maxChar = -1;
-            // attempt to save memory in common cases by allocating only up to the highest char code
+            // Compute all minterm ranges. We do this here in order to determine the maximum character value
+            // in order to size the lookup array to minimize steady-state memory consumption of the potentially
+            // large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
+            // consumption to a minimum; doing so accomodates up to 255 minterms, which is the vast majority case.
+            // However, when there are more than 255 minterms, we need to use int[] _intLookup.
+            (uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);
+
+            int maxChar = -1;
             for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
             {
-                _maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
+                (uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]);
+                charRangesPerMinterm[mintermId] = ranges;
+                maxChar = Math.Max(maxChar, (int)ranges[^1].Item2);
             }
 
-            // It's incredibly rare for a regex to use more than a hundred or two minterms,
-            // but we need a fallback just in case.
+            // It's incredibly rare for a regex to use more than a couple hundred minterms,
+            // but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.)
             if (minterms.Length > 255)
             {
-                // over 255 unique sets also means it's never ascii only
-                int[] lookup = new int[_maxChar + 1];
-                for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
-                {
-                    // precompute all assigned minterm categories
-                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                    foreach ((uint start, uint end) in mintermRanges)
-                    {
-                        // assign character ranges in bulk
-                        Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
-                        slice.Fill(mintermId);
-                    }
-                }
-                _intLookup = lookup;
+                _intLookup = CreateLookup<int>(minterms, charRangesPerMinterm, maxChar);
             }
             else
             {
-                byte[] lookup = new byte[_maxChar + 1];
+                _lookup = CreateLookup<byte>(minterms, charRangesPerMinterm, maxChar);
+            }
+
+            // Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
+            Array.Clear(charRangesPerMinterm, 0, minterms.Length);
+            ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);
+
+            // Creates the lookup array.
+            static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
+            {
+                T[] lookup = new T[_maxChar + 1];
                 for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
                 {
-                    // precompute all assigned minterm categories
-                    (uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
-                    foreach ((uint start, uint end) in mintermRanges)
+                    // Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
+                    foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
                     {
-                        // assign character ranges in bulk
-                        Span<byte> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
-                        slice.Fill((byte)mintermId);
+                        lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
                     }
                 }
-                _lookup = lookup;
+
+                return lookup;
             }
         }
 
         /// <summary>Gets the ID of the minterm associated with the specified character. </summary>
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public int GetMintermID(int c)
         {
-            if (_intLookup is null)
+            if (_lookup is not null)
             {
-                byte[] lookup = _lookup!;
+                byte[] lookup = _lookup;
                 return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
             }
             else
@@ -104,20 +109,17 @@ public int GetMintermID(int c)
         /// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
         /// Null if there are greater than 255 minterms.
         /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public byte[]? ByteLookup() => _lookup;
+        public byte[]? ByteLookup => _lookup;
 
         /// <summary>
         /// Gets a mapping from char to minterm for the rare case when there are &gt;= 255 minterms.
         /// Null in the common case where there are fewer than 255 minterms.
         /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public int[]? IntLookup() => _intLookup;
+        public int[]? IntLookup => _intLookup;
 
         /// <summary>
         /// Maximum ordinal character for a non-0 minterm, used to conserve memory
         /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public int MaxChar() => (_lookup?.Length ?? _intLookup!.Length) - 1;
+        public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1;
     }
 }