From 5b6887792fd47ad0e1997586c1d7754ab7596aa5 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 13 Apr 2024 01:07:44 +0200 Subject: [PATCH 1/3] Rework ProbabilisticMap character checks in SearchValues --- .../System.Memory/tests/Span/SearchValues.cs | 28 +- .../System.Private.CoreLib.Shared.projitems | 3 +- ...rchValues.cs => BitmapCharSearchValues.cs} | 58 +++- .../ProbabilisticCharSearchValues.cs | 16 +- .../System/SearchValues/ProbabilisticMap.cs | 103 +++--- .../SearchValues/ProbabilisticMapState.cs | 299 ++++++++++++++++++ .../ProbabilisticWithAsciiCharSearchValues.cs | 26 +- .../src/System/SearchValues/SearchValues.cs | 58 ++-- 8 files changed, 472 insertions(+), 119 deletions(-) rename src/libraries/System.Private.CoreLib/src/System/SearchValues/{Latin1CharSearchValues.cs => BitmapCharSearchValues.cs} (62%) create mode 100644 src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs diff --git a/src/libraries/System.Memory/tests/Span/SearchValues.cs b/src/libraries/System.Memory/tests/Span/SearchValues.cs index f2020279cdf4b9..f76584e15983e1 100644 --- a/src/libraries/System.Memory/tests/Span/SearchValues.cs +++ b/src/libraries/System.Memory/tests/Span/SearchValues.cs @@ -68,9 +68,31 @@ public static IEnumerable Values_MemberData() "\uFFFF\uFFFE\uFFFD\uFFFC\uFFFB\uFFFA", "\uFFFF\uFFFE\uFFFD\uFFFC\uFFFB\uFFFB", "\uFFFF\uFFFE\uFFFD\uFFFC\uFFFB\uFFF9", + new string('\u0080', 256) + '\u0082', + new string('\u0080', 100) + '\uF000', + new string('\u0080', 256) + '\uF000', + string.Concat(Enumerable.Range(128, 255).Select(i => (char)i)), + string.Concat(Enumerable.Range(128, 257).Select(i => (char)i)), + string.Concat(Enumerable.Range(128, 254).Select(i => (char)i)) + '\uF000', + string.Concat(Enumerable.Range(128, 256).Select(i => (char)i)) + '\uF000', + '\0' + string.Concat(Enumerable.Range(2, char.MaxValue - 1).Select(i => (char)i)), }; - return values.Select(v => new object[] { v, Encoding.Latin1.GetBytes(v) }); + foreach (string 
value in values) + { + yield return Pair(value); + yield return Pair('a' + value); + + // Test some more duplicates + if (value.Length > 0) + { + yield return Pair(value + value[0]); + yield return Pair(value[0] + value); + yield return Pair(value + value); + } + } + + static object[] Pair(string value) => new object[] { value, Encoding.Latin1.GetBytes(value) }; } [Theory] @@ -192,10 +214,12 @@ public static void SearchValues_Contains(string needle, byte[] byteNeedle) static void Test(ReadOnlySpan needle, SearchValues values) where T : struct, INumber, IMinMaxValue { + HashSet needleSet = needle.ToArray().ToHashSet(); + for (int i = int.CreateChecked(T.MaxValue); i >= 0; i--) { T t = T.CreateChecked(i); - Assert.Equal(needle.Contains(t), values.Contains(t)); + Assert.Equal(needleSet.Contains(t), values.Contains(t)); } } } diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index 428af3859c2a02..e27e89faba410e 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -435,6 +435,7 @@ + @@ -445,7 +446,7 @@ - + diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Latin1CharSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/BitmapCharSearchValues.cs similarity index 62% rename from src/libraries/System.Private.CoreLib/src/System/SearchValues/Latin1CharSearchValues.cs rename to src/libraries/System.Private.CoreLib/src/System/SearchValues/BitmapCharSearchValues.cs index 3968825ccd4c72..71e364cc618d13 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Latin1CharSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/BitmapCharSearchValues.cs @@ -1,34 +1,55 @@ // Licensed to the .NET Foundation under one or more agreements. 
// The .NET Foundation licenses this file to you under the MIT license. +using System.Collections.Generic; +using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; namespace System.Buffers { - internal sealed class Latin1CharSearchValues : SearchValues + internal sealed class BitmapCharSearchValues : SearchValues { - private readonly BitVector256 _lookup; + private readonly uint[] _bitmap; - public Latin1CharSearchValues(ReadOnlySpan values) + public BitmapCharSearchValues(ReadOnlySpan values, int maxInclusive) { + Debug.Assert(maxInclusive <= char.MaxValue); + + _bitmap = new uint[maxInclusive / 32 + 1]; + foreach (char c in values) { - if (c > 255) + _bitmap[c >> 5] |= 1u << c; + } + } + + internal override char[] GetValues() + { + var chars = new List(); + uint[] bitmap = _bitmap; + + for (int i = 0; i < _bitmap.Length * 32; i++) + { + if (Contains(bitmap, i)) { - // The values were modified concurrent with the call to SearchValues.Create - ThrowHelper.ThrowInvalidOperationException_InvalidOperation_EnumFailedVersion(); + chars.Add((char)i); } - - _lookup.Set(c); } - } - internal override char[] GetValues() => _lookup.GetCharValues(); + return chars.ToArray(); + } [MethodImpl(MethodImplOptions.AggressiveInlining)] internal override bool ContainsCore(char value) => - _lookup.Contains256(value); + Contains(_bitmap, value); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool Contains(uint[] bitmap, int value) + { + uint offset = (uint)(value >> 5); + return offset < (uint)bitmap.Length && (bitmap[offset] & (1u << value)) != 0; + } [MethodImpl(MethodImplOptions.AggressiveInlining)] internal override int IndexOfAny(ReadOnlySpan span) => @@ -51,11 +72,12 @@ private int IndexOfAny(ref char searchSpace, int searchSpaceLength) { ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); ref char cur = ref searchSpace; + uint[] bitmap = _bitmap; while (!Unsafe.AreSame(ref cur, ref 
searchSpaceEnd)) { char c = cur; - if (TNegator.NegateIfNeeded(_lookup.Contains256(c))) + if (TNegator.NegateIfNeeded(Contains(bitmap, c))) { return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); } @@ -69,16 +91,18 @@ private int IndexOfAny(ref char searchSpace, int searchSpaceLength) private int LastIndexOfAny(ref char searchSpace, int searchSpaceLength) where TNegator : struct, IndexOfAnyAsciiSearcher.INegator { - for (int i = searchSpaceLength - 1; i >= 0; i--) + uint[] bitmap = _bitmap; + + while (--searchSpaceLength >= 0) { - char c = Unsafe.Add(ref searchSpace, i); - if (TNegator.NegateIfNeeded(_lookup.Contains256(c))) + char c = Unsafe.Add(ref searchSpace, searchSpaceLength); + if (TNegator.NegateIfNeeded(Contains(bitmap, c))) { - return i; + break; } } - return -1; + return searchSpaceLength; } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs index 5ec79a3b8911c8..ab7b2dfd0da7a6 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs @@ -8,31 +8,31 @@ namespace System.Buffers { internal sealed class ProbabilisticCharSearchValues : SearchValues { - private ProbabilisticMap _map; + private ProbabilisticMapState _map; private readonly string _values; - public ProbabilisticCharSearchValues(scoped ReadOnlySpan values) + public ProbabilisticCharSearchValues(ReadOnlySpan values, int maxInclusive) { _values = new string(values); - _map = new ProbabilisticMap(_values); + _map = new ProbabilisticMapState(values, maxInclusive); } internal override char[] GetValues() => _values.ToCharArray(); [MethodImpl(MethodImplOptions.AggressiveInlining)] internal override bool ContainsCore(char value) => - ProbabilisticMap.Contains(ref Unsafe.As(ref 
_map), _values, value); + _map.FastContains(value); internal override int IndexOfAny(ReadOnlySpan span) => - ProbabilisticMap.IndexOfAny(ref Unsafe.As(ref _map), ref MemoryMarshal.GetReference(span), span.Length, _values); + ProbabilisticMap.IndexOfAny(ref MemoryMarshal.GetReference(span), span.Length, ref _map); internal override int IndexOfAnyExcept(ReadOnlySpan span) => - ProbabilisticMap.IndexOfAnySimpleLoop(ref MemoryMarshal.GetReference(span), span.Length, _values); + ProbabilisticMapState.IndexOfAnySimpleLoop(ref MemoryMarshal.GetReference(span), span.Length, ref _map); internal override int LastIndexOfAny(ReadOnlySpan span) => - ProbabilisticMap.LastIndexOfAny(ref Unsafe.As(ref _map), ref MemoryMarshal.GetReference(span), span.Length, _values); + ProbabilisticMap.LastIndexOfAny(ref MemoryMarshal.GetReference(span), span.Length, ref _map); internal override int LastIndexOfAnyExcept(ReadOnlySpan span) => - ProbabilisticMap.LastIndexOfAnySimpleLoop(ref MemoryMarshal.GetReference(span), span.Length, _values); + ProbabilisticMapState.LastIndexOfAnySimpleLoop(ref MemoryMarshal.GetReference(span), span.Length, ref _map); } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs index 60f9ea43f1cf9d..945189aa096d34 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMap.cs @@ -10,6 +10,8 @@ using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; +#pragma warning disable CS8500 // Takes the address of a managed type + namespace System.Buffers { /// Data structure used to optimize checks for whether a char is in a set of chars. 
@@ -97,7 +99,7 @@ internal static bool Contains(ref uint charMap, ReadOnlySpan values, int c Contains(values, (char)ch); [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool Contains(ReadOnlySpan values, char ch) => + internal static bool Contains(ReadOnlySpan values, char ch) => SpanHelpers.NonPackedContainsValueType( ref Unsafe.As(ref MemoryMarshal.GetReference(values)), (short)ch, @@ -343,78 +345,64 @@ public static int LastIndexOfAnyExcept(ref char searchSpace, int searchSpaceLeng } [MethodImpl(MethodImplOptions.NoInlining)] - private static int ProbabilisticIndexOfAny(ref char searchSpace, int searchSpaceLength, ref char values, int valuesLength) + private static unsafe int ProbabilisticIndexOfAny(ref char searchSpace, int searchSpaceLength, ref char values, int valuesLength) { var valuesSpan = new ReadOnlySpan(ref values, valuesLength); - var map = new ProbabilisticMap(valuesSpan); - ref uint charMap = ref Unsafe.As(ref map); + // ProbabilisticMapState can hold either a precomputed hash table or a pointer to the values. + // Precomputing the table is relatively expensive, so we only do it when using SearchValues where instances can be reused. + var state = new ProbabilisticMapState(&valuesSpan); - return IndexOfAny(ref charMap, ref searchSpace, searchSpaceLength, valuesSpan); + // The FalseConst here indicates that we can't use the fast character checks and must instead check the values span. 
+ return IndexOfAny(ref searchSpace, searchSpaceLength, ref state); } [MethodImpl(MethodImplOptions.NoInlining)] - private static int ProbabilisticLastIndexOfAny(ref char searchSpace, int searchSpaceLength, ref char values, int valuesLength) + private static unsafe int ProbabilisticLastIndexOfAny(ref char searchSpace, int searchSpaceLength, ref char values, int valuesLength) { var valuesSpan = new ReadOnlySpan(ref values, valuesLength); - var map = new ProbabilisticMap(valuesSpan); - ref uint charMap = ref Unsafe.As(ref map); + // ProbabilisticMapState can hold either a precomputed hash table or a pointer to the values. + // Precomputing the table is relatively expensive, so we only do it when using SearchValues where instances can be reused. + var state = new ProbabilisticMapState(&valuesSpan); - return LastIndexOfAny(ref charMap, ref searchSpace, searchSpaceLength, valuesSpan); + // The FalseConst here indicates that we can't use the fast character checks and must instead check the values span. + return LastIndexOfAny(ref searchSpace, searchSpaceLength, ref state); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int IndexOfAny(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) + internal static int IndexOfAny(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) + where TUseFastContains : struct, SearchValues.IRuntimeConst { if ((Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && searchSpaceLength >= 16) { return Vector512.IsHardwareAccelerated && Avx512Vbmi.VL.IsSupported - ? IndexOfAnyVectorizedAvx512(ref charMap, ref searchSpace, searchSpaceLength, values) - : IndexOfAnyVectorized(ref charMap, ref searchSpace, searchSpaceLength, values); + ? 
IndexOfAnyVectorizedAvx512(ref searchSpace, searchSpaceLength, ref state) + : IndexOfAnyVectorized(ref searchSpace, searchSpaceLength, ref state); } - ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); - ref char cur = ref searchSpace; - - while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) - { - int ch = cur; - if (Contains(ref charMap, values, ch)) - { - return MatchOffset(ref searchSpace, ref cur); - } - - cur = ref Unsafe.Add(ref cur, 1); - } - - return -1; + return ProbabilisticMapState.IndexOfAnySimpleLoop(ref searchSpace, searchSpaceLength, ref state); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int LastIndexOfAny(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) + internal static int LastIndexOfAny(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) + where TUseFastContains : struct, SearchValues.IRuntimeConst { - for (int i = searchSpaceLength - 1; i >= 0; i--) - { - int ch = Unsafe.Add(ref searchSpace, i); - if (Contains(ref charMap, values, ch)) - { - return i; - } - } + // TODO: Implement vectorized LastIndexOfAny. 
- return -1; + return ProbabilisticMapState.LastIndexOfAnySimpleLoop(ref searchSpace, searchSpaceLength, ref state); } [CompExactlyDependsOn(typeof(Avx512Vbmi.VL))] - private static int IndexOfAnyVectorizedAvx512(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) + private static int IndexOfAnyVectorizedAvx512(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) + where TUseFastContains : struct, SearchValues.IRuntimeConst { Debug.Assert(Avx512Vbmi.VL.IsSupported); Debug.Assert(searchSpaceLength >= 16); ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); - Vector256 charMap256 = Vector256.LoadUnsafe(ref Unsafe.As(ref charMap)); + Vector256 charMap256 = Vector256.LoadUnsafe(ref Unsafe.As(ref state.Map)); if (searchSpaceLength > 32) { @@ -431,7 +419,7 @@ private static int IndexOfAnyVectorizedAvx512(ref uint charMap, ref char searchS if (result != Vector512.Zero) { - if (TryFindMatch(ref cur, PackedSpanHelpers.FixUpPackedVector512Result(result).ExtractMostSignificantBits(), values, out int index)) + if (TryFindMatch(ref cur, PackedSpanHelpers.FixUpPackedVector512Result(result).ExtractMostSignificantBits(), ref state, out int index)) { return MatchOffset(ref searchSpace, ref cur) + index; } @@ -461,7 +449,7 @@ private static int IndexOfAnyVectorizedAvx512(ref uint charMap, ref char searchS if (result != Vector512.Zero) { - if (TryFindMatchOverlapped(ref searchSpace, searchSpaceLength, PackedSpanHelpers.FixUpPackedVector512Result(result).ExtractMostSignificantBits(), values, out int index)) + if (TryFindMatchOverlapped(ref searchSpace, searchSpaceLength, PackedSpanHelpers.FixUpPackedVector512Result(result).ExtractMostSignificantBits(), ref state, out int index)) { return index; } @@ -478,7 +466,7 @@ private static int IndexOfAnyVectorizedAvx512(ref uint charMap, ref char searchS if (result != Vector256.Zero) { - if (TryFindMatchOverlapped(ref searchSpace, searchSpaceLength, 
PackedSpanHelpers.FixUpPackedVector256Result(result).ExtractMostSignificantBits(), values, out int index)) + if (TryFindMatchOverlapped(ref searchSpace, searchSpaceLength, PackedSpanHelpers.FixUpPackedVector256Result(result).ExtractMostSignificantBits(), ref state, out int index)) { return index; } @@ -490,7 +478,8 @@ private static int IndexOfAnyVectorizedAvx512(ref uint charMap, ref char searchS [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] [CompExactlyDependsOn(typeof(Sse41))] - private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, int searchSpaceLength, ReadOnlySpan values) + private static int IndexOfAnyVectorized(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) + where TUseFastContains : struct, SearchValues.IRuntimeConst { Debug.Assert(Sse41.IsSupported || AdvSimd.Arm64.IsSupported); Debug.Assert(searchSpaceLength >= 16); @@ -498,8 +487,8 @@ private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); ref char cur = ref searchSpace; - Vector128 charMapLower = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap)); - Vector128 charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As(ref charMap), (nuint)Vector128.Count); + Vector128 charMapLower = Vector128.LoadUnsafe(ref Unsafe.As(ref state.Map)); + Vector128 charMapUpper = Vector128.LoadUnsafe(ref Unsafe.As(ref state.Map), (nuint)Vector128.Count); #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // In this case, we have an else clause which has the same semantic meaning whether or not Avx2 is considered supported or unsupported if (Avx2.IsSupported && searchSpaceLength >= 32) @@ -516,7 +505,7 @@ private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, if (result != Vector256.Zero) { - if (TryFindMatch(ref cur, PackedSpanHelpers.FixUpPackedVector256Result(result).ExtractMostSignificantBits(), values, out int index)) 
+ if (TryFindMatch(ref cur, PackedSpanHelpers.FixUpPackedVector256Result(result).ExtractMostSignificantBits(), ref state, out int index)) { return MatchOffset(ref searchSpace, ref cur) + index; } @@ -556,7 +545,7 @@ private static int IndexOfAnyVectorized(ref uint charMap, ref char searchSpace, if (result != Vector128.Zero) { - if (TryFindMatch(ref cur, result.ExtractMostSignificantBits(), values, out int index)) + if (TryFindMatch(ref cur, result.ExtractMostSignificantBits(), ref state, out int index)) { return MatchOffset(ref searchSpace, ref cur) + index; } @@ -584,13 +573,14 @@ private static int MatchOffset(ref char searchSpace, ref char cur) => (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool TryFindMatch(ref char cur, uint mask, ReadOnlySpan values, out int index) + private static bool TryFindMatch(ref char cur, uint mask, ref ProbabilisticMapState state, out int index) + where TUseFastContains : struct, SearchValues.IRuntimeConst { do { index = BitOperations.TrailingZeroCount(mask); - if (Contains(values, Unsafe.Add(ref cur, index))) + if (state.ConfirmProbabilisticMatch(Unsafe.Add(ref cur, index))) { return true; } @@ -604,7 +594,8 @@ private static bool TryFindMatch(ref char cur, uint mask, ReadOnlySpan val } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool TryFindMatchOverlapped(ref char cur, int searchSpaceLength, uint mask, ReadOnlySpan values, out int index) + private static bool TryFindMatchOverlapped(ref char cur, int searchSpaceLength, uint mask, ref ProbabilisticMapState state, out int index) + where TUseFastContains : struct, SearchValues.IRuntimeConst { do { @@ -617,7 +608,7 @@ private static bool TryFindMatchOverlapped(ref char cur, int searchSpaceLength, index += searchSpaceLength - (2 * Vector256.Count); } - if (Contains(values, Unsafe.Add(ref cur, index))) + if (state.ConfirmProbabilisticMatch(Unsafe.Add(ref cur, index))) 
{ return true; } @@ -631,13 +622,14 @@ private static bool TryFindMatchOverlapped(ref char cur, int searchSpaceLength, } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool TryFindMatch(ref char cur, ulong mask, ReadOnlySpan values, out int index) + private static bool TryFindMatch(ref char cur, ulong mask, ref ProbabilisticMapState state, out int index) + where TUseFastContains : struct, SearchValues.IRuntimeConst { do { index = BitOperations.TrailingZeroCount(mask); - if (Contains(values, Unsafe.Add(ref cur, index))) + if (state.ConfirmProbabilisticMatch(Unsafe.Add(ref cur, index))) { return true; } @@ -651,7 +643,8 @@ private static bool TryFindMatch(ref char cur, ulong mask, ReadOnlySpan va } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool TryFindMatchOverlapped(ref char cur, int searchSpaceLength, ulong mask, ReadOnlySpan values, out int index) + private static bool TryFindMatchOverlapped(ref char cur, int searchSpaceLength, ulong mask, ref ProbabilisticMapState state, out int index) + where TUseFastContains : struct, SearchValues.IRuntimeConst { do { @@ -664,7 +657,7 @@ private static bool TryFindMatchOverlapped(ref char cur, int searchSpaceLength, index += searchSpaceLength - (2 * Vector512.Count); } - if (Contains(values, Unsafe.Add(ref cur, index))) + if (state.ConfirmProbabilisticMatch(Unsafe.Add(ref cur, index))) { return true; } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs new file mode 100644 index 00000000000000..91804634ed664c --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs @@ -0,0 +1,299 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Collections; +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +#pragma warning disable CS8500 // Takes the address of a managed type + +namespace System.Buffers +{ + /// + /// Stores the state necessary to call vectorized members on , + /// as well as (optionally) a precomputed perfect hash table for faster single-character lookups/match confirmations. + /// When the hash table isn't available, the structure stores a pointer to the span of values in the set instead. + /// + internal unsafe struct ProbabilisticMapState + { + private const int MaxModulus = char.MaxValue + 1; + + public ProbabilisticMap Map; + + // Hash entries store each value from the set at the index determined by the remainder modulo the table size. + // As every value has a unique remainder, we can check if a value is contained in the set by checking + // _hashEntries[value % _hashEntries.Length] == value (see FastContains below). + // The multiplier is used for faster modulo operations when determining the hash table index. + // Exactly one of _hashEntries and _slowContainsValuesPtr may be initialized at the same time. + private readonly uint _multiplier; + private readonly char[]? _hashEntries; + private readonly ReadOnlySpan* _slowContainsValuesPtr; + + public ProbabilisticMapState(ReadOnlySpan values, int maxInclusive) + { + Debug.Assert(!values.IsEmpty); + + Map = new ProbabilisticMap(values); + + uint modulus = FindModulus(values, maxInclusive); + _multiplier = GetFastModMultiplier(modulus); + _hashEntries = new char[modulus]; + + // Some hash entries will remain unused. + // We can't leave them uninitialized as that would lead to false positives for values divisible by modulus. 
+ _hashEntries.AsSpan().Fill(values[0]); + + foreach (char c in values) + { + _hashEntries[FastMod(c, modulus, _multiplier)] = c; + } + } + + // valuesPtr must remain valid for as long as this ProbabilisticMapState is used. + public unsafe ProbabilisticMapState(ReadOnlySpan* valuesPtr) + { + Debug.Assert((IntPtr)valuesPtr != IntPtr.Zero); + + Map = new ProbabilisticMap(*valuesPtr); + _slowContainsValuesPtr = valuesPtr; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool FastContains(char value) + { + Debug.Assert(_hashEntries is not null); + Debug.Assert((IntPtr)_slowContainsValuesPtr == IntPtr.Zero); + + return FastContains(_hashEntries, _multiplier, value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool FastContains(char[] hashEntries, uint multiplier, char value) + { + ulong offset = FastMod(value, (uint)hashEntries.Length, multiplier); + Debug.Assert(offset < (ulong)hashEntries.Length); + + return Unsafe.Add(ref MemoryMarshal.GetArrayDataReference(hashEntries), (nuint)offset) == value; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool SlowProbabilisticContains(char value) + { + Debug.Assert(_hashEntries is null); + Debug.Assert((IntPtr)_slowContainsValuesPtr != IntPtr.Zero); + + return ProbabilisticMap.Contains( + ref Unsafe.As(ref Map), + *_slowContainsValuesPtr, + value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private bool SlowContains(char value) + { + Debug.Assert(_hashEntries is null); + Debug.Assert((IntPtr)_slowContainsValuesPtr != IntPtr.Zero); + + return ProbabilisticMap.Contains(*_slowContainsValuesPtr, value); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public bool ConfirmProbabilisticMatch(char value) + where TUseFastContains : struct, SearchValues.IRuntimeConst + { + if (TUseFastContains.Value) + { + return FastContains(value); + } + else + { + // We use SlowContains instead of SlowProbabilisticContains here as we've already checked 
+ // the value against the probabilistic filter and are now confirming the potential match. + return SlowContains(value); + } + } + + /// Finds a modulus where remainders for all values in the set are unique. + private static uint FindModulus(ReadOnlySpan values, int maxInclusive) + { + Debug.Assert(maxInclusive <= char.MaxValue); + + int modulus = HashHelpers.GetPrime(values.Length); + bool removedDuplicates = false; + + if (modulus >= maxInclusive) + { + return (uint)(maxInclusive + 1); + } + + while (true) + { + if (modulus >= maxInclusive) + { + // Try to remove duplicates and try again. + if (!removedDuplicates && TryRemoveDuplicates(values, out char[]? deduplicated)) + { + removedDuplicates = true; + values = deduplicated; + modulus = HashHelpers.GetPrime(values.Length); + continue; + } + + return (uint)(maxInclusive + 1); + } + + if (TestModulus(values, modulus)) + { + return (uint)modulus; + } + + modulus = HashHelpers.GetPrime(modulus + 1); + } + + static bool TestModulus(ReadOnlySpan values, int modulus) + { + Debug.Assert(modulus < MaxModulus); + + bool[] seen = ArrayPool.Shared.Rent(modulus); + seen.AsSpan(0, modulus).Clear(); + + uint multiplier = GetFastModMultiplier((uint)modulus); + + foreach (char c in values) + { + ulong index = FastMod(c, (uint)modulus, multiplier); + + if (seen[index]) + { + ArrayPool.Shared.Return(seen); + return false; + } + + seen[index] = true; + } + + // Saw no duplicates. + ArrayPool.Shared.Return(seen); + return true; + } + + static bool TryRemoveDuplicates(ReadOnlySpan values, [NotNullWhen(true)] out char[]? deduplicated) + { + HashSet unique = [.. values]; + + if (unique.Count == values.Length) + { + deduplicated = null; + return false; + } + + deduplicated = new char[unique.Count]; + unique.CopyTo(deduplicated); + return true; + } + } + + // This is a variant of HashHelpers.GetFastModMultiplier, specialized for smaller divisors (<= 65536). 
+ private static uint GetFastModMultiplier(uint divisor) + { + Debug.Assert(divisor > 0); + Debug.Assert(divisor <= MaxModulus); + + return uint.MaxValue / divisor + 1; + } + + // This is a faster variant of HashHelpers.FastMod, specialized for smaller divisors (<= 65536). + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static ulong FastMod(char value, uint divisor, uint multiplier) + { + Debug.Assert(multiplier == GetFastModMultiplier(divisor)); + + ulong result = ((ulong)(multiplier * value) * divisor) >> 32; + + Debug.Assert(result == (value % divisor)); + return result; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int IndexOfAnySimpleLoop(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) + where TUseFastContains : struct, SearchValues.IRuntimeConst + where TNegator : struct, IndexOfAnyAsciiSearcher.INegator + { + ref char searchSpaceEnd = ref Unsafe.Add(ref searchSpace, searchSpaceLength); + ref char cur = ref searchSpace; + + if (TUseFastContains.Value) + { + Debug.Assert(state._hashEntries is not null); + + char[] hashEntries = state._hashEntries; + uint multiplier = state._multiplier; + + while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + { + char c = cur; + if (TNegator.NegateIfNeeded(FastContains(hashEntries, multiplier, c))) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); + } + + cur = ref Unsafe.Add(ref cur, 1); + } + } + else + { + while (!Unsafe.AreSame(ref cur, ref searchSpaceEnd)) + { + char c = cur; + if (TNegator.NegateIfNeeded(state.SlowProbabilisticContains(c))) + { + return (int)((nuint)Unsafe.ByteOffset(ref searchSpace, ref cur) / sizeof(char)); + } + + cur = ref Unsafe.Add(ref cur, 1); + } + } + + return -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int LastIndexOfAnySimpleLoop(ref char searchSpace, int searchSpaceLength, ref ProbabilisticMapState state) + where TUseFastContains : struct, 
SearchValues.IRuntimeConst + where TNegator : struct, IndexOfAnyAsciiSearcher.INegator + { + if (TUseFastContains.Value) + { + Debug.Assert(state._hashEntries is not null); + + char[] hashEntries = state._hashEntries; + uint multiplier = state._multiplier; + + while (--searchSpaceLength >= 0) + { + char c = Unsafe.Add(ref searchSpace, searchSpaceLength); + if (TNegator.NegateIfNeeded(FastContains(hashEntries, multiplier, c))) + { + break; + } + } + } + else + { + while (--searchSpaceLength >= 0) + { + char c = Unsafe.Add(ref searchSpace, searchSpaceLength); + if (TNegator.NegateIfNeeded(state.SlowProbabilisticContains(c))) + { + break; + } + } + } + + return searchSpaceLength; + } + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs index b9b9227aa3ff6c..0d14354552b40a 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs @@ -15,10 +15,10 @@ internal sealed class ProbabilisticWithAsciiCharSearchValues : S { private IndexOfAnyAsciiSearcher.AsciiState _asciiState; private IndexOfAnyAsciiSearcher.AsciiState _inverseAsciiState; - private ProbabilisticMap _map; + private ProbabilisticMapState _map; private readonly string _values; - public ProbabilisticWithAsciiCharSearchValues(scoped ReadOnlySpan values) + public ProbabilisticWithAsciiCharSearchValues(ReadOnlySpan values, int maxInclusive) { Debug.Assert(IndexOfAnyAsciiSearcher.IsVectorizationSupported); Debug.Assert(values.ContainsAnyInRange((char)0, (char)127)); @@ -27,14 +27,14 @@ public ProbabilisticWithAsciiCharSearchValues(scoped ReadOnlySpan values) _inverseAsciiState = _asciiState.CreateInverse(); _values = new string(values); - _map = new ProbabilisticMap(_values); + 
_map = new ProbabilisticMapState(_values, maxInclusive); } internal override char[] GetValues() => _values.ToCharArray(); [MethodImpl(MethodImplOptions.AggressiveInlining)] internal override bool ContainsCore(char value) => - ProbabilisticMap.Contains(ref Unsafe.As(ref _map), _values, value); + _map.FastContains(value); internal override int IndexOfAny(ReadOnlySpan span) { @@ -83,11 +83,10 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(span)), span = span.Slice(offset); } - int index = ProbabilisticMap.IndexOfAny( - ref Unsafe.As(ref _map), + int index = ProbabilisticMap.IndexOfAny( ref MemoryMarshal.GetReference(span), span.Length, - _values); + ref _map); if (index >= 0) { @@ -122,10 +121,10 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(span)), span = span.Slice(offset); } - int index = ProbabilisticMap.IndexOfAnySimpleLoop( + int index = ProbabilisticMapState.IndexOfAnySimpleLoop( ref MemoryMarshal.GetReference(span), span.Length, - _values); + ref _map); if (index >= 0) { @@ -183,11 +182,10 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(span)), span = span.Slice(0, offset + 1); } - return ProbabilisticMap.LastIndexOfAny( - ref Unsafe.As(ref _map), + return ProbabilisticMap.LastIndexOfAny( ref MemoryMarshal.GetReference(span), span.Length, - _values); + ref _map); } internal override int LastIndexOfAnyExcept(ReadOnlySpan span) @@ -212,10 +210,10 @@ ref Unsafe.As(ref MemoryMarshal.GetReference(span)), span = span.Slice(0, offset + 1); } - return ProbabilisticMap.LastIndexOfAnySimpleLoop( + return ProbabilisticMapState.LastIndexOfAnySimpleLoop( ref MemoryMarshal.GetReference(span), span.Length, - _values); + ref _map); } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs index 3c1805afdf6122..5f997b554104b5 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs +++ 
b/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs @@ -3,9 +3,7 @@ using System.Diagnostics; using System.Numerics; -using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; @@ -159,36 +157,52 @@ public static SearchValues Create(ReadOnlySpan values) return new Any5SearchValues(shortValues); } - scoped ReadOnlySpan probabilisticValues = values; - - if (Vector128.IsHardwareAccelerated && values.Length < 8) - { - // ProbabilisticMap does a Span.Contains check to confirm potential matches. - // If we have fewer than 8 values, pad them with existing ones to make the verification faster. - Span newValues = stackalloc char[8]; - newValues.Fill(values[0]); - values.CopyTo(newValues); - probabilisticValues = newValues; - } - if (IndexOfAnyAsciiSearcher.IsVectorizationSupported && minInclusive < 128) { // If we have both ASCII and non-ASCII characters, use an implementation that // does an optimistic ASCII fast-path and then falls back to the ProbabilisticMap. - return (Ssse3.IsSupported || PackedSimd.IsSupported) && probabilisticValues.Contains('\0') - ? new ProbabilisticWithAsciiCharSearchValues(probabilisticValues) - : new ProbabilisticWithAsciiCharSearchValues(probabilisticValues); + return (Ssse3.IsSupported || PackedSimd.IsSupported) && values.Contains('\0') + ? new ProbabilisticWithAsciiCharSearchValues(values, maxInclusive) + : new ProbabilisticWithAsciiCharSearchValues(values, maxInclusive); } - // We prefer using the ProbabilisticMap over Latin1CharSearchValues if the former is vectorized. - if (!(Sse41.IsSupported || AdvSimd.Arm64.IsSupported) && maxInclusive < 256) + if (ShouldUseProbabilisticMap(values.Length, maxInclusive)) { - // This will also match ASCII values when IndexOfAnyAsciiSearcher is not supported. 
- return new Latin1CharSearchValues(values); + return new ProbabilisticCharSearchValues(values, maxInclusive); } - return new ProbabilisticCharSearchValues(probabilisticValues); + // This will also match ASCII values when IndexOfAnyAsciiSearcher is not supported. + return new BitmapCharSearchValues(values, maxInclusive); + + static bool ShouldUseProbabilisticMap(int valuesLength, int maxInclusive) + { + // *Rough estimates*. The current implementation uses 256 bits for the bloom filter. + // If the implementation is vectorized we can get away with a decent false positive rate. + const int MaxValuesForProbabilisticMap = 256; + + if (valuesLength > MaxValuesForProbabilisticMap) + { + // If the number of values is too high, we won't see any benefits from the 'probabilistic' part. + return false; + } + + if (Sse41.IsSupported || AdvSimd.Arm64.IsSupported) + { + // If the probabilistic map is vectorized, we prefer it. + return true; + } + + // The probabilistic map is more memory efficient for sparse sets, while the bitmap is more efficient for dense sets. + int bitmapFootprintBytesEstimate = 64 + (maxInclusive / 8); + int probabilisticFootprintBytesEstimate = 128 + (valuesLength * 6); + + // The bitmap is a bit faster than the perfect hash checks employed by the probabilistic map. + // Sacrifice some memory usage for faster lookups.
+ const int AcceptableSizeMultiplier = 2; + + return AcceptableSizeMultiplier * probabilisticFootprintBytesEstimate < bitmapFootprintBytesEstimate; + } } /// From c29dd3e341534c125646ebf19138ed2599fc98b9 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sat, 13 Apr 2024 04:06:37 +0200 Subject: [PATCH 2/3] Reduce footprint of ProbMap SearchValues --- .../SearchValues/ProbabilisticCharSearchValues.cs | 5 ++--- .../src/System/SearchValues/ProbabilisticMapState.cs | 10 ++++++++++ .../ProbabilisticWithAsciiCharSearchValues.cs | 7 +++---- .../src/System/SearchValues/SearchValues.cs | 2 +- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs index ab7b2dfd0da7a6..2914407b0cc1be 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticCharSearchValues.cs @@ -9,15 +9,14 @@ namespace System.Buffers internal sealed class ProbabilisticCharSearchValues : SearchValues { private ProbabilisticMapState _map; - private readonly string _values; public ProbabilisticCharSearchValues(ReadOnlySpan values, int maxInclusive) { - _values = new string(values); _map = new ProbabilisticMapState(values, maxInclusive); } - internal override char[] GetValues() => _values.ToCharArray(); + internal override char[] GetValues() => + _map.GetValues(); [MethodImpl(MethodImplOptions.AggressiveInlining)] internal override bool ContainsCore(char value) => diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs index 91804634ed664c..601ca6a9f8a76d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs +++ 
b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs @@ -61,6 +61,16 @@ public unsafe ProbabilisticMapState(ReadOnlySpan* valuesPtr) _slowContainsValuesPtr = valuesPtr; } + public char[] GetValues() + { + Debug.Assert(_hashEntries is not null); + + var unique = new HashSet(_hashEntries); + char[] values = new char[unique.Count]; + unique.CopyTo(values); + return values; + } + [MethodImpl(MethodImplOptions.AggressiveInlining)] public bool FastContains(char value) { diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs index 0d14354552b40a..7b05c3f8a3b77c 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticWithAsciiCharSearchValues.cs @@ -16,7 +16,6 @@ internal sealed class ProbabilisticWithAsciiCharSearchValues : S private IndexOfAnyAsciiSearcher.AsciiState _asciiState; private IndexOfAnyAsciiSearcher.AsciiState _inverseAsciiState; private ProbabilisticMapState _map; - private readonly string _values; public ProbabilisticWithAsciiCharSearchValues(ReadOnlySpan values, int maxInclusive) { @@ -26,11 +25,11 @@ public ProbabilisticWithAsciiCharSearchValues(ReadOnlySpan values, int max IndexOfAnyAsciiSearcher.ComputeAsciiState(values, out _asciiState); _inverseAsciiState = _asciiState.CreateInverse(); - _values = new string(values); - _map = new ProbabilisticMapState(_values, maxInclusive); + _map = new ProbabilisticMapState(values, maxInclusive); } - internal override char[] GetValues() => _values.ToCharArray(); + internal override char[] GetValues() => + _map.GetValues(); [MethodImpl(MethodImplOptions.AggressiveInlining)] internal override bool ContainsCore(char value) => diff --git 
a/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs index 5f997b554104b5..6e08d95d2e4002 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/SearchValues.cs @@ -195,7 +195,7 @@ static bool ShouldUseProbabilisticMap(int valuesLength, int maxInclusive) // The probabilistic map is more memory efficient for sparse sets, while the bitmap is more efficient for dense sets. int bitmapFootprintBytesEstimate = 64 + (maxInclusive / 8); - int probabilisticFootprintBytesEstimate = 128 + (valuesLength * 6); + int probabilisticFootprintBytesEstimate = 128 + (valuesLength * 4); // The bitmap is a bit faster than the perfect hash checks employed by the probabilistic map. // Sacrifice some memory usage for faster lookups. From f1966443c411ac6c6976647860af1ac04c355407 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Tue, 23 Apr 2024 03:32:47 -0700 Subject: [PATCH 3/3] Update misleading comment --- .../src/System/SearchValues/ProbabilisticMapState.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs index 601ca6a9f8a76d..c1631feabf807f 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/ProbabilisticMapState.cs @@ -43,7 +43,8 @@ public ProbabilisticMapState(ReadOnlySpan values, int maxInclusive) _hashEntries = new char[modulus]; // Some hash entries will remain unused. - // We can't leave them uninitialized as that would lead to false positives for values divisible by modulus. + // We can't leave them uninitialized as we would otherwise erroneously match (char)0.
+ // The exact value doesn't matter, as long as it's in the set of our values. _hashEntries.AsSpan().Fill(values[0]); foreach (char c in values)