Skip to content

Commit

Permalink
Use trait-based classes to get everything inlined
Browse files Browse the repository at this point in the history
This will cost one extra `HashSet<string>` and one extra `SubstringComparer` to be allocated, but might make the code run faster.

Use the GSW strategy for virtual flattening
  • Loading branch information
IDisposable committed Jul 30, 2023
1 parent c1dd5d9 commit 7060c23
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 55 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ The System.Collections.Immutable library is built-in as part of the shared frame

<ItemGroup>
<Compile Include="Properties\InternalsVisibleTo.cs" />
<Compile Include="System\Collections\Frozen\String\SubstringComparers\SubstringComparer.cs" />
<Compile Include="System\Collections\Frozen\String\SubstringComparers\SubstringComparerBase.cs" />

<Compile Include="System\Polyfills.cs" />
<Compile Include="System\Collections\ThrowHelper.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Collections.Frozen.String.SubstringComparers;

namespace System.Collections.Frozen
{
Expand All @@ -29,40 +30,40 @@ internal static class KeyAnalyzer
public static AnalysisResults Analyze(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength)
{
Debug.Assert(uniqueStrings.Length > 0);
Debug.Assert(!uniqueStrings.IsEmpty);

if (minLength > 0)
{
const int MaxSubstringLengthLimit = 8; // arbitrary small-ish limit...it's not worth the increase in algorithmic complexity to analyze longer substrings
int uniqueStringsLength = uniqueStrings.Length;

// Sufficient uniqueness factor of 95% is good enough.
// Instead of ensuring that 95% of data is good, we stop when we know that at least 5% is bad.
int acceptableNonUniqueCount = uniqueStrings.Length / 20;
int acceptableNonUniqueCount = uniqueStringsLength / 20;

// Try to pick a substring comparer.
SubstringComparer comparer = ignoreCase ? new JustifiedCaseInsensitiveSubstringComparer() : new JustifiedSubstringComparer();
HashSet<string> set = new HashSet<string>(
#if NET6_0_OR_GREATER
uniqueStrings.Length,
#endif
comparer);
ISubstringComparer leftComparer = ignoreCase ? new LeftSubstringCaseInsensitiveComparer() : new LeftSubstringOrdinalComparer();
HashSet<string> leftSet = MakeHashSet(uniqueStringsLength, leftComparer);

// we lazily spin up the right comparators when/if needed
ISubstringComparer? rightComparer = null;
HashSet<string>? rightSet = null;

// For each substring length...preferring the shortest length that provides
// enough uniqueness
int maxSubstringLength = Math.Min(minLength, MaxSubstringLengthLimit);
for (int count = 1; count <= maxSubstringLength; count++)
{
comparer.Count = count;
leftComparer.Count = count;

// For each index, get a uniqueness factor for the left-justified substrings.
// If any is above our threshold, we're done.
for (int index = 0; index <= minLength - count; index++)
{
comparer.Index = index;
leftComparer.Index = index;

if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
if (HasSufficientUniquenessFactor(leftSet, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, index, count);
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, leftComparer);
}
}

Expand All @@ -72,29 +73,42 @@ public static AnalysisResults Analyze(
// right-justified substrings, and so we also check right-justification.
if (minLength != maxLength)
{
rightComparer ??= ignoreCase ? new RightSubstringCaseInsensitiveComparer() : new RightSubstringOrdinalComparer();
rightSet ??= MakeHashSet(uniqueStringsLength, rightComparer);

// when Index is negative, we're offsetting from the right, ensure we're at least
// far enough from the right that we have count characters available
comparer.Index = -count;
rightComparer!.Count = count;
rightComparer!.Index = -count;

// For each index, get a uniqueness factor for the right-justified substrings.
// If any is above our threshold, we're done.
for (int offset = 0; offset <= minLength - count; offset++, comparer.Index--)
for (int offset = 0; offset <= minLength - count; offset++, rightComparer!.Index--)
{
if (HasSufficientUniquenessFactor(set, uniqueStrings, acceptableNonUniqueCount))
if (HasSufficientUniquenessFactor(rightSet!, uniqueStrings, acceptableNonUniqueCount))
{
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, comparer.Index, count);
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, rightComparer);
}
}
}
}
}

// Could not find a substring index/length that was good enough, use the entire string.
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, 0, 0);
return CreateAnalysisResults(uniqueStrings, ignoreCase, minLength, maxLength, s_FullStringComparer);
}

/// <summary>Creates an empty string <see cref="HashSet{T}"/> that uses <paramref name="comparer"/> for equality and hashing.</summary>
/// <param name="length">Capacity passed to the set's constructor; only used when <c>NET6_0_OR_GREATER</c> is defined.</param>
/// <param name="comparer">Comparer the new set is constructed with.</param>
/// <returns>The newly constructed, empty set.</returns>
private static HashSet<string> MakeHashSet(int length, IEqualityComparer<string> comparer)
{
#if NET6_0_OR_GREATER
    // The (capacity, comparer) constructor is only selected on this target.
    return new HashSet<string>(length, comparer);
#else
    return new HashSet<string>(comparer);
#endif
}

private static AnalysisResults CreateAnalysisResults(
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, int index, int count)
ReadOnlySpan<string> uniqueStrings, bool ignoreCase, int minLength, int maxLength, ISubstringComparer comparer)
{
// Start off by assuming all strings are ASCII
bool allAsciiIfIgnoreCase = true;
Expand All @@ -113,7 +127,7 @@ private static AnalysisResults CreateAnalysisResults(
foreach (string s in uniqueStrings)
{
// Get the span for the substring.
ReadOnlySpan<char> substring = count == 0 ? s.AsSpan() : Slicer(s, index, count);
ReadOnlySpan<char> substring = comparer.Slice(s);

// If the substring isn't ASCII, bail out to return the results.
if (!IsAllAscii(substring))
Expand All @@ -139,7 +153,7 @@ private static AnalysisResults CreateAnalysisResults(
}

// Return the analysis results.
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, index, count, minLength, maxLength);
return new AnalysisResults(ignoreCase, allAsciiIfIgnoreCase, comparer.Index, comparer.Count, minLength, maxLength);
}

internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
Expand Down Expand Up @@ -184,7 +198,7 @@ internal static unsafe bool IsAllAscii(ReadOnlySpan<char> s)
#if NET8_0_OR_GREATER
private static readonly SearchValues<char> s_asciiLetters = SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
#endif
private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
internal static bool ContainsAnyLetters(ReadOnlySpan<char> s)
{
Debug.Assert(IsAllAscii(s));

Expand All @@ -203,14 +217,13 @@ private static bool ContainsAnyLetters(ReadOnlySpan<char> s)
#endif
}

private static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
internal static bool HasSufficientUniquenessFactor(HashSet<string> set, ReadOnlySpan<string> uniqueStrings, int acceptableNonUniqueCount)
{
set.Clear();

foreach (string s in uniqueStrings)
{
if (!set.Add(s) && acceptableNonUniqueCount-- <= 0)
if (!set.Add(s) && --acceptableNonUniqueCount < 0)
{
set.Clear();
return false;
}
}
Expand Down Expand Up @@ -241,34 +254,6 @@ public AnalysisResults(bool ignoreCase, bool allAsciiIfIgnoreCase, int hashIndex
public bool RightJustifiedSubstring => HashIndex < 0;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static ReadOnlySpan<char> Slicer(this string s, int index, int count) => s.AsSpan((index >= 0 ? index : s.Length + index), count);

private abstract class SubstringComparer : IEqualityComparer<string>
{
public int Index; // offset from left side (if positive) or right side (if negative) of the string
public int Count; // number of characters in the span

public abstract bool Equals(string? x, string? y);
public abstract int GetHashCode(string s);
}

private sealed class JustifiedSubstringComparer : SubstringComparer
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override bool Equals(string? x, string? y) => x!.Slicer(Index, Count).SequenceEqual(y!.Slicer(Index, Count));

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinal(s.Slicer(Index, Count));
}

private sealed class JustifiedCaseInsensitiveSubstringComparer : SubstringComparer
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override bool Equals(string? x, string? y) => x!.Slicer(Index, Count).Equals(y!.Slicer(Index, Count), StringComparison.OrdinalIgnoreCase);

[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override int GetHashCode(string s) => Hashing.GetHashCodeOrdinalIgnoreCase(s.Slicer(Index, Count));
}
// Shared comparer passed to CreateAnalysisResults when no substring index/length was good enough
// and the entire string is used instead. Marked readonly so the shared instance cannot be swapped
// out after type initialization. In the code visible here its Index and Count are never assigned,
// so they stay at their default of 0.
private static readonly FullStringComparer s_FullStringComparer = new FullStringComparer();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Runtime.CompilerServices;

namespace System.Collections.Frozen.String.SubstringComparers
{
/// <summary>
/// Substring comparer whose slice starts <c>Index</c> characters from the start of the string,
/// is <c>Count</c> characters long, and is compared with <c>SequenceEqual</c>.
/// </summary>
internal sealed class LeftSubstringOrdinalComparer : SubstringComparerBase<LeftSubstringOrdinalComparer.GSW>
{
    /// <summary>Struct wrapper that holds the owning comparer and implements the slice/equals/hash operations.</summary>
    internal struct GSW : IGenericSpecializedWrapper
    {
        private LeftSubstringOrdinalComparer _owner;

        /// <summary>Stores the owning comparer; <paramref name="this"/> is cast to <see cref="LeftSubstringOrdinalComparer"/>.</summary>
        public void Store(ISubstringComparer @this)
        {
            _owner = (LeftSubstringOrdinalComparer)@this;
        }

        /// <summary>Returns <c>Count</c> characters of <paramref name="s"/> starting at <c>Index</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public ReadOnlySpan<char> Slice(string s)
        {
            return s.AsSpan(_owner.Index, _owner.Count);
        }

        /// <summary>Compares the slices of <paramref name="x"/> and <paramref name="y"/> element by element.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool Equals(string? x, string? y)
        {
            ReadOnlySpan<char> left = Slice(x!);
            ReadOnlySpan<char> right = Slice(y!);
            return left.SequenceEqual(right);
        }

        /// <summary>Hashes the slice of <paramref name="s"/> with <c>Hashing.GetHashCodeOrdinal</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public int GetHashCode(string s)
        {
            ReadOnlySpan<char> slice = Slice(s);
            return Hashing.GetHashCodeOrdinal(slice);
        }
    }
}

/// <summary>
/// Substring comparer whose slice starts at <c>s.Length + Index</c>, is <c>Count</c> characters long,
/// and is compared with <c>SequenceEqual</c>.
/// </summary>
internal sealed class RightSubstringOrdinalComparer : SubstringComparerBase<RightSubstringOrdinalComparer.GSW>
{
    /// <summary>Struct wrapper that holds the owning comparer and implements the slice/equals/hash operations.</summary>
    internal struct GSW : IGenericSpecializedWrapper
    {
        private RightSubstringOrdinalComparer _owner;

        /// <summary>Stores the owning comparer; <paramref name="this"/> is cast to <see cref="RightSubstringOrdinalComparer"/>.</summary>
        public void Store(ISubstringComparer @this)
        {
            _owner = (RightSubstringOrdinalComparer)@this;
        }

        /// <summary>Returns <c>Count</c> characters of <paramref name="s"/> starting at <c>s.Length + Index</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public ReadOnlySpan<char> Slice(string s)
        {
            // Index is added to the string length to locate the start of the slice.
            return s.AsSpan(s.Length + _owner.Index, _owner.Count);
        }

        /// <summary>Compares the slices of <paramref name="x"/> and <paramref name="y"/> element by element.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool Equals(string? x, string? y)
        {
            ReadOnlySpan<char> left = Slice(x!);
            ReadOnlySpan<char> right = Slice(y!);
            return left.SequenceEqual(right);
        }

        /// <summary>Hashes the slice of <paramref name="s"/> with <c>Hashing.GetHashCodeOrdinal</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public int GetHashCode(string s)
        {
            ReadOnlySpan<char> slice = Slice(s);
            return Hashing.GetHashCodeOrdinal(slice);
        }
    }
}

/// <summary>
/// Substring comparer whose slice starts <c>Index</c> characters from the start of the string,
/// is <c>Count</c> characters long, and is compared with <see cref="StringComparison.OrdinalIgnoreCase"/>.
/// </summary>
internal sealed class LeftSubstringCaseInsensitiveComparer : SubstringComparerBase<LeftSubstringCaseInsensitiveComparer.GSW>
{
    /// <summary>Struct wrapper that holds the owning comparer and implements the slice/equals/hash operations.</summary>
    internal struct GSW : IGenericSpecializedWrapper
    {
        private LeftSubstringCaseInsensitiveComparer _owner;

        /// <summary>Stores the owning comparer; <paramref name="this"/> is cast to <see cref="LeftSubstringCaseInsensitiveComparer"/>.</summary>
        public void Store(ISubstringComparer @this)
        {
            _owner = (LeftSubstringCaseInsensitiveComparer)@this;
        }

        /// <summary>Returns <c>Count</c> characters of <paramref name="s"/> starting at <c>Index</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public ReadOnlySpan<char> Slice(string s)
        {
            return s.AsSpan(_owner.Index, _owner.Count);
        }

        /// <summary>Compares the slices of <paramref name="x"/> and <paramref name="y"/> using ordinal, case-insensitive comparison.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool Equals(string? x, string? y)
        {
            ReadOnlySpan<char> left = Slice(x!);
            ReadOnlySpan<char> right = Slice(y!);
            return left.Equals(right, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>Hashes the slice of <paramref name="s"/> with <c>Hashing.GetHashCodeOrdinalIgnoreCase</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public int GetHashCode(string s)
        {
            ReadOnlySpan<char> slice = Slice(s);
            return Hashing.GetHashCodeOrdinalIgnoreCase(slice);
        }
    }
}

/// <summary>
/// Substring comparer whose slice starts at <c>s.Length + Index</c>, is <c>Count</c> characters long,
/// and is compared with <see cref="StringComparison.OrdinalIgnoreCase"/>.
/// </summary>
internal sealed class RightSubstringCaseInsensitiveComparer : SubstringComparerBase<RightSubstringCaseInsensitiveComparer.GSW>
{
    /// <summary>Struct wrapper that holds the owning comparer and implements the slice/equals/hash operations.</summary>
    internal struct GSW : IGenericSpecializedWrapper
    {
        private RightSubstringCaseInsensitiveComparer _owner;

        /// <summary>Stores the owning comparer; <paramref name="this"/> is cast to <see cref="RightSubstringCaseInsensitiveComparer"/>.</summary>
        public void Store(ISubstringComparer @this)
        {
            _owner = (RightSubstringCaseInsensitiveComparer)@this;
        }

        /// <summary>Returns <c>Count</c> characters of <paramref name="s"/> starting at <c>s.Length + Index</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public ReadOnlySpan<char> Slice(string s)
        {
            // Index is added to the string length to locate the start of the slice.
            return s.AsSpan(s.Length + _owner.Index, _owner.Count);
        }

        /// <summary>Compares the slices of <paramref name="x"/> and <paramref name="y"/> using ordinal, case-insensitive comparison.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool Equals(string? x, string? y)
        {
            ReadOnlySpan<char> left = Slice(x!);
            ReadOnlySpan<char> right = Slice(y!);
            return left.Equals(right, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>Hashes the slice of <paramref name="s"/> with <c>Hashing.GetHashCodeOrdinalIgnoreCase</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public int GetHashCode(string s)
        {
            ReadOnlySpan<char> slice = Slice(s);
            return Hashing.GetHashCodeOrdinalIgnoreCase(slice);
        }
    }
}

/// <summary>
/// Comparer whose slice is the entire string (<c>Index</c> and <c>Count</c> are not consulted by <c>Slice</c>).
/// </summary>
/// <remarks>
/// NOTE(review): <c>Equals</c> and <c>GetHashCode</c> here are always ordinal case-insensitive; there is
/// no ordinal (case-sensitive) counterpart in this file. Confirm that is intended before using this
/// type as an actual equality comparer for case-sensitive data.
/// </remarks>
internal sealed class FullStringComparer : SubstringComparerBase<FullStringComparer.GSW>
{
    /// <summary>Struct wrapper that holds the owning comparer and implements the slice/equals/hash operations.</summary>
    internal struct GSW : IGenericSpecializedWrapper
    {
        private FullStringComparer _owner;

        /// <summary>Stores the owning comparer; <paramref name="this"/> is cast to <see cref="FullStringComparer"/>.</summary>
        public void Store(ISubstringComparer @this)
        {
            _owner = (FullStringComparer)@this;
        }

        /// <summary>Returns a span over all of <paramref name="s"/>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public ReadOnlySpan<char> Slice(string s)
        {
            return s.AsSpan();
        }

        /// <summary>Compares the full strings <paramref name="x"/> and <paramref name="y"/> using ordinal, case-insensitive comparison.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool Equals(string? x, string? y)
        {
            ReadOnlySpan<char> left = Slice(x!);
            ReadOnlySpan<char> right = Slice(y!);
            return left.Equals(right, StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>Hashes all of <paramref name="s"/> with <c>Hashing.GetHashCodeOrdinalIgnoreCase</c>.</summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public int GetHashCode(string s)
        {
            ReadOnlySpan<char> slice = Slice(s);
            return Hashing.GetHashCodeOrdinalIgnoreCase(slice);
        }
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Generic;
using System.Runtime.CompilerServices;

namespace System.Collections.Frozen.String.SubstringComparers
{
/// <summary>
/// String equality comparer contract that also exposes the <see cref="Index"/>/<see cref="Count"/>
/// window it is configured with and a way to obtain the corresponding slice of a string.
/// </summary>
internal interface ISubstringComparer : IEqualityComparer<string>
{
    /// <summary>Offset from the left side (if positive) or right side (if negative) of the string.</summary>
    int Index { get; set; }

    /// <summary>Number of characters in the span.</summary>
    int Count { get; set; }

    /// <summary>Returns the span of <paramref name="s"/> that the implementation operates on.</summary>
    ReadOnlySpan<char> Slice(string s);
}

/// <summary>
/// Base class for substring comparers. Every <see cref="ISubstringComparer"/> operation is forwarded
/// to a struct wrapper of type <typeparamref name="TThisWrapper"/> supplied by the derived type.
/// </summary>
/// <typeparam name="TThisWrapper">
/// Struct implementing <see cref="IGenericSpecializedWrapper"/> that performs the slicing, equality and hashing.
/// </typeparam>
internal abstract class SubstringComparerBase<TThisWrapper> : ISubstringComparer
    where TThisWrapper : struct, SubstringComparerBase<TThisWrapper>.IGenericSpecializedWrapper
{
    /// <summary>
    /// Contract for the struct wrapper, used to enable generic specialization with reference types.
    /// </summary>
    /// <remarks>
    /// The intent is to avoid virtual dispatch to the derived type: the derived type hands down a
    /// struct wrapper through which all calls are performed, and this base class uses that generic
    /// struct wrapper to specialize and devirtualize.
    /// </remarks>
    internal interface IGenericSpecializedWrapper
    {
        void Store(ISubstringComparer @this);
        ReadOnlySpan<char> Slice(string s);
        bool Equals(string? x, string? y);
        int GetHashCode(string s);
    }

    /// <summary>The wrapper all calls go through; it is handed this instance in the constructor.</summary>
    private readonly TThisWrapper _wrapper;

    /// <summary>Creates the wrapper and stores a reference to this comparer in it.</summary>
    protected SubstringComparerBase()
    {
        // Start from the zero-initialized struct, then let it capture this comparer.
        _wrapper = default;
        _wrapper.Store(this);
    }

    /// <inheritdoc/>
    public int Index { get; set; }

    /// <inheritdoc/>
    public int Count { get; set; }

    /// <summary>Returns the wrapper's slice of <paramref name="s"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public ReadOnlySpan<char> Slice(string s)
    {
        return _wrapper.Slice(s);
    }

    /// <summary>Returns the wrapper's equality result for <paramref name="x"/> and <paramref name="y"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public bool Equals(string? x, string? y)
    {
        return _wrapper.Equals(x, y);
    }

    /// <summary>Returns the wrapper's hash code for <paramref name="s"/>.</summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public int GetHashCode(string s)
    {
        return _wrapper.GetHashCode(s);
    }
}
}
Loading

0 comments on commit 7060c23

Please sign in to comment.