Skip to content

Commit

Permalink
Expand small value sets to all case permutations in SearchValues<stri…
Browse files Browse the repository at this point in the history
…ng> (#98902)

* Expand small value sets to all case permutations in SearchValues<string>

* Use the constant in one more place
  • Loading branch information
MihaZupan authored Mar 2, 2024
1 parent cd5587d commit 8aff565
Showing 1 changed file with 79 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

using System.Collections.Generic;
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Globalization;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
Expand All @@ -14,6 +15,8 @@ namespace System.Buffers
{
internal static class StringSearchValues
{
private const int TeddyBucketCount = 8;

private static readonly SearchValues<char> s_asciiLetters =
SearchValues.Create("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

Expand Down Expand Up @@ -248,6 +251,18 @@ static SearchValues<string> PickAhoCorasickImplementation<TCaseSensitivity>(AhoC

Debug.Assert(!(asciiStartLettersOnly && asciiStartUnaffectedByCaseConversion));

// If we still have empty buckets we could use and we're ignoring case, we may be able to
// generate all possible permutations of the first N characters and switch to case-sensitive searching.
// E.g. ["ab", "c!"] => ["ab", "Ab" "aB", "AB", "c!", "C!"].
// This won't apply to inputs with many letters (e.g. "abc" => 8 permutations on its own).
if (!asciiStartUnaffectedByCaseConversion &&
values.Length < TeddyBucketCount &&
TryGenerateAllCasePermutationsForPrefixes(values, n, TeddyBucketCount, out string[]? newValues))
{
asciiStartUnaffectedByCaseConversion = true;
values = newValues;
}

if (asciiStartUnaffectedByCaseConversion)
{
return nonAsciiAffectedByCaseConversion
Expand Down Expand Up @@ -278,9 +293,9 @@ private static SearchValues<string> PickTeddyImplementation<TStartCaseSensitivit
Debug.Assert(values.Length > 1);
Debug.Assert(n is 2 or 3);

if (values.Length > 8)
if (values.Length > TeddyBucketCount)
{
string[][] buckets = TeddyBucketizer.Bucketize(values, bucketCount: 8, n);
string[][] buckets = TeddyBucketizer.Bucketize(values, TeddyBucketCount, n);

// Potential optimization: We don't have to pick the first N characters for the fingerprint.
// Different offset selection can noticeably improve throughput (e.g. 2x).
Expand All @@ -297,6 +312,68 @@ private static SearchValues<string> PickTeddyImplementation<TStartCaseSensitivit
}
}

private static bool TryGenerateAllCasePermutationsForPrefixes(ReadOnlySpan<string> values, int n, int maxValues, [NotNullWhen(true)] out string[]? newValues)
{
Debug.Assert(n is 2 or 3);
Debug.Assert(values.Length < maxValues);

// Count how many possible permutations there are.
int newValuesCount = 0;

foreach (string value in values)
{
int permutations = 1;

foreach (char c in value.AsSpan(0, n))
{
Debug.Assert(char.IsAscii(c));

if (char.IsAsciiLetter(c))
{
permutations *= 2;
}
}

newValuesCount += permutations;
}

Debug.Assert(newValuesCount > values.Length, "Shouldn't have been called if there were no letters present");

if (newValuesCount > maxValues)
{
newValues = null;
return false;
}

// Generate the permutations.
newValues = new string[newValuesCount];
newValuesCount = 0;

foreach (string value in values)
{
int start = newValuesCount;

newValues[newValuesCount++] = value;

for (int i = 0; i < n; i++)
{
char c = value[i];

if (char.IsAsciiLetter(c))
{
// Copy all the previous permutations of this value but change the casing of the i-th character.
foreach (string previous in newValues.AsSpan(start, newValuesCount - start))
{
newValues[newValuesCount++] = $"{previous.AsSpan(0, i)}{(char)(c ^ 0x20)}{previous.AsSpan(i + 1)}";
}
}
}
}

Debug.Assert(newValuesCount == newValues.Length);
return true;
}

private static SearchValues<string> CreateForSingleValue(
string value,
HashSet<string>? uniqueValues,
Expand Down

0 comments on commit 8aff565

Please sign in to comment.