Skip to content

Commit

Permalink
Fix bad case-insensitive ASCII equality check (#45928) (#46057)
Browse files Browse the repository at this point in the history
  • Loading branch information
GrabYourPitchforks authored Jan 14, 2021
1 parent a66f040 commit 93820c8
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 53 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -150,31 +150,46 @@ internal static bool UInt32OrdinalIgnoreCaseAscii(uint valueA, uint valueB)
Debug.Assert(AllCharsInUInt32AreAscii(valueA));
Debug.Assert(AllCharsInUInt32AreAscii(valueB));

// a mask of all bits which are different between A and B
uint differentBits = valueA ^ valueB;

// the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value < 'A'
uint lowerIndicator = valueA + 0x0100_0100u - 0x0041_0041u;

// the 0x80 bit of each word of 'upperIndicator' will be set iff (word | 0x20) has value > 'z'
uint upperIndicator = (valueA | 0x0020_0020u) + 0x0080_0080u - 0x007B_007Bu;

// the 0x80 bit of each word of 'combinedIndicator' will be set iff the word is *not* [A-Za-z]
uint combinedIndicator = lowerIndicator | upperIndicator;

// Shift all the 0x80 bits of 'combinedIndicator' into the 0x20 positions, then set all bits
// aside from 0x20. This creates a mask where all bits are set *except* for the 0x20 bits
// which correspond to alpha chars (either lower or upper). For these alpha chars only, the
// 0x20 bit is allowed to differ between the two input values. Every other char must be an
// exact bitwise match between the two input values. In other words, (valueA & mask) will
// convert valueA to uppercase, so (valueA & mask) == (valueB & mask) answers "is the uppercase
// form of valueA equal to the uppercase form of valueB?" (Technically if valueA has an alpha
// char in the same position as a non-alpha char in valueB, or vice versa, this operation will
// result in nonsense, but it'll still compute as inequal regardless, which is what we want ultimately.)
// The line below is a more efficient way of doing the same check taking advantage of the XOR
// computation we performed at the beginning of the method.

return (((combinedIndicator >> 2) | ~0x0020_0020u) & differentBits) == 0;
// Generate a mask of all bits which are different between A and B. Since [A-Z]
// and [a-z] differ by the 0x20 bit, we'll left-shift this by 2 now so that
// this is moved over to the 0x80 bit, which nicely aligns with the calculation
// we're going to do on the indicator flag later.
//
// n.b. All of the logic below assumes we have at least 2 "known zero" bits leading
// each of the 7-bit ASCII values. This assumption won't hold if this method is
// ever adapted to deal with packed bytes instead of packed chars.

uint differentBits = (valueA ^ valueB) << 2;

// Now, we want to generate a mask where for each word in the input, the mask contains
// 0xFF7F if the word is [A-Za-z], 0xFFFF if the word is not [A-Za-z]. We know each
// input word is ASCII (only low 7 bit set), so we can use a combination of addition
// and logical operators as follows.
//
// original input +05 |A0 +1A
// ====================================================
// 00 .. 3F -> 05 .. 44 -> A5 .. E4 -> BF .. FE
// 40 -> 45 -> E5 -> FF
// ([A-Z]) 41 .. 5A -> 46 .. 5F -> E6 .. FF -> 00 .. 19
// 5B .. 5F -> 60 .. 64 -> E0 .. E4 -> FA .. FE
// 60 -> 65 -> E5 -> FF
// ([a-z]) 61 .. 7A -> 66 .. 7F -> E6 .. FF -> 00 .. 19
// 7B .. 7F -> 80 .. 84 -> A0 .. A4 -> BA .. BE
//
// This combination of operations results in the 0x80 bit of each word being set
// iff the original word value was *not* [A-Za-z].

uint indicator = valueA + 0x0005_0005u;
indicator |= 0x00A0_00A0u;
indicator += 0x001A_001Au;
indicator |= 0xFF7F_FF7Fu; // normalize each word to 0xFF7F or 0xFFFF

// At this point, 'indicator' contains the mask of bits which are *not* allowed to
// differ between the inputs, and 'differentBits' contains the mask of bits which
// actually differ between the inputs. If these masks have any bits in common, then
// the two values are *not* equal under an OrdinalIgnoreCase comparer.

return (differentBits & indicator) == 0;
}

/// <summary>
Expand All @@ -192,26 +207,15 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB)
Debug.Assert(AllCharsInUInt64AreAscii(valueA));
Debug.Assert(AllCharsInUInt64AreAscii(valueB));

// the 0x80 bit of each word of 'lowerIndicator' will be set iff the word has value >= 'A'
ulong lowerIndicator = valueA + 0x0080_0080_0080_0080ul - 0x0041_0041_0041_0041ul;

// the 0x80 bit of each word of 'upperIndicator' will be set iff (word | 0x20) has value <= 'z'
ulong upperIndicator = (valueA | 0x0020_0020_0020_0020ul) + 0x0100_0100_0100_0100ul - 0x007B_007B_007B_007Bul;

// the 0x20 bit of each word of 'combinedIndicator' will be set iff the word is [A-Za-z]
ulong combinedIndicator = (0x0080_0080_0080_0080ul & lowerIndicator & upperIndicator) >> 2;

// Convert both values to lowercase (using the combined indicator from the first value)
// and compare for equality. It's possible that the first value will contain an alpha character
// where the second value doesn't (or vice versa), and applying the combined indicator will
// create nonsensical data, but the comparison would have failed anyway in this case so it's
// a safe operation to perform.
//
// This 64-bit method is similar to the 32-bit method, but it performs the equivalent of convert-to-
// lowercase-then-compare rather than convert-to-uppercase-and-compare. This particular operation
// happens to be faster on x64.
// Duplicate of logic in UInt32OrdinalIgnoreCaseAscii, but using 64-bit consts.
// See comments in that method for more info.

return (valueA | combinedIndicator) == (valueB | combinedIndicator);
ulong differentBits = (valueA ^ valueB) << 2;
ulong indicator = valueA + 0x0005_0005_0005_0005ul;
indicator |= 0x00A0_00A0_00A0_00A0ul;
indicator += 0x001A_001A_001A_001Aul;
indicator |= 0xFF7F_FF7F_FF7F_FF7Ful;
return (differentBits & indicator) == 0;
}
}
}
41 changes: 41 additions & 0 deletions src/libraries/System.Runtime/tests/System/StringTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1775,5 +1775,46 @@ public static void MakeSureNoTrimChecksGoOutOfRange_Memory()
Assert.True(rom.Span.SequenceEqual(rom.TrimEnd().Span));
}
}

[Fact]
public static void EqualityTests_AsciiOptimizations()
{
for (int i = 0; i < 128; i++)
{
for (int j = 0; j < 128; j++)
{
for (int len = 0; len < 8; len++)
{
bool expectedEqualOrdinal = i == j;
bool expectedEqualOrdinalIgnoreCase = (i == j) || ((i | 0x20) >= 'a' && (i | 0x20) <= 'z' && ((i | 0x20) == (j | 0x20)));

// optimization might vary based on string length, so we use 'len' to vary the string length
// in order to hit as many code paths as possible
string prefix = new string('a', len);
string suffix = new string('b', len);
string s1 = prefix + (char)i + suffix;
string s2 = prefix + (char)j + suffix;

bool actualEqualOrdinal = string.Equals(s1, s2, StringComparison.Ordinal);
bool actualEqualOrdinalIgnoreCase = string.Equals(s1, s2, StringComparison.OrdinalIgnoreCase);

int actualCompareToOrdinal = string.Compare(s1, s2, StringComparison.Ordinal);
int actualCompareToOrdinalIgnoreCase = string.Compare(s1, s2, StringComparison.OrdinalIgnoreCase);

try
{
Assert.Equal(expectedEqualOrdinal, actualEqualOrdinal);
Assert.Equal(expectedEqualOrdinal, actualCompareToOrdinal == 0);
Assert.Equal(expectedEqualOrdinalIgnoreCase, actualEqualOrdinalIgnoreCase);
Assert.Equal(expectedEqualOrdinalIgnoreCase, actualCompareToOrdinalIgnoreCase == 0);
}
catch (Exception ex)
{
throw new Exception($"Chars U+{i:X4} ('{(char)i}') and U+{j:X4} ('{(char)j}') did not compare as expected. Iteration: len = {len}.", ex);
}
}
}
}
}
}
}
21 changes: 12 additions & 9 deletions src/mono/mono/mini/interp/interp-intrins.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,24 @@ int
interp_intrins_ordinal_ignore_case_ascii (guint32 valueA, guint32 valueB)
{
// Utf16Utility.UInt32OrdinalIgnoreCaseAscii
guint32 differentBits = valueA ^ valueB;
guint32 lowerIndicator = valueA + 0x01000100 - 0x00410041;
guint32 upperIndicator = (valueA | 0x00200020u) + 0x00800080 - 0x007B007B;
guint32 combinedIndicator = lowerIndicator | upperIndicator;
return (((combinedIndicator >> 2) | ~0x00200020) & differentBits) == 0;
guint32 differentBits = (valueA ^ valueB) << 2;
guint32 indicator = valueA + 0x00050005;
indicator |= 0x00A000A0;
indicator += 0x001A001A;
indicator |= 0xFF7FFF7F;
return (differentBits & indicator) == 0;
}

int
interp_intrins_64ordinal_ignore_case_ascii (guint64 valueA, guint64 valueB)
{
// Utf16Utility.UInt64OrdinalIgnoreCaseAscii
guint64 lowerIndicator = valueA + 0x0080008000800080l - 0x0041004100410041l;
guint64 upperIndicator = (valueA | 0x0020002000200020l) + 0x0100010001000100l - 0x007B007B007B007Bl;
guint64 combinedIndicator = (0x0080008000800080l & lowerIndicator & upperIndicator) >> 2;
return (valueA | combinedIndicator) == (valueB | combinedIndicator);
guint64 differentBits = (valueA ^ valueB) << 2;
guint64 indicator = valueA + 0x0005000500050005l;
indicator |= 0x00A000A000A000A0l;
indicator += 0x001A001A001A001Al;
indicator |= 0xFF7FFF7FFF7FFF7Fl;
return (differentBits & indicator) == 0;
}

static int
Expand Down

0 comments on commit 93820c8

Please sign in to comment.