From 3750ac51619efbbc59bf07d3879758a9c18c4b0e Mon Sep 17 00:00:00 2001 From: Ilona Tomkowicz <32700855+ilonatommy@users.noreply.github.com> Date: Fri, 31 May 2024 12:42:07 +0000 Subject: [PATCH] [browser] `HybridGlobalization` correct `HashCode` ranges of skipped unicodes (#102912) * Add full tests. Behavior is correct. * Feedback. --- .../Globalization/CompareInfo.WebAssembly.cs | 2 +- .../CompareInfo/CompareInfoTests.HashCode.cs | 130 +++++------------- 2 files changed, 38 insertions(+), 94 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs index e8935a6d2439c..8ea2223764d8d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Globalization/CompareInfo.WebAssembly.cs @@ -117,7 +117,7 @@ private unsafe int JsIndexOfCore(ReadOnlySpan source, ReadOnlySpan t } } - // there are chars that are ignored by ICU hashing algorithm but not ignored by invariant hashing + // there are chars that are considered equal by HybridGlobalization but do not have equal hashes when binary hashed // Control: 1105 (out of 1105) // Format: 697 (out of 731) // OtherPunctuation: 6919 (out of 7004) diff --git a/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs b/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs index 9ee808a6282e4..e05bcbf453e91 100644 --- a/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs +++ b/src/libraries/System.Runtime/tests/System.Globalization.Tests/CompareInfo/CompareInfoTests.HashCode.cs @@ -12,108 +12,52 @@ namespace System.Globalization.Tests { public class CompareInfoHashCodeTests : CompareInfoTestsBase { - public static IEnumerable HashCodeLocalized_TestData() - { - yield return new object[] { s_invariantCompare, "foo", "Foo", CompareOptions.IgnoreCase }; - yield return new object[] { s_invariantCompare, "igloo", "\u0130GLOO", CompareOptions.IgnoreCase }; - yield return new object[] { s_invariantCompare, "igloo", "\u0130GLOO", CompareOptions.None }; - yield return new object[] { s_invariantCompare, "igloo", "IGLOO", CompareOptions.IgnoreCase }; - yield return new object[] { new CultureInfo("pl-PL").CompareInfo, "igloo", "\u0130GLOO", CompareOptions.IgnoreCase }; - yield return new object[] { new CultureInfo("pl-PL").CompareInfo, "igloo", "IGLOO", CompareOptions.IgnoreCase }; - yield return new object[] { new CultureInfo("tr-TR").CompareInfo, "igloo", "\u0130GLOO", CompareOptions.IgnoreCase }; - yield return new object[] { new CultureInfo("tr-TR").CompareInfo, "igloo", "IGLOO", CompareOptions.IgnoreCase }; - - if (!PlatformDetection.IsHybridGlobalizationOnBrowser) - { - if (PlatformDetection.IsNotHybridGlobalizationOnApplePlatform) - yield return new object[] { new CultureInfo("en-GB").CompareInfo, "100", "100!", CompareOptions.IgnoreSymbols }; // HG: equal: True, hashCodesEqual: False - yield return new object[] { new CultureInfo("ja-JP").CompareInfo, "\u30A2", "\u3042", CompareOptions.IgnoreKanaType }; // HG: equal: True, hashCodesEqual: False - yield return new object[] { new CultureInfo("en-GB").CompareInfo, "caf\u00E9", "cafe\u0301", CompareOptions.IgnoreNonSpace | CompareOptions.IgnoreKanaType }; // HG: equal: True, hashCodesEqual: False - } - } - [Theory] - [MemberData(nameof(HashCodeLocalized_TestData))] - public void HashCodeLocalized(CompareInfo cmpInfo, string str1, string str2, CompareOptions options) + [OuterLoop] + [ConditionalFact(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] + [ActiveIssue("https://github.com/dotnet/runtime/issues/95338", typeof(PlatformDetection), nameof(PlatformDetection.IsHybridGlobalizationOnApplePlatform))] + public void CheckHashingInLineWithEqual() { - bool areEqual = cmpInfo.Compare(str1, str2, options) == 0; - var hashCode1 = cmpInfo.GetHashCode(str1, options); - var hashCode2 = cmpInfo.GetHashCode(str2, options); - bool areHashCodesEqual = hashCode1 == hashCode2; - - if (areEqual) - { - Assert.True(areHashCodesEqual); - } - else + int additionalCollisions = 0; + CultureInfo[] cultures = CultureInfo.GetCultures(CultureTypes.AllCultures); + foreach (CultureInfo culture in cultures) { - Assert.False(areHashCodesEqual); - } - - // implication of the above behavior: - StringComparer stringComparer = new CustomComparer(cmpInfo, options); - TryAddToCustomDictionary(stringComparer, str1, str2, areHashCodesEqual); - } + // Japanese does not have "None" compare option, it always ignores Kana + // HashCode is not available for options different than None or IgnoreCase + if (culture.Name.Split('-')[0] == "ja") + continue; - private void TryAddToCustomDictionary(StringComparer comparer, string str1, string str2, bool shouldFail) - { - Dictionary customDictionary = new Dictionary(comparer); - customDictionary.Add(str1, 0); - try - { - customDictionary.Add(str2, 1); - Assert.False(shouldFail); - } - catch (ArgumentException ex) - { - Assert.True(shouldFail); - Assert.Contains("An item with the same key has already been added.", ex.Message); + for (int i = 0; i <= 65535; i++) + CheckChar(i, culture); } - catch (Exception ex) + + void CheckChar(int charCode, CultureInfo culture) { - Assert.Fail($"Unexpected exception thrown: {ex}"); + var cmpInfo = culture.CompareInfo; + char character = (char)charCode; + string str1 = $"a{character}b"; + string str2 = "ab"; + CompareOptions options = CompareOptions.None; + var hashCode1 = cmpInfo.GetHashCode(str1, options); + var hashCode2 = cmpInfo.GetHashCode(str2, options); + bool areHashCodesEqual = hashCode1 == hashCode2; + StringComparer stringComparer = new CustomComparer(cmpInfo, options); + // if equal => same, then expect hash => same + if (stringComparer.Compare(str1, str2) == 0) + { + Assert.True(areHashCodesEqual, $"Expected equal hashes for equal strings. The check failed for culture {culture.Name}, character: {character}, code: {charCode}."); + } + // if equal => diff, then expect hash => diff + else + { + if (areHashCodesEqual) + { + additionalCollisions++; // this should be smallest possible, 11541466 + } + } } } - public static IEnumerable CheckHashingOfSkippedChars_TestData() - { - // one char from each ignored category that is skipped on ICU - yield return new object[] { '\u0008', s_invariantCompare }; // Control: BACKSPACE - yield return new object[] { '\u200B', s_invariantCompare }; // Format: ZERO WIDTH SPACE - yield return new object[] { '\u180A', s_invariantCompare }; // OtherPunctuation: MONGOLIAN NIRUGU - yield return new object[] { '\uFE73', s_invariantCompare }; // OtherLetter: THAI CHARACTER PAIYANNOI - yield return new object[] { '\u0F3E', s_invariantCompare }; // SpacingCombiningMark: "TIBETAN MARK GTER YIG MGO UM RTAGS GNYIS - yield return new object[] { '\u0640', s_invariantCompare }; // ModifierLetter: ARABIC TATWEEL - yield return new object[] { '\u0488', s_invariantCompare }; // EnclosingMark: COMBINING CYRILLIC HUNDRED THOUSANDS SIGN - yield return new object[] { '\u034F', s_invariantCompare }; // NonSpacingMark: DIAERESIS - CompareInfo thaiCmpInfo = new CultureInfo("th-TH").CompareInfo; - yield return new object[] { '\u0020', thaiCmpInfo }; // SpaceSeparator: SPACE - yield return new object[] { '\u0028', thaiCmpInfo }; // OpenPunctuation: LEFT PARENTHESIS - yield return new object[] { '\u007D', thaiCmpInfo }; // ClosePunctuation: RIGHT PARENTHESIS - yield return new object[] { '\u2013', thaiCmpInfo }; // DashPunctuation: EN DASH - yield return new object[] { '\u005F', thaiCmpInfo }; // ConnectorPunctuation: LOW LINE - yield return new object[] { '\u2018', thaiCmpInfo }; // InitialQuotePunctuation: LEFT SINGLE QUOTATION MARK - yield return new object[] { '\u2019', thaiCmpInfo }; // FinalQuotePunctuation: RIGHT SINGLE QUOTATION MARK - yield return new object[] { '\u2028', thaiCmpInfo }; // LineSeparator: LINE SEPARATOR - yield return new object[] { '\u2029', thaiCmpInfo }; // ParagraphSeparator: PARAGRAPH SEPARATOR - } - - [ConditionalTheory(typeof(PlatformDetection), nameof(PlatformDetection.IsIcuGlobalization))] - [MemberData(nameof(CheckHashingOfSkippedChars_TestData))] - [ActiveIssue("https://github.com/dotnet/runtime/issues/95338", typeof(PlatformDetection), nameof(PlatformDetection.IsHybridGlobalizationOnApplePlatform))] - public void CheckHashingOfSkippedChars(char character, CompareInfo cmpInfo) - { - string str1 = $"a{character}b"; - string str2 = "ab"; - CompareOptions options = CompareOptions.None; - var hashCode1 = cmpInfo.GetHashCode(str1, options); - var hashCode2 = cmpInfo.GetHashCode(str2, options); - bool areHashCodesEqual = hashCode1 == hashCode2; - Assert.True(areHashCodesEqual); - StringComparer stringComparer = new CustomComparer(cmpInfo, options); - TryAddToCustomDictionary(stringComparer, str1, str2, areHashCodesEqual); - } - public static IEnumerable GetHashCodeTestData => new[] { new object[] { "abc", CompareOptions.OrdinalIgnoreCase, "ABC", CompareOptions.OrdinalIgnoreCase, true },