From e17849c0b83402e2b5361f9468e83ff8c58ad479 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Sun, 22 Nov 2020 12:40:28 -0800 Subject: [PATCH 1/6] Fix Full Width Chars Casing --- .../pal_collation.c | 210 ++++++++++-------- .../CompareInfo/CompareInfoTests.Compare.cs | 26 +++ 2 files changed, 149 insertions(+), 87 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index a9eb1d57d74c7..a76d7919744cb 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -59,8 +59,6 @@ struct SortHandle SearchIteratorNode searchIteratorList[CompareOptionsMask + 1]; }; -typedef struct { UChar* items; size_t size; } UCharList; - // Hiragana character range static const UChar hiraganaStart = 0x3041; static const UChar hiraganaEnd = 0x309e; @@ -142,99 +140,91 @@ static int IsHalfFullHigherSymbol(UChar character) } /* -Gets a string of custom collation rules, if necessary. +Fill custom collation rules for ignoreKana cases. Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly. */ -static UCharList* GetCustomRules(int32_t options, UColAttributeValue strength, int isIgnoreSymbols) +static void FillIgnoreKanaRules(UChar* completeRules, int32_t* fillIndex, int32_t completeRulesLength, int32_t isIgnoreKanaType) { - int isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType; - int isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth; - - // kana differs at the tertiary level - int needsIgnoreKanaTypeCustomRule = isIgnoreKanaType && strength >= UCOL_TERTIARY; - int needsNotIgnoreKanaTypeCustomRule = !isIgnoreKanaType && strength < UCOL_TERTIARY; - - // character width differs at the tertiary level - int needsIgnoreWidthCustomRule = isIgnoreWidth && strength >= UCOL_TERTIARY; - int needsNotIgnoreWidthCustomRule = !isIgnoreWidth && strength < UCOL_TERTIARY; + UChar compareChar = isIgnoreKanaType ? '=' : '<'; - if (!(needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule || needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule)) - return NULL; - - UCharList* customRules = (UCharList*)malloc(sizeof(UCharList)); - if (customRules == NULL) + for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) { - return NULL; + // Hiragana is the range 3041 to 3096 & 309D & 309E + if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana + { + assert((*fillIndex) + 4 <= completeRulesLength); + completeRules[*fillIndex] = '&'; + completeRules[(*fillIndex) + 1] = hiraganaChar; + completeRules[(*fillIndex) + 2] = compareChar; + completeRules[(*fillIndex) + 3] = hiraganaChar + hiraganaToKatakanaOffset; + (*fillIndex) += 4; + } } +} - // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long - // and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long. - int capacity = - ((needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) + - ((needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) ? 5 * g_HalfFullCharsLength : 0); +/* +Fill custom collation rules for ignoreWidth cases. - UChar* items; - customRules->items = items = (UChar*)malloc((size_t)capacity * sizeof(UChar)); - if (customRules->items == NULL) - { - free(customRules); - return NULL; - } +Since the CompareOptions flags don't map 1:1 with ICU default functionality, we need to fall back to using +custom rules in order to support IgnoreKanaType and IgnoreWidth CompareOptions correctly. +*/ +static void FillIgnoreWidthRules(UChar* completeRules, int32_t* fillIndex, int32_t completeRulesLength, int32_t isIgnoreWidth, int32_t isIgnoreCase, int32_t isIgnoreSymbols) +{ + UChar compareChar = isIgnoreWidth ? '=' : '<'; - if (needsIgnoreKanaTypeCustomRule || needsNotIgnoreKanaTypeCustomRule) - { - UChar compareChar = needsIgnoreKanaTypeCustomRule ? '=' : '<'; + UChar lowerChar; + UChar higherChar; + int needsEscape; - for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) + for (int i = 0; i < g_HalfFullCharsLength; i++) + { + lowerChar = g_HalfFullLowerChars[i]; + higherChar = g_HalfFullHigherChars[i]; + // the lower chars need to be checked for escaping since they contain ASCII punctuation + needsEscape = NeedsEscape(lowerChar); + + // when isIgnoreSymbols is true and we are not ignoring width, check to see if + // this character is a symbol, and if so skip it + if (!(isIgnoreSymbols && (!isIgnoreWidth) && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) { - // Hiragana is the range 3041 to 3096 & 309D & 309E - if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana + assert((*fillIndex) + 4 <= completeRulesLength); + completeRules[*fillIndex] = '&'; + (*fillIndex)++; + + if (needsEscape) { - assert(items - customRules->items <= capacity - 4); - *(items++) = '&'; - *(items++) = hiraganaChar; - *(items++) = compareChar; - *(items++) = hiraganaChar + hiraganaToKatakanaOffset; + completeRules[*fillIndex] = '\\'; + (*fillIndex)++; } + + completeRules[*fillIndex] = lowerChar; + completeRules[(*fillIndex) + 1] = compareChar; + completeRules[(*fillIndex) + 2] = higherChar; + (*fillIndex) += 3; } } - if (needsIgnoreWidthCustomRule || needsNotIgnoreWidthCustomRule) + // When we have isIgnoreWidth is false, we sort the normal width latin alphabet charcatres before the full width latin alphabet charcatres + // e.g. `a` < `a` (\uFF41) + // This break the casing of the full width latin alphabet charcatres. + // e.g. `a` (\uFF41) == `A` (\uFF21). + // we are fixing back this case mapping here. + if (isIgnoreCase && (!isIgnoreWidth)) { - UChar compareChar = needsIgnoreWidthCustomRule ? '=' : '<'; + assert((*fillIndex) + (FullWidthAlphabetRangeLength * 4) <= completeRulesLength); + const int UpperCaseToLowerCaseOffset = 0xFF41 - 0xFF21; - UChar lowerChar; - UChar higherChar; - int needsEscape; - for (int i = 0; i < g_HalfFullCharsLength; i++) + for (UChar ch = 0xFF21; ch <= 0xFF3A; ch++) { - lowerChar = g_HalfFullLowerChars[i]; - higherChar = g_HalfFullHigherChars[i]; - // the lower chars need to be checked for escaping since they contain ASCII punctuation - needsEscape = NeedsEscape(lowerChar); - - // when isIgnoreSymbols is true and we are not ignoring width, check to see if - // this character is a symbol, and if so skip it - if (!(isIgnoreSymbols && needsNotIgnoreWidthCustomRule && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) - { - assert(items - customRules->items <= capacity - 5); - *(items++) = '&'; - if (needsEscape) - { - *(items++) = '\\'; - } - *(items++) = lowerChar; - *(items++) = compareChar; - *(items++) = higherChar; - } + completeRules[*fillIndex] = '&'; + completeRules[(*fillIndex) + 1] = ch + UpperCaseToLowerCaseOffset; + completeRules[(*fillIndex) + 2] = '='; + completeRules[(*fillIndex) + 3] = ch; + (*fillIndex) += 4; } } - - customRules->size = (size_t)(items - customRules->items); - - return customRules; } /* @@ -247,9 +237,11 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o { UColAttributeValue strength = ucol_getStrength(pCollator); - int isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase; - int isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace; - int isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols; + int32_t isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase; + int32_t isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace; + int32_t isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols; + int32_t isIgnoreKanaType = (options & CompareOptionsIgnoreKanaType) == CompareOptionsIgnoreKanaType; + int32_t isIgnoreWidth = (options & CompareOptionsIgnoreWidth) == CompareOptionsIgnoreWidth; if (isIgnoreCase) { @@ -262,34 +254,78 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o } UCollator* pClonedCollator; - UCharList* customRules = GetCustomRules(options, strength, isIgnoreSymbols); - if (customRules == NULL || customRules->size == 0) + + // IgnoreWidth - it would be easy to IgnoreWidth by just setting Strength <= Secondary. + // For any strength under that, the width of the characters will be ignored. + // For strength above that, the width of the characters will be used in differentiation. + // a. However, this doesn’t play nice with IgnoreCase, since these Strength levels are overloaded. + // b. So the plan to support IgnoreWidth is to use customized rules. + // i. Since the character width is differentiated at “Tertiary” strength, we only need to use custom rules in specific cases. + // ii. If (IgnoreWidth == true && Strength > “Secondary”) + // 1. Build up a custom rule set for each half-width character and say that it is equal to the corresponding full-width character. + // a. ex: “0x30F2 = 0xFF66 & 0x30F3 = 0xFF9D & …” + // iii. If (IgnoreWidth == false && Strength <= “Secondary”) + // 1. Build up a custom rule set saying that the half-width and full-width characters have a primary level difference (which will cause it always to be unequal) + // a. Ex. “0x30F2 < 0xFF66 & 0x30F3 < 0xFF9D & …” + // IgnoreKanaType – this works the same way as IgnoreWidth, only change “Secondary” to “Tertiary” and use the set of Hiragana and Katakana characters instead of half-width vs full-width characters to build the rules. + int32_t applyIgnoreKanaTypeCustomRule = isIgnoreKanaType ^ (strength < UCOL_TERTIARY); // kana differs at the tertiary level + int32_t applyIgnoreWidthTypeCustomRule = isIgnoreWidth ^ (strength < UCOL_TERTIARY); // character width differs at the tertiary level + + int32_t customRuleLength = 0; + if (applyIgnoreKanaTypeCustomRule || applyIgnoreWidthTypeCustomRule) + { + // Length of the fullwidth charcatres from 'A' to 'Z' + // We'll use it to map the casing of the full width 'A' to 'Z' characters + const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1; + + // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long + // and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long. + customRuleLength = (applyIgnoreKanaTypeCustomRule ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) + + (applyIgnoreWidthTypeCustomRule ? ((5 * g_HalfFullCharsLength) + (isIgnoreCase ? 4 * FullWidthAlphabetRangeLength : 0)) : 0) + + 1; // Adding extra terminator rule at the end to force ICU apply last actual entered rule, otherwise last actual rule get ignored. + } + + if (customRuleLength == 0) { pClonedCollator = ucol_safeClone(pCollator, NULL, NULL, pErr); } else { - int32_t customRuleLength = (int32_t)customRules->size; - - int32_t localeRulesLength; - const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength); - int32_t completeRulesLength = localeRulesLength + customRuleLength + 1; + int32_t rulesLength; + const UChar* localeRules = ucol_getRules(pCollator, &rulesLength); + int32_t completeRulesLength = rulesLength + customRuleLength + 1; UChar* completeRules = (UChar*)calloc((size_t)completeRulesLength, sizeof(UChar)); - for (int i = 0; i < localeRulesLength; i++) + for (int i = 0; i < rulesLength; i++) { completeRules[i] = localeRules[i]; } - for (int i = 0; i < customRuleLength; i++) + + if (applyIgnoreKanaTypeCustomRule) { - completeRules[localeRulesLength + i] = customRules->items[i]; + FillIgnoreKanaRules(completeRules, &rulesLength, completeRulesLength, isIgnoreKanaType); } - pClonedCollator = ucol_openRules(completeRules, completeRulesLength, UCOL_DEFAULT, strength, NULL, pErr); + assert(rulesLength <= completeRulesLength); + + if (applyIgnoreWidthTypeCustomRule) + { + FillIgnoreWidthRules(completeRules, &rulesLength, completeRulesLength, isIgnoreWidth, isIgnoreCase, isIgnoreSymbols); + } + + assert(rulesLength + 4 <= completeRulesLength); + + // Adding extra terminator rule at the end to force ICU apply last actual entered rule, otherwise last actual rule get ignored. + completeRules[rulesLength] = '&'; + completeRules[rulesLength + 1] = 'a'; + completeRules[rulesLength + 2] = '='; + completeRules[rulesLength + 3] = 'a'; + rulesLength += 4; + + pClonedCollator = ucol_openRules(completeRules, rulesLength, UCOL_DEFAULT, strength, NULL, pErr); free(completeRules); } - free(customRules); if (isIgnoreSymbols) { diff --git a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs index 57704146c8b68..98890c0c4fdbf 100644 --- a/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs +++ b/src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs @@ -455,5 +455,31 @@ public void Compare_Invalid() AssertExtensions.Throws("string2", () => s_invariantCompare.Compare("Test", 0, 2, "Test", 2, 3)); AssertExtensions.Throws("string2", () => s_invariantCompare.Compare("Test", 0, 2, "Test", 2, 3, CompareOptions.None)); } + + [Fact] + public void TestIgnoreKanaAndWidthCases() + { + for (char c = '\uFF41'; c <= '\uFF5A'; c++) // Full width 'a' to `z` + { + Assert.False(string.Equals(new string(c, 1), new string((char) (c - 0x20), 1), StringComparison.InvariantCulture), $"Expected '{(int)c:x4}' != '{c - 0x20:x4}'"); + Assert.True(string.Equals(new string(c, 1), new string((char) (c - 0x20), 1), StringComparison.InvariantCultureIgnoreCase), $"Expected '{(int)c:x4}' == '{c - 0x20:x4}'"); + } + + // Edge case of the Ignore Width. + Assert.False(string.Compare("\u3162\u3163", "\uFFDB\uFFDC", CultureInfo.InvariantCulture, CompareOptions.None) == 0, $"Expect '0x3162 0x3163' != '0xFFDB 0xFFDC'"); + Assert.True(string.Compare("\u3162\u3163", "\uFFDB\uFFDC", CultureInfo.InvariantCulture, CompareOptions.IgnoreWidth) == 0, "Expect '0x3162 0x3163' == '0xFFDB 0xFFDC'"); + + const char hiraganaStart = '\u3041'; + const char hiraganaEnd = '\u3096'; + const int hiraganaToKatakanaOffset = 0x30a1 - 0x3041; + + for (Char hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) + { + Assert.False(string.Compare(new string(hiraganaChar, 1), new string((char)(hiraganaChar + hiraganaToKatakanaOffset), 1), CultureInfo.InvariantCulture, CompareOptions.IgnoreCase) == 0, + $"Expect '{(int)hiraganaChar:x4}' != {(int)hiraganaChar + hiraganaToKatakanaOffset:x4} with CompareOptions.IgnoreCase"); + Assert.True(string.Compare(new string(hiraganaChar, 1), new string((char)(hiraganaChar + hiraganaToKatakanaOffset), 1), CultureInfo.InvariantCulture, CompareOptions.IgnoreKanaType) == 0, + $"Expect '{(int)hiraganaChar:x4}' == {(int)hiraganaChar + hiraganaToKatakanaOffset:x4} with CompareOptions.IgnoreKanaType"); + } + } } } From 2cf96a10b877e80cbd3e7485e47ff44328e9904f Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Sun, 22 Nov 2020 12:59:46 -0800 Subject: [PATCH 2/6] Fix constant definition --- .../Unix/System.Globalization.Native/pal_collation.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index a76d7919744cb..38cda23e005fa 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -63,6 +63,9 @@ struct SortHandle static const UChar hiraganaStart = 0x3041; static const UChar hiraganaEnd = 0x309e; static const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041; +// Length of the fullwidth charcatres from 'A' to 'Z' +// We'll use it to map the casing of the full width 'A' to 'Z' characters +const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1; // Mapping between half- and fullwidth characters. // LowerChars are the characters that should sort lower than HigherChars @@ -274,10 +277,6 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o int32_t customRuleLength = 0; if (applyIgnoreKanaTypeCustomRule || applyIgnoreWidthTypeCustomRule) { - // Length of the fullwidth charcatres from 'A' to 'Z' - // We'll use it to map the casing of the full width 'A' to 'Z' characters - const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1; - // If we need to create customRules, the KanaType custom rule will be 88 kana characters * 4 = 352 chars long // and the Width custom rule will be at most 212 halfwidth characters * 5 = 1060 chars long. customRuleLength = (applyIgnoreKanaTypeCustomRule ? 4 * (hiraganaEnd - hiraganaStart + 1) : 0) + From 2cc2371e09a7abadc03d44d78668819caccd7de8 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Sun, 22 Nov 2020 13:42:19 -0800 Subject: [PATCH 3/6] define the const as a static --- .../Native/Unix/System.Globalization.Native/pal_collation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index 38cda23e005fa..c7f098ee821e6 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -65,7 +65,7 @@ static const UChar hiraganaEnd = 0x309e; static const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041; // Length of the fullwidth charcatres from 'A' to 'Z' // We'll use it to map the casing of the full width 'A' to 'Z' characters -const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1; +static const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1; // Mapping between half- and fullwidth characters. // LowerChars are the characters that should sort lower than HigherChars From 5df7edd2f73c01fbd6958d64494e3ab6f8f2d570 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Sun, 22 Nov 2020 14:17:46 -0800 Subject: [PATCH 4/6] Fix unused method parameter --- .../Unix/System.Globalization.Native/pal_collation.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index c7f098ee821e6..d223c6138effa 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -158,6 +158,10 @@ static void FillIgnoreKanaRules(UChar* completeRules, int32_t* fillIndex, int32_ if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana { assert((*fillIndex) + 4 <= completeRulesLength); + if ((*fillIndex) + 4 > completeRulesLength) + { + return; + } completeRules[*fillIndex] = '&'; completeRules[(*fillIndex) + 1] = hiraganaChar; completeRules[(*fillIndex) + 2] = compareChar; @@ -192,7 +196,11 @@ static void FillIgnoreWidthRules(UChar* completeRules, int32_t* fillIndex, int32 // this character is a symbol, and if so skip it if (!(isIgnoreSymbols && (!isIgnoreWidth) && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) { - assert((*fillIndex) + 4 <= completeRulesLength); + assert((*fillIndex) + 5 <= completeRulesLength); + if ((*fillIndex) + 5 > completeRulesLength) + { + return; + } completeRules[*fillIndex] = '&'; (*fillIndex)++; From 4cb1845544ffa25d13ad778e16c6abee4e42a46b Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 23 Nov 2020 11:20:33 -0800 Subject: [PATCH 5/6] Address the feedback and Move bounds checks outside the loops --- .../pal_collation.c | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index d223c6138effa..67ee3403c5516 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -63,7 +63,7 @@ struct SortHandle static const UChar hiraganaStart = 0x3041; static const UChar hiraganaEnd = 0x309e; static const UChar hiraganaToKatakanaOffset = 0x30a1 - 0x3041; -// Length of the fullwidth charcatres from 'A' to 'Z' +// Length of the fullwidth characters from 'A' to 'Z' // We'll use it to map the casing of the full width 'A' to 'Z' characters static const int32_t FullWidthAlphabetRangeLength = 0xFF3A - 0xFF21 + 1; @@ -152,16 +152,17 @@ static void FillIgnoreKanaRules(UChar* completeRules, int32_t* fillIndex, int32_ { UChar compareChar = isIgnoreKanaType ? '=' : '<'; + assert((*fillIndex) + (4 * (hiraganaEnd - hiraganaStart + 1)) <= completeRulesLength); + if ((*fillIndex) + (4 * (hiraganaEnd - hiraganaStart + 1)) > completeRulesLength) // check the allocated the size + { + return; + } + for (UChar hiraganaChar = hiraganaStart; hiraganaChar <= hiraganaEnd; hiraganaChar++) { // Hiragana is the range 3041 to 3096 & 309D & 309E if (hiraganaChar <= 0x3096 || hiraganaChar >= 0x309D) // characters between 3096 and 309D are not mapped to katakana { - assert((*fillIndex) + 4 <= completeRulesLength); - if ((*fillIndex) + 4 > completeRulesLength) - { - return; - } completeRules[*fillIndex] = '&'; completeRules[(*fillIndex) + 1] = hiraganaChar; completeRules[(*fillIndex) + 2] = compareChar; @@ -185,6 +186,12 @@ static void FillIgnoreWidthRules(UChar* completeRules, int32_t* fillIndex, int32 UChar higherChar; int needsEscape; + assert((*fillIndex) + (5 * g_HalfFullCharsLength) <= completeRulesLength); + if ((*fillIndex) + (5 * g_HalfFullCharsLength) > completeRulesLength) + { + return; + } + for (int i = 0; i < g_HalfFullCharsLength; i++) { lowerChar = g_HalfFullLowerChars[i]; @@ -196,11 +203,6 @@ static void FillIgnoreWidthRules(UChar* completeRules, int32_t* fillIndex, int32 // this character is a symbol, and if so skip it if (!(isIgnoreSymbols && (!isIgnoreWidth) && (needsEscape || IsHalfFullHigherSymbol(higherChar)))) { - assert((*fillIndex) + 5 <= completeRulesLength); - if ((*fillIndex) + 5 > completeRulesLength) - { - return; - } completeRules[*fillIndex] = '&'; (*fillIndex)++; @@ -217,9 +219,9 @@ static void FillIgnoreWidthRules(UChar* completeRules, int32_t* fillIndex, int32 } } - // When we have isIgnoreWidth is false, we sort the normal width latin alphabet charcatres before the full width latin alphabet charcatres + // When we have isIgnoreWidth is false, we sort the normal width latin alphabet characters before the full width latin alphabet characters // e.g. `a` < `a` (\uFF41) - // This break the casing of the full width latin alphabet charcatres. + // This break the casing of the full width latin alphabet characters. // e.g. `a` (\uFF41) == `A` (\uFF21). // we are fixing back this case mapping here. if (isIgnoreCase && (!isIgnoreWidth)) From 85f28d0a2cb0b96de1fd1d1ca4951897090411a8 Mon Sep 17 00:00:00 2001 From: Tarek Mahmoud Sayed Date: Mon, 23 Nov 2020 14:12:32 -0800 Subject: [PATCH 6/6] Fix the comments according to the feedback --- .../Native/Unix/System.Globalization.Native/pal_collation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c index 67ee3403c5516..99edfd96bcf1f 100644 --- a/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c +++ b/src/libraries/Native/Unix/System.Globalization.Native/pal_collation.c @@ -280,7 +280,7 @@ static UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t o // iii. If (IgnoreWidth == false && Strength <= “Secondary”) // 1. Build up a custom rule set saying that the half-width and full-width characters have a primary level difference (which will cause it always to be unequal) // a. Ex. “0x30F2 < 0xFF66 & 0x30F3 < 0xFF9D & …” - // IgnoreKanaType – this works the same way as IgnoreWidth, only change “Secondary” to “Tertiary” and use the set of Hiragana and Katakana characters instead of half-width vs full-width characters to build the rules. + // IgnoreKanaType – this works the same way as IgnoreWidth, it uses the set of Hiragana and Katakana characters instead of half-width vs full-width characters to build the rules. int32_t applyIgnoreKanaTypeCustomRule = isIgnoreKanaType ^ (strength < UCOL_TERTIARY); // kana differs at the tertiary level int32_t applyIgnoreWidthTypeCustomRule = isIgnoreWidth ^ (strength < UCOL_TERTIARY); // character width differs at the tertiary level