From fcd7481c9e223515b376735e71d4ae3c443a640b Mon Sep 17 00:00:00 2001 From: Meri Khamoyan Date: Fri, 30 Jun 2023 12:48:14 +0200 Subject: [PATCH] Handle surrogate cases --- .../features/globalization-hybrid-mode.md | 5 - .../System/Globalization/TextInfoTests.cs | 6 +- .../System.Globalization.Native/pal_casing.m | 106 +++++++++++------- 3 files changed, 68 insertions(+), 49 deletions(-) diff --git a/docs/design/features/globalization-hybrid-mode.md b/docs/design/features/globalization-hybrid-mode.md index c60fa14f2459a..7f7b3bba2c03b 100644 --- a/docs/design/features/globalization-hybrid-mode.md +++ b/docs/design/features/globalization-hybrid-mode.md @@ -423,8 +423,3 @@ Below function are used from apple native functions: - [uppercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1413316-uppercasestringwithlocale?language=objc) - [lowercaseStringWithLocale](https://developer.apple.com/documentation/foundation/nsstring/1417298-lowercasestringwithlocale?language=objc) -Behavioural changes compared to ICU - - - Final sigma behavior correction: - - ICU-based case change does not respect final-sigma rule, but hybrid does, so "ΒΌΛΟΣ" -> "βόλος", not "βόλοσ". diff --git a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs index 483eac52713f6..466be98c442d9 100644 --- a/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs +++ b/src/libraries/System.Globalization/tests/System/Globalization/TextInfoTests.cs @@ -274,9 +274,9 @@ public static IEnumerable ToLower_TestData() // we also don't preform. // Greek Capital Letter Sigma (does not case to U+03C2 with "final sigma" rule). yield return new object[] { cultureName, "\u03A3", "\u03C3" }; - if (PlatformDetection.IsHybridGlobalizationOnBrowser || PlatformDetection.IsHybridGlobalizationOnOSX) + if (PlatformDetection.IsHybridGlobalizationOnBrowser) { - // JS and Apple platforms are using "final sigma" rule correctly - it's costly to unify it with ICU's behavior + // JS is using "final sigma" rule correctly - it's costly to unify it with ICU's behavior yield return new object[] { cultureName, "O\u03A3", "o\u03C2" }; } else @@ -405,7 +405,7 @@ public static IEnumerable ToUpper_TestData() // es-zed does not case to SS when uppercased. yield return new object[] { cultureName, "\u00DF", "\u00DF" }; yield return new object[] { cultureName, "stra\u00DFe", "STRA\u00DFE" }; - if (!PlatformDetection.IsNlsGlobalization && !PlatformDetection.IsHybridGlobalizationOnOSX) + if (!PlatformDetection.IsNlsGlobalization) yield return new object[] { cultureName, "st\uD801\uDC37ra\u00DFe", "ST\uD801\uDC0FRA\u00DFE" }; // Ligatures do not expand when cased. diff --git a/src/native/libs/System.Globalization.Native/pal_casing.m b/src/native/libs/System.Globalization.Native/pal_casing.m index 592bc6b2697c1..0a611f511a34f 100644 --- a/src/native/libs/System.Globalization.Native/pal_casing.m +++ b/src/native/libs/System.Globalization.Native/pal_casing.m @@ -9,6 +9,47 @@ #if defined(TARGET_OSX) || defined(TARGET_MACCATALYST) || defined(TARGET_IOS) || defined(TARGET_TVOS) +/** + * Is this code unit a lead surrogate (U+d800..U+dbff)? + * @param c 16-bit code unit + * @return true or false + */ +#define IS_LEAD(c) (((c)&0xfffffc00) == 0xd800) + +/** + * Is this code unit a trail surrogate (U+dc00..U+dfff)? + * @param c 16-bit code unit + * @return true or false + */ +#define IS_TRAIL(c) (((c)&0xfffffc00) == 0xdc00) + +/** + * Get a code point index from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The length can be negative for a NUL-terminated string. + * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case for casing will be read + * the following trail surrogate as well. + * If the offset points to a trail surrogate or + * to a single, unpaired lead surrogate, then for casing will be read that unpaired surrogate. + * + * @param s const uint16_t* string + * @param i output string offset, must be i 1 ? [src characterAtIndex: 0] : [dst characterAtIndex: 0]; + dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index]; Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); - if (isError) - return isError; + index++; } + if (isError) + return isError; } return Success; } @@ -101,33 +134,24 @@ int32_t GlobalizationNative_ChangeCaseNative(const uint16_t* localeName, int32_t */ int32_t GlobalizationNative_ChangeCaseInvariantNative(const uint16_t* lpSrc, int32_t cwSrcLength, uint16_t* lpDst, int32_t cwDstLength, int32_t bToUpper) { - NSString *source = [NSString stringWithCharacters: lpSrc length: cwSrcLength]; - NSString *result = bToUpper ? source.uppercaseString : source.lowercaseString; - int32_t srcIdx = 0, dstIdx = 0, isError = 0; uint16_t dstCodepoint; - if (result.length <= cwDstLength) + while (srcIdx < cwSrcLength) { - while (srcIdx < result.length) + int32_t startIndex = srcIdx; + NEXTOFFSET(lpSrc, srcIdx, cwSrcLength); + int32_t srcLength = srcIdx - startIndex; + NSString *src = [NSString stringWithCharacters: lpSrc + startIndex length: srcLength]; + NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString; + int32_t index = 0; + while (index < srcLength) { - dstCodepoint = [result characterAtIndex:srcIdx++]; - Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); - if (isError) - return isError; - } - } - else - { - while (srcIdx < cwSrcLength) - { - NSString *src = [NSString stringWithCharacters: lpSrc + srcIdx length: 1]; - srcIdx++; - NSString *dst = bToUpper ? src.uppercaseString : src.lowercaseString; - dstCodepoint = dst.length > 1 ? [src characterAtIndex: 0] : [dst characterAtIndex: 0]; + dstCodepoint = dst.length > srcLength ? [src characterAtIndex: index] : [dst characterAtIndex: index]; Append(lpDst, dstIdx, cwDstLength, dstCodepoint, isError); - if (isError) - return isError; + index++; } + if (isError) + return isError; } return Success; }