From 95de02e80deef19607a55b5913c674cc21521132 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 15 Jul 2024 16:33:53 +0800 Subject: [PATCH] [SPARK-48441][SQL] Fix StringTrim behaviour for non-UTF8_BINARY collations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? String trimming in UTF8_LCASE now works at the character level, rather than at the byte level. For example: `ltrim("İ", "i")` now returns `"İ"`, because there exist **no characters** in `"İ"`, starting from the left, such that the lowercased versions of those characters are equal to `"i"`. Note, however, that there is a byte subsequence of `"İ"` such that the lowercased version of that UTF-8 byte sequence is equal to `"i"` (so the new behaviour differs from the old behaviour). Also, trimming for ICU collations works by repeatedly trimming the longest possible substring that matches a character in the trim string, starting from the left side of the input string, until trimming is done. ### Why are the changes needed? Fix functions that give unusable results due to one-to-many case mapping when performing string search under UTF8_LCASE (see example above). ### Does this PR introduce _any_ user-facing change? Yes, the behaviour of `trim*` expressions is changed for collated strings in edge cases with one-to-many case mapping. ### How was this patch tested? New unit tests in `CollationSupportSuite` and new e2e sql tests in `CollationStringExpressionsSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46762 from uros-db/alter-trim. 
Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../util/CollationAwareUTF8String.java | 306 ++++++-- .../sql/catalyst/util/CollationSupport.java | 129 ++-- .../unsafe/types/CollationSupportSuite.java | 663 ++++++++++++++++-- .../expressions/stringExpressions.scala | 16 +- .../sql/CollationStringExpressionsSuite.scala | 45 +- 5 files changed, 922 insertions(+), 237 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index af152c87f88ce..b9868ca665a65 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -33,6 +33,7 @@ import java.text.CharacterIterator; import java.text.StringCharacterIterator; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.Map; @@ -841,117 +842,268 @@ public static UTF8String translate(final UTF8String input, return UTF8String.fromString(sb.toString()); } + /** + * Trims the `srcString` string from both ends of the string using the specified `trimString` + * characters, with respect to the UTF8_LCASE collation. String trimming is performed by + * first trimming the left side of the string, and then trimming the right side of the string. + * The method returns the trimmed string. If the `trimString` is null, the method returns null. + * + * @param srcString the input string to be trimmed from both ends of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for UTF8_LCASE collation) + */ public static UTF8String lowercaseTrim( final UTF8String srcString, final UTF8String trimString) { - // Matching UTF8String behavior for null `trimString`. 
- if (trimString == null) { - return null; - } + return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString), trimString); + } - UTF8String leftTrimmed = lowercaseTrimLeft(srcString, trimString); - return lowercaseTrimRight(leftTrimmed, trimString); + /** + * Trims the `srcString` string from both ends of the string using the specified `trimString` + * characters, with respect to all ICU collations in Spark. String trimming is performed by + * first trimming the left side of the string, and then trimming the right side of the string. + * The method returns the trimmed string. If the `trimString` is null, the method returns null. + * + * @param srcString the input string to be trimmed from both ends of the string + * @param trimString the trim string characters to trim + * @param collationId the collation ID to use for string trimming + * @return the trimmed string (for ICU collations) + */ + public static UTF8String trim( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return trimRight(trimLeft(srcString, trimString, collationId), trimString, collationId); } + /** + * Trims the `srcString` string from the left side using the specified `trimString` characters, + * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method first creates a hash + * set of lowercased code points in `trimString`, and then iterates over the `srcString` from + * the left side, until reaching a character whose lowercased code point is not in the hash set. + * Finally, the method returns the substring from that position to the end of `srcString`. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. 
+ * + * @param srcString the input string to be trimmed from the left end of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for UTF8_LCASE collation) + */ public static UTF8String lowercaseTrimLeft( final UTF8String srcString, final UTF8String trimString) { - // Matching UTF8String behavior for null `trimString`. + // Matching the default UTF8String behavior for null `trimString`. if (trimString == null) { return null; } - // The searching byte position in the srcString. - int searchIdx = 0; - // The byte position of a first non-matching character in the srcString. - int trimByteIdx = 0; - // Number of bytes in srcString. - int numBytes = srcString.numBytes(); - // Convert trimString to lowercase, so it can be searched properly. - UTF8String lowercaseTrimString = trimString.toLowerCase(); - - while (searchIdx < numBytes) { - UTF8String searchChar = srcString.copyUTF8String( - searchIdx, - searchIdx + UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1); - int searchCharBytes = searchChar.numBytes(); - - // Try to find the matching for the searchChar in the trimString. - if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { - trimByteIdx += searchCharBytes; - searchIdx += searchCharBytes; - } else { - // No matching, exit the search. + // Create a hash set of lowercased code points for all characters of `trimString`. + HashSet trimChars = new HashSet<>(); + Iterator trimIter = trimString.codePointIterator(); + while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next())); + + // Iterate over `srcString` from the left to find the first character that is not in the set. + int searchIndex = 0, codePoint; + Iterator srcIter = srcString.codePointIterator(); + while (srcIter.hasNext()) { + codePoint = getLowercaseCodePoint(srcIter.next()); + // Special handling for Turkish dotted uppercase letter I. 
+ if (codePoint == CODE_POINT_LOWERCASE_I && srcIter.hasNext() && + trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) { + int nextCodePoint = getLowercaseCodePoint(srcIter.next()); + if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint)) + || nextCodePoint == CODE_POINT_COMBINING_DOT) { + searchIndex += 2; + } + else { + if (trimChars.contains(codePoint)) ++searchIndex; + break; + } + } else if (trimChars.contains(codePoint)) { + ++searchIndex; + } + else { break; } } - if (searchIdx == 0) { - // Nothing trimmed - return original string (not converted to lowercase). - return srcString; + // Return the substring from that position to the end of the string. + return searchIndex == 0 ? srcString : srcString.substring(searchIndex, srcString.numChars()); + } + + /** + * Trims the `srcString` string from the left side using the specified `trimString` characters, + * with respect to ICU collations. For these collations, the method iterates over `srcString` + * from left to right, and repeatedly skips the longest possible substring that matches any + * character in `trimString`, until reaching a character that is not found in `trimString`. + * Finally, the method returns the substring from that position to the end of `srcString`. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. + * + * @param srcString the input string to be trimmed from the left end of the string + * @param trimString the trim string characters to trim + * @param collationId the collation ID to use for string trimming + * @return the trimmed string (for ICU collations) + */ + public static UTF8String trimLeft( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + // Short-circuit for base cases. + if (trimString == null) return null; + if (srcString.numBytes() == 0) return srcString; + + // Create an array of Strings for all characters of `trimString`. 
+ Map trimChars = new HashMap<>(); + Iterator trimIter = trimString.codePointIterator( + CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID); + while (trimIter.hasNext()) { + int codePoint = trimIter.next(); + trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint)); } - if (trimByteIdx >= numBytes) { - // Everything trimmed. - return UTF8String.EMPTY_UTF8; + + // Iterate over srcString from the left and find the first character that is not in trimChars. + String src = srcString.toValidString(); + CharacterIterator target = new StringCharacterIterator(src); + Collator collator = CollationFactory.fetchCollation(collationId).collator; + int charIndex = 0, longestMatchLen; + while (charIndex < src.length()) { + longestMatchLen = 0; + for (String trim : trimChars.values()) { + StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator); + stringSearch.setIndex(charIndex); + int matchIndex = stringSearch.next(); + if (matchIndex == charIndex) { + int matchLen = stringSearch.getMatchLength(); + if (matchLen > longestMatchLen) { + longestMatchLen = matchLen; + } + } + } + if (longestMatchLen == 0) break; + else charIndex += longestMatchLen; } - return srcString.copyUTF8String(trimByteIdx, numBytes - 1); + + // Return the substring from the calculated position until the end of the string. + return UTF8String.fromString(src.substring(charIndex)); } + /** + * Trims the `srcString` string from the right side using the specified `trimString` characters, + * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method first creates a hash + * set of lowercased code points in `trimString`, and then iterates over the `srcString` from + * the right side, until reaching a character whose lowercased code point is not in the hash set. + * Finally, the method returns the substring from the start of `srcString` until that position. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. 
+ * + * @param srcString the input string to be trimmed from the right end of the string + * @param trimString the trim string characters to trim + * @return the trimmed string (for UTF8_LCASE collation) + */ public static UTF8String lowercaseTrimRight( final UTF8String srcString, final UTF8String trimString) { - // Matching UTF8String behavior for null `trimString`. + // Matching the default UTF8String behavior for null `trimString`. if (trimString == null) { return null; } - // Number of bytes iterated from the srcString. - int byteIdx = 0; - // Number of characters iterated from the srcString. - int numChars = 0; - // Number of bytes in srcString. - int numBytes = srcString.numBytes(); - // Array of character length for the srcString. - int[] stringCharLen = new int[numBytes]; - // Array of the first byte position for each character in the srcString. - int[] stringCharPos = new int[numBytes]; - // Convert trimString to lowercase, so it can be searched properly. - UTF8String lowercaseTrimString = trimString.toLowerCase(); - - // Build the position and length array. - while (byteIdx < numBytes) { - stringCharPos[numChars] = byteIdx; - stringCharLen[numChars] = UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx)); - byteIdx += stringCharLen[numChars]; - numChars++; - } - - // Index trimEnd points to the first no matching byte position from the right side of - // the source string. - int trimByteIdx = numBytes - 1; - - while (numChars > 0) { - UTF8String searchChar = srcString.copyUTF8String( - stringCharPos[numChars - 1], - stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1); - - if(lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { - trimByteIdx -= stringCharLen[numChars - 1]; - numChars--; - } else { + // Create a hash set of lowercased code points for all characters of `trimString`. 
+ HashSet trimChars = new HashSet<>(); + Iterator trimIter = trimString.codePointIterator(); + while (trimIter.hasNext()) trimChars.add(getLowercaseCodePoint(trimIter.next())); + + // Iterate over `srcString` from the right to find the first character that is not in the set. + int searchIndex = srcString.numChars(), codePoint; + Iterator srcIter = srcString.reverseCodePointIterator(); + while (srcIter.hasNext()) { + codePoint = getLowercaseCodePoint(srcIter.next()); + // Special handling for Turkish dotted uppercase letter I. + if (codePoint == CODE_POINT_COMBINING_DOT && srcIter.hasNext() && + trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) { + int nextCodePoint = getLowercaseCodePoint(srcIter.next()); + if ((trimChars.contains(codePoint) && trimChars.contains(nextCodePoint)) + || nextCodePoint == CODE_POINT_LOWERCASE_I) { + searchIndex -= 2; + } + else { + if (trimChars.contains(codePoint)) --searchIndex; + break; + } + } else if (trimChars.contains(codePoint)) { + --searchIndex; + } + else { break; } } - if (trimByteIdx == numBytes - 1) { - // Nothing trimmed. - return srcString; + // Return the substring from the start of the string to the calculated position. + return searchIndex == srcString.numChars() ? srcString : srcString.substring(0, searchIndex); + } + + /** + * Trims the `srcString` string from the right side using the specified `trimString` characters, + * with respect to ICU collations. For these collations, the method iterates over `srcString` + * from right to left, and repeatedly skips the longest possible substring that matches any + * character in `trimString`, until reaching a character that is not found in `trimString`. + * Finally, the method returns the substring from the start of `srcString` until that position. + * If `trimString` is null, null is returned. If `trimString` is empty, `srcString` is returned. 
+ * + * @param srcString the input string to be trimmed from the right end of the string + * @param trimString the trim string characters to trim + * @param collationId the collation ID to use for string trimming + * @return the trimmed string (for ICU collations) + */ + public static UTF8String trimRight( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + // Short-circuit for base cases. + if (trimString == null) return null; + if (srcString.numBytes() == 0) return srcString; + + // Create an array of Strings for all characters of `trimString`. + Map trimChars = new HashMap<>(); + Iterator trimIter = trimString.codePointIterator( + CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID); + while (trimIter.hasNext()) { + int codePoint = trimIter.next(); + trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint)); } - if (trimByteIdx < 0) { - // Everything trimmed. - return UTF8String.EMPTY_UTF8; + + // Iterate over srcString from the left and find the first character that is not in trimChars. + String src = srcString.toValidString(); + CharacterIterator target = new StringCharacterIterator(src); + Collator collator = CollationFactory.fetchCollation(collationId).collator; + int charIndex = src.length(), longestMatchLen; + while (charIndex >= 0) { + longestMatchLen = 0; + for (String trim : trimChars.values()) { + StringSearch stringSearch = new StringSearch(trim, target, (RuleBasedCollator) collator); + // Note: stringSearch.previous() is NOT consistent with stringSearch.next()! + // Example: StringSearch("İ", "i\\u0307İi\\u0307İi\\u0307İ", "UNICODE_CI") + // stringSearch.next() gives: [0, 2, 3, 5, 6, 8]. + // stringSearch.previous() gives: [8, 6, 3, 0]. + // Since 1 character can map to at most 3 characters in Unicode, we can begin the search + // from character position: `charIndex` - 3, and use `next()` to find the longest match. 
+ stringSearch.setIndex(Math.max(charIndex - 3, 0)); + int matchIndex = stringSearch.next(); + int matchLen = stringSearch.getMatchLength(); + while (matchIndex != StringSearch.DONE && matchIndex < charIndex - matchLen) { + matchIndex = stringSearch.next(); + matchLen = stringSearch.getMatchLength(); + } + if (matchIndex == charIndex - matchLen) { + if (matchLen > longestMatchLen) { + longestMatchLen = matchLen; + } + } + } + if (longestMatchLen == 0) break; + else charIndex -= longestMatchLen; } - return srcString.copyUTF8String(0, trimByteIdx); + + // Return the substring from the start of the string until that position. + return UTF8String.fromString(src.substring(0, charIndex)); } // TODO: Add more collation-aware UTF8String operations here. diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index f9ccd22f3f5c6..453423ddbc33d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -531,15 +531,8 @@ public static UTF8String execICU(final UTF8String source, Map di } public static class StringTrim { - public static UTF8String exec( - final UTF8String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality) { - return execBinary(srcString); - } else { - return execLowercase(srcString); - } + public static UTF8String exec(final UTF8String srcString) { + return execBinary(srcString); } public static UTF8String exec( final UTF8String srcString, @@ -548,20 +541,14 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString, trimString); - } else { + } else if 
(collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); + } else { + return execICU(srcString, trimString, collationId); } } - public static String genCode( - final String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.StringTrim.exec"; - if (collation.supportsBinaryEquality) { - return String.format(expr + "Binary(%s)", srcString); - } { - return String.format(expr + "Lowercase(%s)", srcString); - } + public static String genCode(final String srcString) { + return String.format("CollationSupport.StringTrim.execBinary(%s)", srcString); } public static String genCode( final String srcString, @@ -571,8 +558,10 @@ public static String genCode( String expr = "CollationSupport.StringTrim.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s, %s)", srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId); } } public static UTF8String execBinary( @@ -584,27 +573,22 @@ public static UTF8String execBinary( final UTF8String trimString) { return srcString.trim(trimString); } - public static UTF8String execLowercase( - final UTF8String srcString) { - return srcString.trim(); - } public static UTF8String execLowercase( final UTF8String srcString, final UTF8String trimString) { return CollationAwareUTF8String.lowercaseTrim(srcString, trimString); } + public static UTF8String execICU( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return CollationAwareUTF8String.trim(srcString, trimString, collationId); + } } public static class StringTrimLeft { - public static UTF8String exec( - final UTF8String srcString, - final int collationId) { - CollationFactory.Collation collation 
= CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality) { - return execBinary(srcString); - } else { - return execLowercase(srcString); - } + public static UTF8String exec(final UTF8String srcString) { + return execBinary(srcString); } public static UTF8String exec( final UTF8String srcString, @@ -613,21 +597,15 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); - } - } - public static String genCode( - final String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.StringTrimLeft.exec"; - if (collation.supportsBinaryEquality) { - return String.format(expr + "Binary(%s)", srcString); } else { - return String.format(expr + "Lowercase(%s)", srcString); + return execICU(srcString, trimString, collationId); } } + public static String genCode(final String srcString) { + return String.format("CollationSupport.StringTrimLeft.execBinary(%s)", srcString); + } public static String genCode( final String srcString, final String trimString, @@ -636,12 +614,13 @@ public static String genCode( String expr = "CollationSupport.StringTrimLeft.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s, %s)", srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId); } } - public static UTF8String execBinary( - final UTF8String srcString) { + public static UTF8String execBinary(final UTF8String srcString) { return srcString.trimLeft(); } public static UTF8String execBinary( 
@@ -649,27 +628,22 @@ public static UTF8String execBinary( final UTF8String trimString) { return srcString.trimLeft(trimString); } - public static UTF8String execLowercase( - final UTF8String srcString) { - return srcString.trimLeft(); - } public static UTF8String execLowercase( final UTF8String srcString, final UTF8String trimString) { return CollationAwareUTF8String.lowercaseTrimLeft(srcString, trimString); } + public static UTF8String execICU( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return CollationAwareUTF8String.trimLeft(srcString, trimString, collationId); + } } public static class StringTrimRight { - public static UTF8String exec( - final UTF8String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - if (collation.supportsBinaryEquality) { - return execBinary(srcString); - } else { - return execLowercase(srcString); - } + public static UTF8String exec(final UTF8String srcString) { + return execBinary(srcString); } public static UTF8String exec( final UTF8String srcString, @@ -678,21 +652,15 @@ public static UTF8String exec( CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return execLowercase(srcString, trimString); - } - } - public static String genCode( - final String srcString, - final int collationId) { - CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); - String expr = "CollationSupport.StringTrimRight.exec"; - if (collation.supportsBinaryEquality) { - return String.format(expr + "Binary(%s)", srcString); } else { - return String.format(expr + "Lowercase(%s)", srcString); + return execICU(srcString, trimString, collationId); } } + public static String genCode(final String srcString) { + return 
String.format("CollationSupport.StringTrimRight.execBinary(%s)", srcString); + } public static String genCode( final String srcString, final String trimString, @@ -701,12 +669,13 @@ public static String genCode( String expr = "CollationSupport.StringTrimRight.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s, %s)", srcString, trimString); - } else { + } else if (collation.supportsLowercaseEquality) { return String.format(expr + "Lowercase(%s, %s)", srcString, trimString); + } else { + return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString, collationId); } } - public static UTF8String execBinary( - final UTF8String srcString) { + public static UTF8String execBinary(final UTF8String srcString) { return srcString.trimRight(); } public static UTF8String execBinary( @@ -714,15 +683,17 @@ public static UTF8String execBinary( final UTF8String trimString) { return srcString.trimRight(trimString); } - public static UTF8String execLowercase( - final UTF8String srcString) { - return srcString.trimRight(); - } public static UTF8String execLowercase( final UTF8String srcString, final UTF8String trimString) { return CollationAwareUTF8String.lowercaseTrimRight(srcString, trimString); } + public static UTF8String execICU( + final UTF8String srcString, + final UTF8String trimString, + final int collationId) { + return CollationAwareUTF8String.trimRight(srcString, trimString, collationId); + } } // TODO: Add more collation-aware string expressions. 
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index ce0cef3fef307..b082ab21944f7 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -1227,20 +1227,34 @@ private void assertStringTrim( String sourceString, String trimString, String expectedResultString) throws SparkException { + // Prepare the input and expected result. int collationId = CollationFactory.collationNameToId(collation); - String result; + UTF8String src = UTF8String.fromString(sourceString); + UTF8String trim = UTF8String.fromString(trimString); + UTF8String resultTrimLeftRight, resultTrimRightLeft; + String resultTrim; if (trimString == null) { - result = CollationSupport.StringTrim.exec( - UTF8String.fromString(sourceString), collationId).toString(); + // Trim string is ASCII space. + resultTrim = CollationSupport.StringTrim.exec(src).toString(); + UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src); + resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft); + UTF8String trimRight = CollationSupport.StringTrimRight.exec(src); + resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight); } else { - result = CollationSupport.StringTrim.exec( - UTF8String - .fromString(sourceString), UTF8String.fromString(trimString), collationId) - .toString(); + // Trim string is specified. 
+ resultTrim = CollationSupport.StringTrim.exec(src, trim, collationId).toString(); + UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src, trim, collationId); + resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft, trim, collationId); + UTF8String trimRight = CollationSupport.StringTrimRight.exec(src, trim, collationId); + resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight, trim, collationId); } - assertEquals(expectedResultString, result); + // Test that StringTrim result is as expected. + assertEquals(expectedResultString, resultTrim); + // Test that the order of the trims is not important. + assertEquals(resultTrimLeftRight.toString(), resultTrim); + assertEquals(resultTrimRightLeft.toString(), resultTrim); } private void assertStringTrimLeft( @@ -1248,19 +1262,21 @@ private void assertStringTrimLeft( String sourceString, String trimString, String expectedResultString) throws SparkException { + // Prepare the input and expected result. int collationId = CollationFactory.collationNameToId(collation); + UTF8String src = UTF8String.fromString(sourceString); + UTF8String trim = UTF8String.fromString(trimString); String result; if (trimString == null) { - result = CollationSupport.StringTrimLeft.exec( - UTF8String.fromString(sourceString), collationId).toString(); + // Trim string is ASCII space. + result = CollationSupport.StringTrimLeft.exec(src).toString(); } else { - result = CollationSupport.StringTrimLeft.exec( - UTF8String - .fromString(sourceString), UTF8String.fromString(trimString), collationId) - .toString(); + // Trim string is specified. + result = CollationSupport.StringTrimLeft.exec(src, trim, collationId).toString(); } + // Test that StringTrimLeft result is as expected. 
assertEquals(expectedResultString, result); } @@ -1269,116 +1285,645 @@ private void assertStringTrimRight( String sourceString, String trimString, String expectedResultString) throws SparkException { + // Prepare the input and expected result. int collationId = CollationFactory.collationNameToId(collation); + UTF8String src = UTF8String.fromString(sourceString); + UTF8String trim = UTF8String.fromString(trimString); String result; if (trimString == null) { - result = CollationSupport.StringTrimRight.exec( - UTF8String.fromString(sourceString), collationId).toString(); + // Trim string is ASCII space. + result = CollationSupport.StringTrimRight.exec(src).toString(); } else { - result = CollationSupport.StringTrimRight.exec( - UTF8String - .fromString(sourceString), UTF8String.fromString(trimString), collationId) - .toString(); + // Trim string is specified. + result = CollationSupport.StringTrimRight.exec(src, trim, collationId).toString(); } + // Test that StringTrimRight result is as expected. assertEquals(expectedResultString, result); } @Test public void testStringTrim() throws SparkException { + // Basic tests - UTF8_BINARY. 
+ assertStringTrim("UTF8_BINARY", "", "", ""); + assertStringTrim("UTF8_BINARY", "", "xyz", ""); + assertStringTrim("UTF8_BINARY", "asd", "", "asd"); assertStringTrim("UTF8_BINARY", "asd", null, "asd"); assertStringTrim("UTF8_BINARY", " asd ", null, "asd"); assertStringTrim("UTF8_BINARY", " a世a ", null, "a世a"); assertStringTrim("UTF8_BINARY", "asd", "x", "asd"); assertStringTrim("UTF8_BINARY", "xxasdxx", "x", "asd"); assertStringTrim("UTF8_BINARY", "xa世ax", "x", "a世a"); - + assertStringTrimLeft("UTF8_BINARY", "", "", ""); + assertStringTrimLeft("UTF8_BINARY", "", "xyz", ""); + assertStringTrimLeft("UTF8_BINARY", "asd", "", "asd"); assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd"); assertStringTrimLeft("UTF8_BINARY", " asd ", null, "asd "); assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a "); assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd"); assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx"); assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax"); - + assertStringTrimRight("UTF8_BINARY", "", "", ""); + assertStringTrimRight("UTF8_BINARY", "", "xyz", ""); + assertStringTrimRight("UTF8_BINARY", "asd", "", "asd"); assertStringTrimRight("UTF8_BINARY", "asd", null, "asd"); assertStringTrimRight("UTF8_BINARY", " asd ", null, " asd"); assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a"); assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd"); assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a"); - + // Basic tests - UTF8_LCASE. 
+ assertStringTrim("UTF8_LCASE", "", "", ""); + assertStringTrim("UTF8_LCASE", "", "xyz", ""); + assertStringTrim("UTF8_LCASE", "asd", "", "asd"); assertStringTrim("UTF8_LCASE", "asd", null, "asd"); assertStringTrim("UTF8_LCASE", " asd ", null, "asd"); assertStringTrim("UTF8_LCASE", " a世a ", null, "a世a"); assertStringTrim("UTF8_LCASE", "asd", "x", "asd"); assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd"); assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a"); - + assertStringTrimLeft("UTF8_LCASE", "", "", ""); + assertStringTrimLeft("UTF8_LCASE", "", "xyz", ""); + assertStringTrimLeft("UTF8_LCASE", "asd", "", "asd"); assertStringTrimLeft("UTF8_LCASE", "asd", null, "asd"); assertStringTrimLeft("UTF8_LCASE", " asd ", null, "asd "); assertStringTrimLeft("UTF8_LCASE", " a世a ", null, "a世a "); assertStringTrimLeft("UTF8_LCASE", "asd", "x", "asd"); assertStringTrimLeft("UTF8_LCASE", "xxasdxx", "x", "asdxx"); assertStringTrimLeft("UTF8_LCASE", "xa世ax", "x", "a世ax"); - + assertStringTrimRight("UTF8_LCASE", "", "", ""); + assertStringTrimRight("UTF8_LCASE", "", "xyz", ""); + assertStringTrimRight("UTF8_LCASE", "asd", "", "asd"); assertStringTrimRight("UTF8_LCASE", "asd", null, "asd"); assertStringTrimRight("UTF8_LCASE", " asd ", null, " asd"); assertStringTrimRight("UTF8_LCASE", " a世a ", null, " a世a"); assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd"); assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd"); assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a"); - - assertStringTrim("UTF8_LCASE", "asd", null, "asd"); - assertStringTrim("UTF8_LCASE", " asd ", null, "asd"); - assertStringTrim("UTF8_LCASE", " a世a ", null, "a世a"); - assertStringTrim("UTF8_LCASE", "asd", "x", "asd"); - assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd"); - assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a"); - - // Test cases where trimString has more than one character + // Basic tests - UNICODE. 
+ assertStringTrim("UNICODE", "", "", ""); + assertStringTrim("UNICODE", "", "xyz", ""); + assertStringTrim("UNICODE", "asd", "", "asd"); + assertStringTrim("UNICODE", "asd", null, "asd"); + assertStringTrim("UNICODE", " asd ", null, "asd"); + assertStringTrim("UNICODE", " a世a ", null, "a世a"); + assertStringTrim("UNICODE", "asd", "x", "asd"); + assertStringTrim("UNICODE", "xxasdxx", "x", "asd"); + assertStringTrim("UNICODE", "xa世ax", "x", "a世a"); + assertStringTrimLeft("UNICODE", "", "", ""); + assertStringTrimLeft("UNICODE", "", "xyz", ""); + assertStringTrimLeft("UNICODE", "asd", "", "asd"); + assertStringTrimLeft("UNICODE", "asd", null, "asd"); + assertStringTrimLeft("UNICODE", " asd ", null, "asd "); + assertStringTrimLeft("UNICODE", " a世a ", null, "a世a "); + assertStringTrimLeft("UNICODE", "asd", "x", "asd"); + assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx"); + assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax"); + assertStringTrimRight("UNICODE", "", "", ""); + assertStringTrimRight("UNICODE", "", "xyz", ""); + assertStringTrimRight("UNICODE", "asd", "", "asd"); + assertStringTrimRight("UNICODE", "asd", null, "asd"); + assertStringTrimRight("UNICODE", " asd ", null, " asd"); + assertStringTrimRight("UNICODE", " a世a ", null, " a世a"); + assertStringTrimRight("UNICODE", "asd", "x", "asd"); + assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd"); + assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a"); + // Basic tests - UNICODE_CI. 
+ assertStringTrim("UNICODE_CI", "", "", ""); + assertStringTrim("UNICODE_CI", "", "xyz", ""); + assertStringTrim("UNICODE_CI", "asd", "", "asd"); + assertStringTrim("UNICODE_CI", "asd", null, "asd"); + assertStringTrim("UNICODE_CI", " asd ", null, "asd"); + assertStringTrim("UNICODE_CI", " a世a ", null, "a世a"); + assertStringTrim("UNICODE_CI", "asd", "x", "asd"); + assertStringTrim("UNICODE_CI", "xxasdxx", "x", "asd"); + assertStringTrim("UNICODE_CI", "xa世ax", "x", "a世a"); + assertStringTrimLeft("UNICODE_CI", "", "", ""); + assertStringTrimLeft("UNICODE_CI", "", "xyz", ""); + assertStringTrimLeft("UNICODE_CI", "asd", "", "asd"); + assertStringTrimLeft("UNICODE_CI", "asd", null, "asd"); + assertStringTrimLeft("UNICODE_CI", " asd ", null, "asd "); + assertStringTrimLeft("UNICODE_CI", " a世a ", null, "a世a "); + assertStringTrimLeft("UNICODE_CI", "asd", "x", "asd"); + assertStringTrimLeft("UNICODE_CI", "xxasdxx", "x", "asdxx"); + assertStringTrimLeft("UNICODE_CI", "xa世ax", "x", "a世ax"); + assertStringTrimRight("UNICODE_CI", "", "", ""); + assertStringTrimRight("UNICODE_CI", "", "xyz", ""); + assertStringTrimRight("UNICODE_CI", "asd", "", "asd"); + assertStringTrimRight("UNICODE_CI", "asd", null, "asd"); + assertStringTrimRight("UNICODE_CI", " asd ", null, " asd"); + assertStringTrimRight("UNICODE_CI", " a世a ", null, " a世a"); + assertStringTrimRight("UNICODE_CI", "asd", "x", "asd"); + assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd"); + assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a"); + + // Case variation - UTF8_BINARY. 
+ assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX"); + assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa"); assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX"); - - assertStringTrim("UTF8_LCASE", "ddsXXXaa", "asd", "XXX"); - assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "asd", "XXXaa"); - assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "asd", "ddsXXX"); - - // Test cases specific to collation type - // uppercase trim, lowercase src - assertStringTrim("UTF8_BINARY", "asd", "A", "asd"); + // Case variation - UTF8_LCASE. assertStringTrim("UTF8_LCASE", "asd", "A", "sd"); - - // lowercase trim, uppercase src - assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD"); assertStringTrim("UTF8_LCASE", "ASD", "a", "SD"); - - // uppercase and lowercase chars of different byte-length (utf8) + assertStringTrim("UTF8_LCASE", "ddsXXXaa", "ASD", "XXX"); + assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "aSd", "XXXaa"); + assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "AsD", "ddsXXX"); + // Case variation - UNICODE. + assertStringTrim("UNICODE", "asd", "A", "asd"); + assertStringTrim("UNICODE", "ASD", "a", "ASD"); + assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX"); + assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa"); + assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX"); + // Case variation - UNICODE_CI. + assertStringTrim("UNICODE_CI", "asd", "A", "sd"); + assertStringTrim("UNICODE_CI", "ASD", "a", "SD"); + assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX"); + assertStringTrimLeft("UNICODE_CI", "ddsXXXaa", "aSd", "XXXaa"); + assertStringTrimRight("UNICODE_CI", "ddsXXXaa", "AsD", "ddsXXX"); + + // Case-variable character length - UTF8_BINARY. 
assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ"); - - assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa"); - assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); - assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); - assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß"); - - assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa"); - assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß"); - assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa"); - - // different byte-length (utf8) chars trimmed assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ"); assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa"); - + // Case-variable character length - UTF8_LCASE. + assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa"); + assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ"); + assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa"); + assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa"); + assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß"); + assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa"); assertStringTrim("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaa"); assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + // Case-variable character length - UNICODE. 
+ assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ"); + assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß"); + assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + // Case-variable character length - UNICODE_CI. + assertStringTrim("UNICODE_CI", "ẞaaaẞ", "ß", "aaa"); + assertStringTrimLeft("UNICODE_CI", "ẞaaaẞ", "ß", "aaaẞ"); + assertStringTrimRight("UNICODE_CI", "ẞaaaẞ", "ß", "ẞaaa"); + assertStringTrim("UNICODE_CI", "ßaaaß", "ẞ", "aaa"); + assertStringTrimLeft("UNICODE_CI", "ßaaaß", "ẞ", "aaaß"); + assertStringTrimRight("UNICODE_CI", "ßaaaß", "ẞ", "ßaaa"); + assertStringTrim("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaa"); + assertStringTrimLeft("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaaẞ"); + assertStringTrimRight("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "Ëaaa"); + + // One-to-many case mapping - UTF8_BINARY. 
+ assertStringTrim("UTF8_BINARY", "i", "i", ""); + assertStringTrim("UTF8_BINARY", "iii", "I", "iii"); + assertStringTrim("UTF8_BINARY", "I", "iii", "I"); + assertStringTrim("UTF8_BINARY", "ixi", "i", "x"); + assertStringTrim("UTF8_BINARY", "i", "İ", "i"); + assertStringTrim("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); + assertStringTrim("UTF8_BINARY", "i\u0307", "i", "\u0307"); + assertStringTrim("UTF8_BINARY", "i\u0307", "\u0307", "i"); + assertStringTrim("UTF8_BINARY", "i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307i", "i\u0307", ""); + assertStringTrim("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); + assertStringTrim("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ"); + assertStringTrim("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307"); + assertStringTrim("UTF8_BINARY", "İ", "İ", ""); + assertStringTrim("UTF8_BINARY", "IXi", "İ", "IXi"); + assertStringTrim("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrim("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrim("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); + assertStringTrim("UTF8_BINARY", "İ", "i", "İ"); + assertStringTrim("UTF8_BINARY", "İ", "\u0307", "İ"); + assertStringTrim("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrim("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); + assertStringTrim("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi"); + assertStringTrimLeft("UTF8_BINARY", "i", "i", ""); + assertStringTrimLeft("UTF8_BINARY", "iii", "I", "iii"); + assertStringTrimLeft("UTF8_BINARY", "I", "iii", "I"); + assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi"); + assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UTF8_BINARY", 
"i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "i\u0307", ""); + assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307İ"); + assertStringTrimLeft("UTF8_BINARY", "İ", "İ", ""); + assertStringTrimLeft("UTF8_BINARY", "IXi", "İ", "IXi"); + assertStringTrimLeft("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); + assertStringTrimLeft("UTF8_BINARY", "İ", "i", "İ"); + assertStringTrimLeft("UTF8_BINARY", "İ", "\u0307", "İ"); + assertStringTrimLeft("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimLeft("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimLeft("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimRight("UTF8_BINARY", "i", "i", ""); + assertStringTrimRight("UTF8_BINARY", "iii", "I", "iii"); + assertStringTrimRight("UTF8_BINARY", "I", "iii", "I"); + assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix"); + assertStringTrimRight("UTF8_BINARY", "i", "İ", "i"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i"); + assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307i", "i\u0307", ""); + assertStringTrimRight("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "i\u0307", "i\u0307İ"); + 
assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307"); + assertStringTrimRight("UTF8_BINARY", "İ", "İ", ""); + assertStringTrimRight("UTF8_BINARY", "IXi", "İ", "IXi"); + assertStringTrimRight("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimRight("UTF8_BINARY", "i\u0307x", "ix\u0307İ", ""); + assertStringTrimRight("UTF8_BINARY", "İ", "i", "İ"); + assertStringTrimRight("UTF8_BINARY", "İ", "\u0307", "İ"); + assertStringTrimRight("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi"); + // One-to-many case mapping - UTF8_LCASE. + assertStringTrim("UTF8_LCASE", "i", "i", ""); + assertStringTrim("UTF8_LCASE", "iii", "I", ""); + assertStringTrim("UTF8_LCASE", "I", "iii", ""); + assertStringTrim("UTF8_LCASE", "ixi", "i", "x"); + assertStringTrim("UTF8_LCASE", "i", "İ", "i"); + assertStringTrim("UTF8_LCASE", "i\u0307", "İ", ""); + assertStringTrim("UTF8_LCASE", "i\u0307", "i", "\u0307"); + assertStringTrim("UTF8_LCASE", "i\u0307", "\u0307", "i"); + assertStringTrim("UTF8_LCASE", "i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307i", "i\u0307", ""); + assertStringTrim("UTF8_LCASE", "i\u0307i", "İ", "i"); + assertStringTrim("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ"); + assertStringTrim("UTF8_LCASE", "i\u0307İ", "İ", ""); + assertStringTrim("UTF8_LCASE", "İ", "İ", ""); + assertStringTrim("UTF8_LCASE", "IXi", "İ", "IXi"); + assertStringTrim("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307"); + assertStringTrim("UTF8_LCASE", "i\u0307x", "IXİ", ""); + assertStringTrim("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); + assertStringTrim("UTF8_LCASE", "İ", "i", "İ"); + assertStringTrim("UTF8_LCASE", "İ", "\u0307", "İ"); + 
assertStringTrim("UTF8_LCASE", "Ixİ", "i\u0307", "xİ"); + assertStringTrim("UTF8_LCASE", "IXİ", "ix\u0307", "İ"); + assertStringTrim("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); + assertStringTrimLeft("UTF8_LCASE", "i", "i", ""); + assertStringTrimLeft("UTF8_LCASE", "iii", "I", ""); + assertStringTrimLeft("UTF8_LCASE", "I", "iii", ""); + assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi"); + assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "i\u0307", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "İ", "i"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "İ", ""); + assertStringTrimLeft("UTF8_LCASE", "İ", "İ", ""); + assertStringTrimLeft("UTF8_LCASE", "IXi", "İ", "IXi"); + assertStringTrimLeft("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307"); + assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "IXİ", ""); + assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimLeft("UTF8_LCASE", "İ", "i", "İ"); + assertStringTrimLeft("UTF8_LCASE", "İ", "\u0307", "İ"); + assertStringTrimLeft("UTF8_LCASE", "Ixİ", "i\u0307", "xİ"); + assertStringTrimLeft("UTF8_LCASE", "IXİ", "ix\u0307", "İ"); + assertStringTrimLeft("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); + assertStringTrimRight("UTF8_LCASE", "i", "i", ""); + assertStringTrimRight("UTF8_LCASE", "iii", "I", ""); + assertStringTrimRight("UTF8_LCASE", "I", "iii", ""); + assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix"); + assertStringTrimRight("UTF8_LCASE", "i", "İ", "i"); + 
assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i"); + assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307\u0307", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307i", "i\u0307", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "İ", ""); + assertStringTrimRight("UTF8_LCASE", "İ", "İ", ""); + assertStringTrimRight("UTF8_LCASE", "IXi", "İ", "IXi"); + assertStringTrimRight("UTF8_LCASE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UTF8_LCASE", "i\u0307x", "IXİ", ""); + assertStringTrimRight("UTF8_LCASE", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimRight("UTF8_LCASE", "İ", "i", "İ"); + assertStringTrimRight("UTF8_LCASE", "İ", "\u0307", "İ"); + assertStringTrimRight("UTF8_LCASE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UTF8_LCASE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UTF8_LCASE", "xi\u0307", "\u0307IX", ""); + // One-to-many case mapping - UNICODE. 
+ assertStringTrim("UNICODE", "i", "i", ""); + assertStringTrim("UNICODE", "iii", "I", "iii"); + assertStringTrim("UNICODE", "I", "iii", "I"); + assertStringTrim("UNICODE", "ixi", "i", "x"); + assertStringTrim("UNICODE", "i", "İ", "i"); + assertStringTrim("UNICODE", "i\u0307", "İ", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307", "i", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrim("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrim("UNICODE", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrim("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrim("UNICODE", "i\u0307İ", "İ", "i\u0307"); + assertStringTrim("UNICODE", "İ", "İ", ""); + assertStringTrim("UNICODE", "IXi", "İ", "IXi"); + assertStringTrim("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrim("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrim("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307"); + assertStringTrim("UNICODE", "İ", "i", "İ"); + assertStringTrim("UNICODE", "İ", "\u0307", "İ"); + assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrim("UNICODE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrim("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimLeft("UNICODE", "i", "i", ""); + assertStringTrimLeft("UNICODE", "iii", "I", "iii"); + assertStringTrimLeft("UNICODE", "I", "iii", "I"); + assertStringTrimLeft("UNICODE", "ixi", "i", "xi"); + assertStringTrimLeft("UNICODE", "i", "İ", "i"); + assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE", 
"i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307i", "i\u0307", "i\u0307i"); + assertStringTrimLeft("UNICODE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimLeft("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimLeft("UNICODE", "i\u0307İ", "İ", "i\u0307İ"); + assertStringTrimLeft("UNICODE", "İ", "İ", ""); + assertStringTrimLeft("UNICODE", "IXi", "İ", "IXi"); + assertStringTrimLeft("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimLeft("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimLeft("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307x"); + assertStringTrimLeft("UNICODE", "İ", "i", "İ"); + assertStringTrimLeft("UNICODE", "İ", "\u0307", "İ"); + assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimLeft("UNICODE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimLeft("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); + assertStringTrimRight("UNICODE", "i", "i", ""); + assertStringTrimRight("UNICODE", "iii", "I", "iii"); + assertStringTrimRight("UNICODE", "I", "iii", "I"); + assertStringTrimRight("UNICODE", "ixi", "i", "ix"); + assertStringTrimRight("UNICODE", "i", "İ", "i"); + assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimRight("UNICODE", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UNICODE", 
"i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight("UNICODE", "i\u0307İ", "İ", "i\u0307"); + assertStringTrimRight("UNICODE", "İ", "İ", ""); + assertStringTrimRight("UNICODE", "IXi", "İ", "IXi"); + assertStringTrimRight("UNICODE", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UNICODE", "i\u0307x", "IXİ", "i\u0307x"); + assertStringTrimRight("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307"); + assertStringTrimRight("UNICODE", "İ", "i", "İ"); + assertStringTrimRight("UNICODE", "İ", "\u0307", "İ"); + assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UNICODE", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307"); + // One-to-many case mapping - UNICODE_CI. + assertStringTrim("UNICODE_CI", "i", "i", ""); + assertStringTrim("UNICODE_CI", "iii", "I", ""); + assertStringTrim("UNICODE_CI", "I", "iii", ""); + assertStringTrim("UNICODE_CI", "ixi", "i", "x"); + assertStringTrim("UNICODE_CI", "i", "İ", "i"); + assertStringTrim("UNICODE_CI", "i\u0307", "İ", ""); + assertStringTrim("UNICODE_CI", "i\u0307", "i", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307i", "İ", "i"); + assertStringTrim("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrim("UNICODE_CI", "i\u0307İ", "İ", ""); + assertStringTrim("UNICODE_CI", "İ", "İ", ""); + assertStringTrim("UNICODE_CI", "IXi", "İ", "IXi"); + assertStringTrim("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307"); + assertStringTrim("UNICODE_CI", "i\u0307x", "IXİ", ""); + assertStringTrim("UNICODE_CI", "i\u0307x", 
"I\u0307xİ", ""); + assertStringTrim("UNICODE_CI", "İ", "i", "İ"); + assertStringTrim("UNICODE_CI", "İ", "\u0307", "İ"); + assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrim("UNICODE_CI", "Ixİ", "i\u0307", "xİ"); + assertStringTrim("UNICODE_CI", "IXİ", "ix\u0307", "İ"); + assertStringTrim("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i", "i", ""); + assertStringTrimLeft("UNICODE_CI", "iii", "I", ""); + assertStringTrimLeft("UNICODE_CI", "I", "iii", ""); + assertStringTrimLeft("UNICODE_CI", "ixi", "i", "xi"); + assertStringTrimLeft("UNICODE_CI", "i", "İ", "i"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "İ", ""); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "i", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307i"); + assertStringTrimLeft("UNICODE_CI", "i\u0307i", "İ", "i"); + assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "İ", ""); + assertStringTrimLeft("UNICODE_CI", "İ", "İ", ""); + assertStringTrimLeft("UNICODE_CI", "IXi", "İ", "IXi"); + assertStringTrimLeft("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307"); + assertStringTrimLeft("UNICODE_CI", "i\u0307x", "IXİ", ""); + assertStringTrimLeft("UNICODE_CI", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimLeft("UNICODE_CI", "İ", "i", "İ"); + assertStringTrimLeft("UNICODE_CI", "İ", "\u0307", "İ"); + assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimLeft("UNICODE_CI", "Ixİ", "i\u0307", "xİ"); + assertStringTrimLeft("UNICODE_CI", "IXİ", "ix\u0307", "İ"); + assertStringTrimLeft("UNICODE_CI", "xi\u0307", 
"\u0307IX", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i", "i", ""); + assertStringTrimRight("UNICODE_CI", "iii", "I", ""); + assertStringTrimRight("UNICODE_CI", "I", "iii", ""); + assertStringTrimRight("UNICODE_CI", "ixi", "i", "ix"); + assertStringTrimRight("UNICODE_CI", "i", "İ", "i"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "İ", ""); + assertStringTrimRight("UNICODE_CI", "i\u0307", "i", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307\u0307", "i\u0307", "i\u0307\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307i", "İ", "i\u0307i"); + assertStringTrimRight("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ"); + assertStringTrimRight("UNICODE_CI", "i\u0307İ", "İ", ""); + assertStringTrimRight("UNICODE_CI", "İ", "İ", ""); + assertStringTrimRight("UNICODE_CI", "IXi", "İ", "IXi"); + assertStringTrimRight("UNICODE_CI", "ix\u0307", "Ixİ", "ix\u0307"); + assertStringTrimRight("UNICODE_CI", "i\u0307x", "IXİ", ""); + assertStringTrimRight("UNICODE_CI", "i\u0307x", "I\u0307xİ", ""); + assertStringTrimRight("UNICODE_CI", "İ", "i", "İ"); + assertStringTrimRight("UNICODE_CI", "İ", "\u0307", "İ"); + assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307"); + assertStringTrimRight("UNICODE_CI", "Ixİ", "i\u0307", "Ixİ"); + assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ"); + assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307"); + + // Greek sigmas - UTF8_BINARY. 
+ assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς"); + assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x"); + assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς"); + assertStringTrim("UTF8_BINARY", "σxσ", "σ", "x"); + assertStringTrim("UTF8_BINARY", "σxσ", "ς", "σxσ"); + assertStringTrim("UTF8_BINARY", "σxσ", "Σ", "σxσ"); + assertStringTrim("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrim("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrim("UTF8_BINARY", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UTF8_BINARY", "ςxς", "σ", "ςxς"); + assertStringTrimLeft("UTF8_BINARY", "ςxς", "ς", "xς"); + assertStringTrimLeft("UTF8_BINARY", "ςxς", "Σ", "ςxς"); + assertStringTrimLeft("UTF8_BINARY", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UTF8_BINARY", "σxσ", "ς", "σxσ"); + assertStringTrimLeft("UTF8_BINARY", "σxσ", "Σ", "σxσ"); + assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UTF8_BINARY", "ςxς", "σ", "ςxς"); + assertStringTrimRight("UTF8_BINARY", "ςxς", "ς", "ςx"); + assertStringTrimRight("UTF8_BINARY", "ςxς", "Σ", "ςxς"); + assertStringTrimRight("UTF8_BINARY", "σxσ", "σ", "σx"); + assertStringTrimRight("UTF8_BINARY", "σxσ", "ς", "σxσ"); + assertStringTrimRight("UTF8_BINARY", "σxσ", "Σ", "σxσ"); + assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx"); + // Greek sigmas - UTF8_LCASE. 
+ assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x"); + assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x"); + assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x"); + assertStringTrim("UTF8_LCASE", "σxσ", "σ", "x"); + assertStringTrim("UTF8_LCASE", "σxσ", "ς", "x"); + assertStringTrim("UTF8_LCASE", "σxσ", "Σ", "x"); + assertStringTrim("UTF8_LCASE", "ΣxΣ", "σ", "x"); + assertStringTrim("UTF8_LCASE", "ΣxΣ", "ς", "x"); + assertStringTrim("UTF8_LCASE", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UTF8_LCASE", "ςxς", "σ", "xς"); + assertStringTrimLeft("UTF8_LCASE", "ςxς", "ς", "xς"); + assertStringTrimLeft("UTF8_LCASE", "ςxς", "Σ", "xς"); + assertStringTrimLeft("UTF8_LCASE", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UTF8_LCASE", "σxσ", "ς", "xσ"); + assertStringTrimLeft("UTF8_LCASE", "σxσ", "Σ", "xσ"); + assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "σ", "xΣ"); + assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "ς", "xΣ"); + assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UTF8_LCASE", "ςxς", "σ", "ςx"); + assertStringTrimRight("UTF8_LCASE", "ςxς", "ς", "ςx"); + assertStringTrimRight("UTF8_LCASE", "ςxς", "Σ", "ςx"); + assertStringTrimRight("UTF8_LCASE", "σxσ", "σ", "σx"); + assertStringTrimRight("UTF8_LCASE", "σxσ", "ς", "σx"); + assertStringTrimRight("UTF8_LCASE", "σxσ", "Σ", "σx"); + assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx"); + assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx"); + assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx"); + // Greek sigmas - UNICODE. 
+ assertStringTrim("UNICODE", "ςxς", "σ", "ςxς"); + assertStringTrim("UNICODE", "ςxς", "ς", "x"); + assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς"); + assertStringTrim("UNICODE", "σxσ", "σ", "x"); + assertStringTrim("UNICODE", "σxσ", "ς", "σxσ"); + assertStringTrim("UNICODE", "σxσ", "Σ", "σxσ"); + assertStringTrim("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrim("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrim("UNICODE", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UNICODE", "ςxς", "σ", "ςxς"); + assertStringTrimLeft("UNICODE", "ςxς", "ς", "xς"); + assertStringTrimLeft("UNICODE", "ςxς", "Σ", "ςxς"); + assertStringTrimLeft("UNICODE", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UNICODE", "σxσ", "ς", "σxσ"); + assertStringTrimLeft("UNICODE", "σxσ", "Σ", "σxσ"); + assertStringTrimLeft("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimLeft("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimLeft("UNICODE", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UNICODE", "ςxς", "σ", "ςxς"); + assertStringTrimRight("UNICODE", "ςxς", "ς", "ςx"); + assertStringTrimRight("UNICODE", "ςxς", "Σ", "ςxς"); + assertStringTrimRight("UNICODE", "σxσ", "σ", "σx"); + assertStringTrimRight("UNICODE", "σxσ", "ς", "σxσ"); + assertStringTrimRight("UNICODE", "σxσ", "Σ", "σxσ"); + assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ"); + assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ"); + assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx"); + // Greek sigmas - UNICODE_CI. 
+ assertStringTrim("UNICODE_CI", "ςxς", "σ", "x"); + assertStringTrim("UNICODE_CI", "ςxς", "ς", "x"); + assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x"); + assertStringTrim("UNICODE_CI", "σxσ", "σ", "x"); + assertStringTrim("UNICODE_CI", "σxσ", "ς", "x"); + assertStringTrim("UNICODE_CI", "σxσ", "Σ", "x"); + assertStringTrim("UNICODE_CI", "ΣxΣ", "σ", "x"); + assertStringTrim("UNICODE_CI", "ΣxΣ", "ς", "x"); + assertStringTrim("UNICODE_CI", "ΣxΣ", "Σ", "x"); + assertStringTrimLeft("UNICODE_CI", "ςxς", "σ", "xς"); + assertStringTrimLeft("UNICODE_CI", "ςxς", "ς", "xς"); + assertStringTrimLeft("UNICODE_CI", "ςxς", "Σ", "xς"); + assertStringTrimLeft("UNICODE_CI", "σxσ", "σ", "xσ"); + assertStringTrimLeft("UNICODE_CI", "σxσ", "ς", "xσ"); + assertStringTrimLeft("UNICODE_CI", "σxσ", "Σ", "xσ"); + assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "σ", "xΣ"); + assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "ς", "xΣ"); + assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "Σ", "xΣ"); + assertStringTrimRight("UNICODE_CI", "ςxς", "σ", "ςx"); + assertStringTrimRight("UNICODE_CI", "ςxς", "ς", "ςx"); + assertStringTrimRight("UNICODE_CI", "ςxς", "Σ", "ςx"); + assertStringTrimRight("UNICODE_CI", "σxσ", "σ", "σx"); + assertStringTrimRight("UNICODE_CI", "σxσ", "ς", "σx"); + assertStringTrimRight("UNICODE_CI", "σxσ", "Σ", "σx"); + assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx"); + assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx"); + assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx"); + + // Unicode normalization - UTF8_BINARY. + assertStringTrim("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimLeft("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimRight("UTF8_BINARY", "åβγδa\u030A", "å", "åβγδa\u030A"); + // Unicode normalization - UTF8_LCASE. 
+ assertStringTrim("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimLeft("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimRight("UTF8_LCASE", "åβγδa\u030A", "Å", "åβγδa\u030A"); + // Unicode normalization - UNICODE. + assertStringTrim("UNICODE", "åβγδa\u030A", "å", "βγδ"); + assertStringTrimLeft("UNICODE", "åβγδa\u030A", "å", "βγδa\u030A"); + assertStringTrimRight("UNICODE", "åβγδa\u030A", "å", "åβγδ"); + // Unicode normalization - UNICODE_CI. + assertStringTrim("UNICODE_CI", "åβγδa\u030A", "Å", "βγδ"); + assertStringTrimLeft("UNICODE_CI", "åβγδa\u030A", "Å", "βγδa\u030A"); + assertStringTrimRight("UNICODE_CI", "åβγδa\u030A", "Å", "åβγδ"); } private void assertStringTranslate( diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 1302ca80e51a3..fec782002bb7b 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UPPER_OR_LO import org.apache.spark.sql.catalyst.util.{ArrayData, CharsetProvider, CollationFactory, CollationSupport, GenericArrayData, TypeUtils} import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation, StringTypeBinaryLcase} +import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.UTF8StringBuilder import org.apache.spark.unsafe.array.ByteArrayMethods @@ -1232,7 +1232,7 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { override def children: 
Seq[Expression] = srcStr +: trimStr.toSeq override def dataType: DataType = srcStr.dataType - override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeBinaryLcase) + override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeAnyCollation) final lazy val collationId: Int = srcStr.dataType.asInstanceOf[StringType].collationId @@ -1260,11 +1260,11 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes { if (evals.length == 1) { val stringTrimCode: String = this match { case _: StringTrim => - CollationSupport.StringTrim.genCode(srcString.value, collationId) + CollationSupport.StringTrim.genCode(srcString.value) case _: StringTrimLeft => - CollationSupport.StringTrimLeft.genCode(srcString.value, collationId) + CollationSupport.StringTrimLeft.genCode(srcString.value) case _: StringTrimRight => - CollationSupport.StringTrimRight.genCode(srcString.value, collationId) + CollationSupport.StringTrimRight.genCode(srcString.value) } ev.copy(code = code""" |${srcString.code} @@ -1390,7 +1390,7 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None) override protected def direction: String = "BOTH" override def doEval(srcString: UTF8String): UTF8String = - CollationSupport.StringTrim.exec(srcString, collationId) + CollationSupport.StringTrim.exec(srcString) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = CollationSupport.StringTrim.exec(srcString, trimString, collationId) @@ -1497,7 +1497,7 @@ case class StringTrimLeft(srcStr: Expression, trimStr: Option[Expression] = None override protected def direction: String = "LEADING" override def doEval(srcString: UTF8String): UTF8String = - CollationSupport.StringTrimLeft.exec(srcString, collationId) + CollationSupport.StringTrimLeft.exec(srcString) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = CollationSupport.StringTrimLeft.exec(srcString, trimString, collationId) @@ 
-1557,7 +1557,7 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non override protected def direction: String = "TRAILING" override def doEval(srcString: UTF8String): UTF8String = - CollationSupport.StringTrimRight.exec(srcString, collationId) + CollationSupport.StringTrimRight.exec(srcString) override def doEval(srcString: UTF8String, trimString: UTF8String): UTF8String = CollationSupport.StringTrimRight.exec(srcString, trimString, collationId) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 5f722b2f01fb5..815a8bc595294 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -875,6 +875,37 @@ class CollationStringExpressionsSuite } test("StringTrim* functions - unit tests for both paths (codegen and eval)") { + def evalStringTrim(src: Any, trim: Any, result: String): Unit = { + Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { collation => + val dt: DataType = StringType(collation) + checkEvaluation(StringTrim(Literal.create(src, dt), Literal.create(trim, dt)), result) + checkEvaluation(StringTrimLeft(Literal.create(src, dt), Literal.create(trim, dt)), result) + checkEvaluation(StringTrimRight(Literal.create(src, dt), Literal.create(trim, dt)), result) + } + } + // General edge cases and basic tests. 
+ evalStringTrim(null, null, null) + evalStringTrim(null, "", null) + evalStringTrim(null, "a", null) + evalStringTrim("", null, null) + evalStringTrim("a", null, null) + evalStringTrim("", "", "") + evalStringTrim("", " ", "") + evalStringTrim("", "a", "") + evalStringTrim("", "aaa", "") + evalStringTrim(" ", "", " ") + evalStringTrim("a", "", "a") + evalStringTrim("aaa", "", "aaa") + evalStringTrim(" ", " ", "") + evalStringTrim(" ", " ", "") + evalStringTrim(" ", " ", "") + evalStringTrim(" ", " ", "") + evalStringTrim("a", "aaa", "") + evalStringTrim("aaa", "a", "") + evalStringTrim("aaa", "aaa", "") + evalStringTrim("abc", "cba", "") + evalStringTrim("cba", "abc", "") + // Without trimString param. checkEvaluation( StringTrim(Literal.create( " asd ", StringType("UTF8_BINARY"))), "asd") @@ -1019,20 +1050,6 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } - test("StringTrim* functions - unsupported collation types") { - List("TRIM", "LTRIM", "RTRIM").foreach(func => { - val collationMismatch = intercept[AnalysisException] { - sql("SELECT " + func + "(COLLATE('x', 'UNICODE_CI'), COLLATE('xxaaaxx', 'UNICODE_CI'))") - } - assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - }) - - val collationMismatch = intercept[AnalysisException] { - sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UNICODE_CI'), COLLATE('x', 'UNICODE_CI'))") - } - assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") - } - // TODO: Add more tests for other string expressions }