diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 0d0094d8d0a03..a6e96003ec34d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -345,14 +345,14 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co */ public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, final int start) { - if (pattern.numChars() == 0) return 0; + if (pattern.numChars() == 0) return target.indexOfEmpty(start); return lowercaseFind(target, pattern.toLowerCase(), start); } public static int indexOf(final UTF8String target, final UTF8String pattern, final int start, final int collationId) { if (pattern.numBytes() == 0) { - return 0; + return target.indexOfEmpty(start); } StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId); @@ -444,47 +444,27 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // Search left to right (note: the start code point is inclusive). + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter. + else return string; // Cannot find enough delimiters in the string. } - byte[] bytes = new byte[idx]; - copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); - return UTF8String.fromBytes(bytes); - + return string.substring(0, matchLength); } else { - int idx = string.numBytes() - delimiter.numBytes() + 1; + // Search right to left (note: the end code point is exclusive). + int matchLength = string.numChars() + 1; count = -count; while (count > 0) { - idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } + matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); + if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter. + else return string; // Cannot find enough delimiters in the string. } - if (idx + delimiter.numBytes() == string.numBytes()) { - return UTF8String.EMPTY_UTF8; - } - int size = string.numBytes() - delimiter.numBytes() - idx; - byte[] bytes = new byte[size]; - copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(), - bytes, BYTE_ARRAY_OFFSET, size); - return UTF8String.fromBytes(bytes); + return string.substring(matchLength, string.numChars()); } } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 8f7aed30464cc..d5bcc61bac2af 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring return string.indexOf(substring, 0); } public static int execLowercase(final UTF8String string, final UTF8String substring) { - return string.toLowerCase().indexOf(substring.toLowerCase(), 0); + return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0); } public static int execICU(final UTF8String string, final UTF8String substring, final int collationId) { diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 03286e0635287..e28dfa910b59e 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -773,6 +773,17 @@ public UTF8String repeat(int times) { return UTF8String.fromBytes(newBytes); } + /** + * Returns the (default) position of the first occurrence of an empty substr in the current + * string from the specified position (0-based index). + * + * @param start the start position of the current string for searching + * @return the position of the first occurrence of the empty substr (now, always 0) + */ + public int indexOfEmpty(int start) { + return 0; // TODO: Fix this behaviour (SPARK-48284) + } + /** * Returns the position of the first occurrence of substr in * current string from the specified position (0-based index). @@ -783,7 +794,7 @@ public UTF8String repeat(int times) { */ public int indexOf(UTF8String v, int start) { if (v.numBytes() == 0) { - return 0; + return indexOfEmpty(start); } // locate to the start position. diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index eb18d7665b092..d5045721f941e 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -635,8 +635,28 @@ public void testStringInstr() throws SparkException { assertStringInstr("aaads", "dS", "UNICODE_CI", 4); assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); - assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3); - assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3); + assertStringInstr("i̇", "i", "UNICODE_CI", 0); + assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0); + assertStringInstr("i̇", "İ", "UNICODE_CI", 1); + assertStringInstr("İ", "i", "UNICODE_CI", 0); + assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1); + assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); + assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3); + assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); + assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); + assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // != UNICODE_CI + assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // != UNICODE_CI + assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1); + assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0); + assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1); + assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1); + assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3); + assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3); + assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_BINARY_LCASE", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5); + assertStringInstr("abİoi̇o", "\u0307o", "UTF8_BINARY_LCASE", 6); + assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7); } private void assertFindInSet(String word, String set, String collationName, @@ -878,6 +898,32 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o"); } private void assertStringTrim(