Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48281][SQL] Alter string search logic for UTF8_BINARY_LCASE collation (StringInStr, SubstringIndex) #46589

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,155 @@
* Utility class for collation-aware UTF8String operations.
*/
public class CollationAwareUTF8String {

/**
* The constant value to indicate that the match is not found when searching for a pattern
* string in a target string.
*/
private static final int MATCH_NOT_FOUND = -1;

/**
* Returns whether the target string starts with the specified prefix, starting from the
* specified position (0-based index referring to character position in UTF8String), with respect
* to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased
* prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
* same prefix string.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
*/
public static boolean lowercaseMatchFrom(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  // Delegate to the length lookup; any result other than the MATCH_NOT_FOUND sentinel
  // (including a zero-length match) counts as a successful prefix match.
  final int matchedLength = lowercaseMatchLengthFrom(target, lowercasePattern, startPos);
  return matchedLength != MATCH_NOT_FOUND;
}

/**
* Returns the length of the substring of the target string that starts with the specified
* prefix, starting from the specified position (0-based index referring to character position
* in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* prefix is already lowercased. The method only considers the part of target string that
* starts from the specified (inclusive) position (that is, the method does not look at UTF8
* characters of the target string before position `startPos`). If the prefix is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return length of the target substring that starts with the specified prefix in lowercase
*/
public static int lowercaseMatchLengthFrom(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  assert startPos >= 0;
  // Longest candidate: every character from startPos up to the end of the target.
  final int maxLength = target.numChars() - startPos;
  // Candidates are tried shortest first, so the smallest matching length is returned.
  int candidateLength = 0;
  while (candidateLength <= maxLength) {
    final UTF8String candidate = target.substring(startPos, startPos + candidateLength);
    // The comparison is done on the lowercased candidate, not character by character.
    if (candidate.toLowerCase().equals(lowercasePattern)) {
      return candidateLength;
    }
    candidateLength++;
  }
  return MATCH_NOT_FOUND;
}

/**
* Returns the position of the first occurrence of the pattern string in the target string,
* starting from the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* pattern string is already lowercased prior to call. If the pattern is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return the position of the first occurrence of pattern in target
*/
public static int lowercaseFind(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int startPos) {
  assert startPos >= 0;
  // The bound is inclusive: the position just past the last character is probed as well.
  final int lastCandidate = target.numChars();
  int candidateStart = startPos;
  while (candidateStart <= lastCandidate) {
    // Scanning left to right means the first hit is the leftmost match at or after startPos.
    final boolean matchesHere = lowercaseMatchFrom(target, lowercasePattern, candidateStart);
    if (matchesHere) {
      return candidateStart;
    }
    candidateStart++;
  }
  return MATCH_NOT_FOUND;
}

/**
* Returns whether the target string ends with the specified suffix, ending at the specified
* position (0-based index referring to character position in UTF8String), with respect to the
* UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior
* to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
* suffix string.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
* @return whether the target string ends with the specified suffix in lowercase
*/
public static boolean lowercaseMatchUntil(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  // Delegate to the length lookup; any result other than the MATCH_NOT_FOUND sentinel
  // (including a zero-length match) counts as a successful suffix match.
  final int matchedLength = lowercaseMatchLengthUntil(target, lowercasePattern, endPos);
  return matchedLength != MATCH_NOT_FOUND;
}

/**
* Returns the length of the substring of the target string that ends with the specified
* suffix, ending at the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* suffix is already lowercased. The method only considers the part of target string that ends
* at the specified (non-inclusive) position (that is, the method does not look at UTF8
* characters of the target string at or after position `endPos`). If the suffix is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
* @return length of the target substring that ends with the specified suffix in lowercase
*/
public static int lowercaseMatchLengthUntil(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  assert endPos <= target.numChars();
  // Grow the candidate backwards from endPos (exclusive), one character at a time; the
  // shortest candidate whose lowercase form equals the pattern determines the result.
  int candidateLength = 0;
  while (candidateLength <= endPos) {
    final UTF8String candidate = target.substring(endPos - candidateLength, endPos);
    if (candidate.toLowerCase().equals(lowercasePattern)) {
      return candidateLength;
    }
    candidateLength++;
  }
  return MATCH_NOT_FOUND;
}

/**
* Returns the end position (exclusive) of the last occurrence of the pattern string in the
* target string that ends at or before the specified position (0-based index referring to
* character position in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The
* method assumes that the pattern string is already lowercased prior to call. If the pattern
* is not found, MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param endPos the end position for searching (in the target string)
* @return the end position (exclusive) of the last occurrence of pattern in target
*/
public static int lowercaseRFind(
    final UTF8String target,
    final UTF8String lowercasePattern,
    int endPos) {
  assert endPos <= target.numChars();
  // Walk candidate end positions from right to left; the value handed back is the
  // (exclusive) end position at which the pattern was found to match, not its start.
  int candidateEnd = endPos;
  while (candidateEnd >= 0) {
    final boolean matchesHere = lowercaseMatchUntil(target, lowercasePattern, candidateEnd);
    if (matchesHere) {
      return candidateEnd;
    }
    candidateEnd--;
  }
  return MATCH_NOT_FOUND;
}

public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -183,10 +332,27 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
return 0;
}

/**
* Returns the position of the first occurrence of the pattern string in the target string,
* starting from the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
* @param pattern the string to be searched for
* @param start the start position for searching (in the target string)
* @return the position of the first occurrence of pattern in target
*/
public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern,
    final int start) {
  // An empty pattern is not searched for; its position is delegated to indexOfEmpty.
  final boolean emptyPattern = pattern.numChars() == 0;
  if (emptyPattern) {
    return target.indexOfEmpty(start);
  }
  // lowercaseFind expects the pattern to be lowercased already, so do it once up front.
  final UTF8String lowercasePattern = pattern.toLowerCase();
  return lowercaseFind(target, lowercasePattern, start);
}

public static int indexOf(final UTF8String target, final UTF8String pattern,
final int start, final int collationId) {
if (pattern.numBytes() == 0) {
return 0;
return target.indexOfEmpty(start);
}

StringSearch stringSearch = CollationFactory.getStringSearch(target, pattern, collationId);
Expand Down Expand Up @@ -278,47 +444,27 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string,
return UTF8String.EMPTY_UTF8;
}

UTF8String lowercaseString = string.toLowerCase();
UTF8String lowercaseDelimiter = delimiter.toLowerCase();

if (count > 0) {
int idx = -1;
// Search left to right (note: the start code point is inclusive).
int matchLength = -1;
while (count > 0) {
idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
if (idx >= 0) {
count--;
} else {
// can not find enough delim
return string;
}
}
if (idx == 0) {
return UTF8String.EMPTY_UTF8;
matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1);
if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter.
else return string; // Cannot find enough delimiters in the string.
}
byte[] bytes = new byte[idx];
copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx);
return UTF8String.fromBytes(bytes);

return string.substring(0, matchLength);
} else {
int idx = string.numBytes() - delimiter.numBytes() + 1;
// Search right to left (note: the end code point is exclusive).
int matchLength = string.numChars() + 1;
count = -count;
while (count > 0) {
idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1);
if (idx >= 0) {
count--;
} else {
// can not find enough delim
return string;
}
matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1);
if (matchLength > MATCH_NOT_FOUND) --count; // Found a delimiter.
else return string; // Cannot find enough delimiters in the string.
}
if (idx + delimiter.numBytes() == string.numBytes()) {
return UTF8String.EMPTY_UTF8;
}
int size = string.numBytes() - delimiter.numBytes() - idx;
byte[] bytes = new byte[size];
copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(),
bytes, BYTE_ARRAY_OFFSET, size);
return UTF8String.fromBytes(bytes);
return string.substring(matchLength, string.numChars());
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,7 @@ public static int execBinary(final UTF8String string, final UTF8String substring
return string.indexOf(substring, 0);
}
// Locates `substring` inside `string` for the lowercase variant of this operation, searching
// from position 0.
// NOTE(review): two return statements appear back to back below. This looks like the removed
// and the added line of a diff hunk rendered without +/- markers (the first being the
// pre-change implementation, the second its replacement via CollationAwareUTF8String) —
// confirm against the actual file before relying on either line.
public static int execLowercase(final UTF8String string, final UTF8String substring) {
return string.toLowerCase().indexOf(substring.toLowerCase(), 0);
return CollationAwareUTF8String.lowercaseIndexOf(string, substring, 0);
}
public static int execICU(final UTF8String string, final UTF8String substring,
final int collationId) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,17 @@ public UTF8String repeat(int times) {
return UTF8String.fromBytes(newBytes);
}

/**
* Returns the (default) position of the first occurrence of an empty substr in the current
* string from the specified position (0-based index).
*
* @param start the start position of the current string for searching
* @return the position of the first occurrence of the empty substr (now, always 0)
*/
public int indexOfEmpty(int start) {
  // TODO: Fix this behaviour (SPARK-48284)
  // The start argument is currently not consulted; the result is always the constant 0.
  final int defaultPosition = 0;
  return defaultPosition;
}

/**
* Returns the position of the first occurrence of substr in
* current string from the specified position (0-based index).
Expand All @@ -901,7 +912,7 @@ public UTF8String repeat(int times) {
*/
public int indexOf(UTF8String v, int start) {
if (v.numBytes() == 0) {
return 0;
return indexOfEmpty(start);
}

// locate to the start position.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

import static org.junit.jupiter.api.Assertions.*;


// checkstyle.off: AvoidEscapedUnicodeCharacters
public class CollationSupportSuite {

/**
Expand Down Expand Up @@ -567,8 +567,28 @@ public void testStringInstr() throws SparkException {
assertStringInstr("aaads", "dS", "UNICODE_CI", 4);
assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0);
assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8);
assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3);
assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
assertStringInstr("i̇", "i", "UNICODE_CI", 0);
assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0);
assertStringInstr("i̇", "İ", "UNICODE_CI", 1);
assertStringInstr("İ", "i", "UNICODE_CI", 0);
assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1);
assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1);
assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3);
assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3);
assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5);
assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7);
assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // != UNICODE_CI
assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // != UNICODE_CI
assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1);
assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0);
assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1);
assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1);
assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3);
assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3);
uros-db marked this conversation as resolved.
Show resolved Hide resolved
assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_BINARY_LCASE", 3);
assertStringInstr("ai̇oxXİo", "Xx", "UTF8_BINARY_LCASE", 5);
assertStringInstr("abİoi̇o", "\u0307o", "UTF8_BINARY_LCASE", 6);
assertStringInstr("aİoi̇oxx", "XX", "UTF8_BINARY_LCASE", 7);
}

private void assertFindInSet(String word, String set, String collationName,
Expand Down Expand Up @@ -798,6 +818,32 @@ public void testSubstringIndex() throws SparkException {
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12");
assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12");
assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab");
assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12");
assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI
assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI
assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab");
assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo");
uros-db marked this conversation as resolved.
Show resolved Hide resolved
assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12");
uros-db marked this conversation as resolved.
Show resolved Hide resolved
assertSubstringIndex("bİoi̇o12i̇o", "\u0307oi", 1, "UTF8_BINARY_LCASE", "bİoi̇o12i̇o");
}

private void assertStringTrim(
Expand Down Expand Up @@ -1008,3 +1054,4 @@ public void testStringTrim() throws SparkException {
// TODO: Test other collation-aware expressions.

}
// checkstyle.on: AvoidEscapedUnicodeCharacters