Skip to content

Commit

Permalink
[SPARK-48282][SQL] Alter string search logic for UTF8_BINARY_LCASE co…
Browse files Browse the repository at this point in the history
…llation (StringReplace, FindInSet)

### What changes were proposed in this pull request?
String searching in UTF8_BINARY_LCASE now works on character-level, rather than on byte-level. For example: `replace("İ", "i")`; now returns "İ", because there exists no `start, len` such that `lowercase(substring("İ", start, len)) == "i"`.

### Why are the changes needed?
Fix functions that give unusable results due to one-to-many case mapping when performing string search under UTF8_BINARY_LCASE (see example above).

### Does this PR introduce _any_ user-facing change?
Yes, behaviour of `replace` expression is changed for edge cases with one-to-many case mapping.

### How was this patch tested?
New unit tests for StringReplace and FindInSet in `CollationSupportSuite`.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#46682 from uros-db/alter-lcase-vol3.

Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
uros-db authored and attilapiros committed Oct 4, 2024
1 parent 334253f commit a3161b4
Show file tree
Hide file tree
Showing 3 changed files with 177 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,19 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right));
}

/*
* Performs string replacement for ICU collations by searching for instances of the search
* string in the `src` string, with respect to the specified collation, and then replacing
* them with the replace string. The method returns a new UTF8String with all instances of the
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
* used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
*
* @param src the string to be searched in
* @param search the string to be searched for
* @param replace the string to be used as replacement
* @param collationId the collation ID to use for string search
* @return the position of the first occurrence of `match` in `set`
*/
public static UTF8String replace(final UTF8String src, final UTF8String search,
final UTF8String replace, final int collationId) {
// This collation aware implementation is based on existing implementation on UTF8String
Expand Down Expand Up @@ -286,49 +299,47 @@ public static UTF8String replace(final UTF8String src, final UTF8String search,
return buf.build();
}

/*
* Performs string replacement for UTF8_LCASE collation by searching for instances of the search
* string in the src string, with respect to lowercased string versions, and then replacing
* them with the replace string. The method returns a new UTF8String with all instances of the
* search string replaced using the replace string. Similar to UTF8String.findInSet behavior
* used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
*
* @param src the string to be searched in
* @param search the string to be searched for
* @param replace the string to be used as replacement
* @param collationId the collation ID to use for string search
* @return the position of the first occurrence of `match` in `set`
*/
public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search,
final UTF8String replace) {
if (src.numBytes() == 0 || search.numBytes() == 0) {
return src;
}
UTF8String lowercaseString = src.toLowerCase();

// TODO(SPARK-48725): Use lowerCaseCodePoints instead of UTF8String.toLowerCase.
UTF8String lowercaseSearch = search.toLowerCase();

int start = 0;
int end = lowercaseString.indexOf(lowercaseSearch, 0);
int end = lowercaseFind(src, lowercaseSearch, start);
if (end == -1) {
// Search string was not found, so string is unchanged.
return src;
}

// Initialize byte positions
int c = 0;
int byteStart = 0; // position in byte
int byteEnd = 0; // position in byte
while (byteEnd < src.numBytes() && c < end) {
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
c += 1;
}

// At least one match was found. Estimate space needed for result.
// The 16x multiplier here is chosen to match commons-lang3's implementation.
int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
while (end != -1) {
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
buf.append(src.substring(start, end));
buf.append(replace);
// Update character positions
start = end + lowercaseSearch.numChars();
end = lowercaseString.indexOf(lowercaseSearch, start);
// Update byte positions
byteStart = byteEnd + search.numBytes();
while (byteEnd < src.numBytes() && c < end) {
byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
c += 1;
}
start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
end = lowercaseFind(src, lowercaseSearch, start);
}
buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
src.numBytes() - byteStart);
buf.append(src.substring(start, src.numChars()));
return buf.build();
}

Expand All @@ -344,7 +355,7 @@ public static UTF8String toUpperCase(final UTF8String target) {
}

private static UTF8String toUpperCaseSlow(final UTF8String target) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to uppercase, which only accepts a Java strings as input.
// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
return UTF8String.fromString(UCharacter.toUpperCase(target.toString()));
Expand All @@ -362,7 +373,7 @@ public static UTF8String toUpperCase(final UTF8String target, final int collatio
}

private static UTF8String toUpperCaseSlow(final UTF8String target, final int collationId) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to uppercase, which only accepts a Java strings as input.
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand All @@ -382,7 +393,7 @@ public static UTF8String toLowerCase(final UTF8String target) {
}

private static UTF8String toLowerCaseSlow(final UTF8String target) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to lowercase, which only accepts a Java strings as input.
// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
return UTF8String.fromString(UCharacter.toLowerCase(target.toString()));
Expand All @@ -400,7 +411,7 @@ public static UTF8String toLowerCase(final UTF8String target, final int collatio
}

private static UTF8String toLowerCaseSlow(final UTF8String target, final int collationId) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to lowercase, which only accepts a Java strings as input.
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand Down Expand Up @@ -461,7 +472,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) {
* Convert the input string to titlecase using the ICU root locale rules.
*/
public static UTF8String toTitleCase(final UTF8String target) {
// Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
// Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
// convert the string to titlecase, which only accepts a Java strings as input.
// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
return UTF8String.fromString(UCharacter.toTitleCase(target.toString(),
Expand All @@ -479,34 +490,40 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio
BreakIterator.getWordInstance(locale)));
}

/*
* Returns the position of the first occurrence of the match string in the set string,
* counting ASCII commas as delimiters. The match string is compared in a collation-aware manner,
* with respect to the specified collation ID. Similar to UTF8String.findInSet behavior used
* for UTF8_BINARY collation, the method returns 0 if the match string contains no commas.
*
* @param match the string to be searched for
* @param set the string to be searched in
* @param collationId the collation ID to use for string comparison
* @return the position of the first occurrence of `match` in `set`
*/
public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
// If the "word" string contains a comma, FindInSet should return 0.
if (match.contains(UTF8String.fromString(","))) {
return 0;
}

// TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
String setString = set.toString();
StringSearch stringSearch = CollationFactory.getStringSearch(setString, match.toString(),
collationId);

int wordStart = 0;
while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ',';
boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length()
|| setString.charAt(wordStart + stringSearch.getMatchLength()) == ',';

if (isValidStart && isValidEnd) {
int pos = 0;
for (int i = 0; i < setString.length() && i < wordStart; i++) {
if (setString.charAt(i) == ',') {
pos++;
}
// Otherwise, search for commas in "set" and compare each substring with "word".
int byteIndex = 0, charIndex = 0, wordCount = 1, lastComma = -1;
while (byteIndex < set.numBytes()) {
byte nextByte = set.getByte(byteIndex);
if (nextByte == (byte) ',') {
if (set.substring(lastComma + 1, charIndex).semanticEquals(match, collationId)) {
return wordCount;
}

return pos + 1;
lastComma = charIndex;
++wordCount;
}
byteIndex += UTF8String.numBytesForFirstByte(nextByte);
++charIndex;
}

if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) {
return wordCount;
}
// If no match is found, return 0.
return 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -322,31 +322,24 @@ public static int exec(final UTF8String word, final UTF8String set, final int co
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(word, set);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(word, set);
} else {
return execICU(word, set, collationId);
return execCollationAware(word, set, collationId);
}
}
public static String genCode(final String word, final String set, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.FindInSet.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s, %s)", word, set);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s, %s)", word, set);
} else {
return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId);
return String.format(expr + "execCollationAware(%s, %s, %d)", word, set, collationId);
}
}
public static int execBinary(final UTF8String word, final UTF8String set) {
return set.findInSet(word);
}
public static int execLowercase(final UTF8String word, final UTF8String set) {
return set.toLowerCase().findInSet(word.toLowerCase());
}
public static int execICU(final UTF8String word, final UTF8String set,
final int collationId) {
public static int execCollationAware(final UTF8String word, final UTF8String set,
final int collationId) {
return CollationAwareUTF8String.findInSet(word, set, collationId);
}
}
Expand Down
Loading

0 comments on commit a3161b4

Please sign in to comment.