[SPARK-48282][SQL] Alter string search logic for UTF8_BINARY_LCASE co…

…llation (StringReplace, FindInSet) ### What changes were proposed in this pull request? String searching in UTF8_BINARY_LCASE now works on character-level, rather than on byte-level. For example: `replace("İ", "i")`; now returns "İ", because there exists no `start, len` such that `lowercase(substring("İ", start, len)) == "i"`. ### Why are the changes needed? Fix functions that give unusable results due to one-to-many case mapping when performing string search under UTF8_BINARY_LCASE (see example above). ### Does this PR introduce _any_ user-facing change? Yes, behaviour of `replace` expression is changed for edge cases with one-to-many case mapping. ### How was this patch tested? New unit tests for StringReplace and FindInSet in `CollationSupportSuite`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes apache#46682 from uros-db/alter-lcase-vol3. Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Signed-off-by: Wenchen Fan <wenchen@databricks.com>
attilapiros · Oct 4, 2024 · a3161b4 · a3161b4
1 parent 334253f
commit a3161b4
Show file tree

Hide file tree

Showing 3 changed files with 177 additions and 94 deletions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -231,6 +231,19 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
     return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right));
   }
 
+  /*
+   * Performs string replacement for ICU collations by searching for instances of the search
+   * string in the `src` string, with respect to the specified collation, and then replacing
+   * them with the replace string. The method returns a new UTF8String with all instances of the
+   * search string replaced using the replace string. Similar to UTF8String.findInSet behavior
+   * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
+   *
+   * @param src the string to be searched in
+   * @param search the string to be searched for
+   * @param replace the string to be used as replacement
+   * @param collationId the collation ID to use for string search
+   * @return the position of the first occurrence of `match` in `set`
+   */
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -286,49 +299,47 @@ public static UTF8String replace(final UTF8String src, final UTF8String search,
     return buf.build();
   }
 
+  /*
+   * Performs string replacement for UTF8_LCASE collation by searching for instances of the search
+   * string in the src string, with respect to lowercased string versions, and then replacing
+   * them with the replace string. The method returns a new UTF8String with all instances of the
+   * search string replaced using the replace string. Similar to UTF8String.findInSet behavior
+   * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
+   *
+   * @param src the string to be searched in
+   * @param search the string to be searched for
+   * @param replace the string to be used as replacement
+   * @param collationId the collation ID to use for string search
+   * @return the position of the first occurrence of `match` in `set`
+   */
   public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search,
       final UTF8String replace) {
     if (src.numBytes() == 0 || search.numBytes() == 0) {
       return src;
     }
-    UTF8String lowercaseString = src.toLowerCase();
+
+    // TODO(SPARK-48725): Use lowerCaseCodePoints instead of UTF8String.toLowerCase.
     UTF8String lowercaseSearch = search.toLowerCase();
 
     int start = 0;
-    int end = lowercaseString.indexOf(lowercaseSearch, 0);
+    int end = lowercaseFind(src, lowercaseSearch, start);
     if (end == -1) {
       // Search string was not found, so string is unchanged.
       return src;
     }
 
-    // Initialize byte positions
-    int c = 0;
-    int byteStart = 0; // position in byte
-    int byteEnd = 0; // position in byte
-    while (byteEnd < src.numBytes() && c < end) {
-      byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
-      c += 1;
-    }
-
     // At least one match was found. Estimate space needed for result.
     // The 16x multiplier here is chosen to match commons-lang3's implementation.
     int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
     final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
     while (end != -1) {
-      buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
+      buf.append(src.substring(start, end));
       buf.append(replace);
       // Update character positions
-      start = end + lowercaseSearch.numChars();
-      end = lowercaseString.indexOf(lowercaseSearch, start);
-      // Update byte positions
-      byteStart = byteEnd + search.numBytes();
-      while (byteEnd < src.numBytes() && c < end) {
-        byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
-        c += 1;
-      }
+      start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
+      end = lowercaseFind(src, lowercaseSearch, start);
     }
-    buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
-      src.numBytes() - byteStart);
+    buf.append(src.substring(start, src.numChars()));
     return buf.build();
   }
 
@@ -344,7 +355,7 @@ public static UTF8String toUpperCase(final UTF8String target) {
   }
 
   private static UTF8String toUpperCaseSlow(final UTF8String target) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to uppercase, which only accepts a Java strings as input.
     // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
     return UTF8String.fromString(UCharacter.toUpperCase(target.toString()));
@@ -362,7 +373,7 @@ public static UTF8String toUpperCase(final UTF8String target, final int collatio
   }
 
   private static UTF8String toUpperCaseSlow(final UTF8String target, final int collationId) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to uppercase, which only accepts a Java strings as input.
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
@@ -382,7 +393,7 @@ public static UTF8String toLowerCase(final UTF8String target) {
   }
 
   private static UTF8String toLowerCaseSlow(final UTF8String target) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to lowercase, which only accepts a Java strings as input.
     // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
     return UTF8String.fromString(UCharacter.toLowerCase(target.toString()));
@@ -400,7 +411,7 @@ public static UTF8String toLowerCase(final UTF8String target, final int collatio
   }
 
   private static UTF8String toLowerCaseSlow(final UTF8String target, final int collationId) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to lowercase, which only accepts a Java strings as input.
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
@@ -461,7 +472,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) {
    * Convert the input string to titlecase using the ICU root locale rules.
    */
   public static UTF8String toTitleCase(final UTF8String target) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to titlecase, which only accepts a Java strings as input.
     // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
     return UTF8String.fromString(UCharacter.toTitleCase(target.toString(),
@@ -479,34 +490,40 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio
       BreakIterator.getWordInstance(locale)));
   }
 
+  /*
+   * Returns the position of the first occurrence of the match string in the set string,
+   * counting ASCII commas as delimiters. The match string is compared in a collation-aware manner,
+   * with respect to the specified collation ID. Similar to UTF8String.findInSet behavior used
+   * for UTF8_BINARY collation, the method returns 0 if the match string contains no commas.
+   *
+   * @param match the string to be searched for
+   * @param set the string to be searched in
+   * @param collationId the collation ID to use for string comparison
+   * @return the position of the first occurrence of `match` in `set`
+   */
   public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
+    // If the "word" string contains a comma, FindInSet should return 0.
     if (match.contains(UTF8String.fromString(","))) {
       return 0;
     }
-
-    // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
-    String setString = set.toString();
-    StringSearch stringSearch = CollationFactory.getStringSearch(setString, match.toString(),
-      collationId);
-
-    int wordStart = 0;
-    while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
-      boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ',';
-      boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length()
-        || setString.charAt(wordStart + stringSearch.getMatchLength()) == ',';
-
-      if (isValidStart && isValidEnd) {
-        int pos = 0;
-        for (int i = 0; i < setString.length() && i < wordStart; i++) {
-          if (setString.charAt(i) == ',') {
-            pos++;
-          }
+    // Otherwise, search for commas in "set" and compare each substring with "word".
+    int byteIndex = 0, charIndex = 0, wordCount = 1, lastComma = -1;
+    while (byteIndex < set.numBytes()) {
+      byte nextByte = set.getByte(byteIndex);
+      if (nextByte == (byte) ',') {
+        if (set.substring(lastComma + 1, charIndex).semanticEquals(match, collationId)) {
+          return wordCount;
         }
-
-        return pos + 1;
+        lastComma = charIndex;
+        ++wordCount;
       }
+      byteIndex += UTF8String.numBytesForFirstByte(nextByte);
+      ++charIndex;
     }
-
+    if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) {
+      return wordCount;
+    }
+    // If no match is found, return 0.
     return 0;
   }
 

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -322,31 +322,24 @@ public static int exec(final UTF8String word, final UTF8String set, final int co
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(word, set);
-      } else if (collation.supportsLowercaseEquality) {
-        return execLowercase(word, set);
       } else {
-        return execICU(word, set, collationId);
+        return execCollationAware(word, set, collationId);
       }
     }
     public static String genCode(final String word, final String set, final int collationId) {
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       String expr = "CollationSupport.FindInSet.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s, %s)", word, set);
-      } else if (collation.supportsLowercaseEquality) {
-        return String.format(expr + "Lowercase(%s, %s)", word, set);
       } else {
-        return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId);
+        return String.format(expr + "execCollationAware(%s, %s, %d)", word, set, collationId);
       }
     }
     public static int execBinary(final UTF8String word, final UTF8String set) {
       return set.findInSet(word);
     }
-    public static int execLowercase(final UTF8String word, final UTF8String set) {
-      return set.toLowerCase().findInSet(word.toLowerCase());
-    }
-    public static int execICU(final UTF8String word, final UTF8String set,
-                                  final int collationId) {
+    public static int execCollationAware(final UTF8String word, final UTF8String set,
+        final int collationId) {
       return CollationAwareUTF8String.findInSet(word, set, collationId);
     }
   }