[SPARK-48576][SQL] Rename UTF8_BINARY_LCASE to UTF8_LCASE

### What changes were proposed in this pull request?
Renaming the `UTF8_BINARY_LCASE` collation to `UTF8_LCASE`.

### Why are the changes needed?
As part of the collation effort in Spark, we've moved away from byte-by-byte logic towards character-by-character logic, so what we used to call `UTF8_BINARY_LCASE` is now more precisely `UTF8_LCASE`. For example, string searching in `UTF8_LCASE` now works at the character level (rather than at the byte level), which is reflected in these PRs: #46511, #46589, #46682, #46761, #46762. In addition, string comparison also works at the character level now, as per the changes introduced in this PR: #46700.

### Does this PR introduce _any_ user-facing change?
Yes, the collation previously named `UTF8_BINARY_LCASE` will from now on be named `UTF8_LCASE`.

### How was this patch tested?
Existing tests.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #46924 from uros-db/rename-lcase.

Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
apache · Jun 11, 2024 · aad6771 · aad6771
1 parent 224ba16
commit aad6771
Show file tree

Hide file tree

Showing 37 changed files with 802 additions and 802 deletions.
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -44,14 +44,14 @@ public class CollationAwareUTF8String {
   /**
    * Returns whether the target string starts with the specified prefix, starting from the
    * specified position (0-based index referring to character position in UTF8String), with respect
-   * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased
+   * to the UTF8_LCASE collation. The method assumes that the prefix is already lowercased
    * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
    * same prefix string.
    *
    * @param target the string to be searched in
    * @param lowercasePattern the string to be searched for
    * @param startPos the start position for searching (in the target string)
-   * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
+   * @return whether the target string starts with the specified prefix in UTF8_LCASE
    */
   public static boolean lowercaseMatchFrom(
       final UTF8String target,
@@ -63,7 +63,7 @@ public static boolean lowercaseMatchFrom(
   /**
    * Returns the length of the substring of the target string that starts with the specified
    * prefix, starting from the specified position (0-based index referring to character position
-   * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * in UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
    * prefix is already lowercased. The method only considers the part of target string that
    * starts from the specified (inclusive) position (that is, the method does not look at UTF8
   * characters of the target string before that start position). If the prefix is not found,
@@ -90,7 +90,7 @@ private static int lowercaseMatchLengthFrom(
   /**
    * Returns the position of the first occurrence of the pattern string in the target string,
    * starting from the specified position (0-based index referring to character position in
-   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
    * pattern string is already lowercased prior to call. If the pattern is not found,
    * MATCH_NOT_FOUND is returned.
    *
@@ -115,7 +115,7 @@ private static int lowercaseFind(
   /**
    * Returns whether the target string ends with the specified suffix, ending at the specified
    * position (0-based index referring to character position in UTF8String), with respect to the
-   * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior
+   * UTF8_LCASE collation. The method assumes that the suffix is already lowercased prior
    * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
    * suffix string.
    *
@@ -134,7 +134,7 @@ public static boolean lowercaseMatchUntil(
   /**
    * Returns the length of the substring of the target string that ends with the specified
    * suffix, ending at the specified position (0-based index referring to character position in
-   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
    * suffix is already lowercased. The method only considers the part of target string that ends
    * at the specified (non-inclusive) position (that is, the method does not look at UTF8
    * characters of the target string at or after position `endPos`). If the suffix is not found,
@@ -161,7 +161,7 @@ private static int lowercaseMatchLengthUntil(
   /**
    * Returns the position of the last occurrence of the pattern string in the target string,
    * ending at the specified position (0-based index referring to character position in
-   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
    * pattern string is already lowercased prior to call. If the pattern is not found,
    * MATCH_NOT_FOUND is returned.
    *
@@ -184,7 +184,7 @@ private static int lowercaseRFind(
   }
 
   /**
-   * Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default
+   * Lowercase UTF8String comparison used for UTF8_LCASE collation. While the default
    * UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this
    * method uses code points to compare the strings in a case-insensitive manner using ICU rules,
    * as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints).
@@ -489,7 +489,7 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
   /**
    * Returns the position of the first occurrence of the pattern string in the target string,
    * starting from the specified position (0-based index referring to character position in
-   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found,
+   * UTF8String), with respect to the UTF8_LCASE collation. If the pattern is not found,
    * MATCH_NOT_FOUND is returned.
    *
    * @param target the string to be searched in

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -134,7 +134,7 @@ public static class Collation {
     /**
      * Support for Lowercase Equality implies that it is possible to check equality on
      * byte by byte level, but only after calling "UTF8String.toLowerCase" on both arguments.
-     * This allows custom collation support for UTF8_BINARY_LCASE collation in various Spark
+     * This allows custom collation support for UTF8_LCASE collation in various Spark
      * expressions, as this particular collation is not supported by the external ICU library.
      */
     public final boolean supportsLowercaseEquality;
@@ -220,7 +220,7 @@ public Collation(
      * ---
      * Some illustrative examples of collation name to ID mapping:
      * - UTF8_BINARY       -> 0
-     * - UTF8_BINARY_LCASE -> 1
+     * - UTF8_LCASE        -> 1
      * - UNICODE           -> 0x20000000
      * - UNICODE_AI        -> 0x20010000
      * - UNICODE_CI        -> 0x20020000
@@ -326,7 +326,7 @@ protected static SparkException collationInvalidNameException(String collationNa
       private static int collationNameToId(String collationName) throws SparkException {
         // Collation names provided by user are treated as case-insensitive.
         String collationNameUpper = collationName.toUpperCase();
-        if (collationNameUpper.startsWith("UTF8_BINARY")) {
+        if (collationNameUpper.startsWith("UTF8_")) {
           return CollationSpecUTF8Binary.collationNameToId(collationName, collationNameUpper);
         } else {
           return CollationSpecICU.collationNameToId(collationName, collationNameUpper);
@@ -339,7 +339,7 @@ private static int collationNameToId(String collationName) throws SparkException
     private static class CollationSpecUTF8Binary extends CollationSpec {
 
       /**
-       * Bit 0 in collation ID having value 0 for plain UTF8_BINARY and 1 for UTF8_BINARY_LCASE
+       * Bit 0 in collation ID having value 0 for plain UTF8_BINARY and 1 for UTF8_LCASE
        * collation.
        */
       private enum CaseSensitivity {
@@ -358,11 +358,11 @@ private enum CaseSensitivity {
 
       private static final int UTF8_BINARY_COLLATION_ID =
         new CollationSpecUTF8Binary(CaseSensitivity.UNSPECIFIED).collationId;
-      private static final int UTF8_BINARY_LCASE_COLLATION_ID =
+      private static final int UTF8_LCASE_COLLATION_ID =
         new CollationSpecUTF8Binary(CaseSensitivity.LCASE).collationId;
       protected static Collation UTF8_BINARY_COLLATION =
         new CollationSpecUTF8Binary(CaseSensitivity.UNSPECIFIED).buildCollation();
-      protected static Collation UTF8_BINARY_LCASE_COLLATION =
+      protected static Collation UTF8_LCASE_COLLATION =
         new CollationSpecUTF8Binary(CaseSensitivity.LCASE).buildCollation();
 
       private final int collationId;
@@ -376,8 +376,8 @@ private static int collationNameToId(String originalName, String collationName)
           throws SparkException {
         if (UTF8_BINARY_COLLATION.collationName.equals(collationName)) {
           return UTF8_BINARY_COLLATION_ID;
-        } else if (UTF8_BINARY_LCASE_COLLATION.collationName.equals(collationName)) {
-          return UTF8_BINARY_LCASE_COLLATION_ID;
+        } else if (UTF8_LCASE_COLLATION.collationName.equals(collationName)) {
+          return UTF8_LCASE_COLLATION_ID;
         } else {
           // Throw exception with original (before case conversion) collation name.
           throw collationInvalidNameException(originalName);
@@ -409,7 +409,7 @@ protected Collation buildCollation() {
             /* supportsLowercaseEquality = */ false);
         } else {
           return new Collation(
-            "UTF8_BINARY_LCASE",
+            "UTF8_LCASE",
             PROVIDER_SPARK,
             null,
             CollationAwareUTF8String::compareLowerCase,
@@ -633,7 +633,7 @@ private static CollationSpecICU fromCollationId(int collationId) {
         // Locale ID remains after removing all other specifiers.
         int localeId = collationId;
         // Verify locale ID is valid against `ICULocaleNames` array.
-        assert (localeId < ICULocaleNames.length);
+        assert(localeId >= 0 && localeId < ICULocaleNames.length);
         CaseSensitivity caseSensitivity = CaseSensitivity.values()[caseSensitivityOrdinal];
         AccentSensitivity accentSensitivity = AccentSensitivity.values()[accentSensitivityOrdinal];
         String locale = ICULocaleNames[localeId];
@@ -728,8 +728,8 @@ public CollationIdentifier identifier() {
 
   public static final int UTF8_BINARY_COLLATION_ID =
     Collation.CollationSpecUTF8Binary.UTF8_BINARY_COLLATION_ID;
-  public static final int UTF8_BINARY_LCASE_COLLATION_ID =
-    Collation.CollationSpecUTF8Binary.UTF8_BINARY_LCASE_COLLATION_ID;
+  public static final int UTF8_LCASE_COLLATION_ID =
+    Collation.CollationSpecUTF8Binary.UTF8_LCASE_COLLATION_ID;
   public static final int UNICODE_COLLATION_ID =
     Collation.CollationSpecICU.UNICODE_COLLATION_ID;
   public static final int UNICODE_CI_COLLATION_ID =
@@ -766,7 +766,7 @@ public static StringSearch getStringSearch(
   /**
    * Returns a collation-unaware StringSearch object for the given pattern and target strings.
    * While this object does not respect collation, it can be used to find occurrences of the pattern
-   * in the target string for UTF8_BINARY or UTF8_BINARY_LCASE (if arguments are lowercased).
+   * in the target string for UTF8_BINARY or UTF8_LCASE (if arguments are lowercased).
    */
   public static StringSearch getStringSearch(
           final UTF8String targetUTF8String,

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -743,7 +743,7 @@ public static UTF8String execLowercase(
 
   public static boolean supportsLowercaseRegex(final int collationId) {
     // for regex, only Unicode case-insensitive matching is possible,
-    // so UTF8_BINARY_LCASE is treated as UNICODE_CI in this context
+    // so UTF8_LCASE is treated as UNICODE_CI in this context
     return CollationFactory.fetchCollation(collationId).supportsLowercaseEquality;
   }