Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48576][SQL] Rename UTF8_BINARY_LCASE to UTF8_LCASE #46924

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ public class CollationAwareUTF8String {
/**
* Returns whether the target string starts with the specified prefix, starting from the
* specified position (0-based index referring to character position in UTF8String), with respect
* to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased
* to the UTF8_LCASE collation. The method assumes that the prefix is already lowercased
* prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
* same prefix string.
*
* @param target the string to be searched in
* @param lowercasePattern the string to be searched for
* @param startPos the start position for searching (in the target string)
* @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
* @return whether the target string starts with the specified prefix in UTF8_LCASE
*/
public static boolean lowercaseMatchFrom(
final UTF8String target,
Expand All @@ -63,7 +63,7 @@ public static boolean lowercaseMatchFrom(
/**
* Returns the length of the substring of the target string that starts with the specified
* prefix, starting from the specified position (0-based index referring to character position
* in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* in UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
* prefix is already lowercased. The method only considers the part of target string that
* starts from the specified (inclusive) position (that is, the method does not look at UTF8
* characters of the target string before the specified start position). If the prefix is not found,
Expand All @@ -90,7 +90,7 @@ private static int lowercaseMatchLengthFrom(
/**
* Returns the position of the first occurrence of the pattern string in the target string,
* starting from the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
* pattern string is already lowercased prior to call. If the pattern is not found,
* MATCH_NOT_FOUND is returned.
*
Expand All @@ -115,7 +115,7 @@ private static int lowercaseFind(
/**
* Returns whether the target string ends with the specified suffix, ending at the specified
* position (0-based index referring to character position in UTF8String), with respect to the
* UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior
* UTF8_LCASE collation. The method assumes that the suffix is already lowercased prior
* to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
* suffix string.
*
Expand All @@ -134,7 +134,7 @@ public static boolean lowercaseMatchUntil(
/**
* Returns the length of the substring of the target string that ends with the specified
* suffix, ending at the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
* suffix is already lowercased. The method only considers the part of target string that ends
* at the specified (non-inclusive) position (that is, the method does not look at UTF8
* characters of the target string at or after position `endPos`). If the suffix is not found,
Expand All @@ -161,7 +161,7 @@ private static int lowercaseMatchLengthUntil(
/**
* Returns the position of the last occurrence of the pattern string in the target string,
* ending at the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
* UTF8String), with respect to the UTF8_LCASE collation. The method assumes that the
* pattern string is already lowercased prior to call. If the pattern is not found,
* MATCH_NOT_FOUND is returned.
*
Expand All @@ -184,7 +184,7 @@ private static int lowercaseRFind(
}

/**
* Lowercase UTF8String comparison used for UTF8_BINARY_LCASE collation. While the default
* Lowercase UTF8String comparison used for UTF8_LCASE collation. While the default
* UTF8String comparison is equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()), this
* method uses code points to compare the strings in a case-insensitive manner using ICU rules,
* as well as handling special rules for one-to-many case mappings (see: lowerCaseCodePoints).
Expand Down Expand Up @@ -489,7 +489,7 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
/**
* Returns the position of the first occurrence of the pattern string in the target string,
* starting from the specified position (0-based index referring to character position in
* UTF8String), with respect to the UTF8_BINARY_LCASE collation. If the pattern is not found,
* UTF8String), with respect to the UTF8_LCASE collation. If the pattern is not found,
* MATCH_NOT_FOUND is returned.
*
* @param target the string to be searched in
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ public static class Collation {
/**
* Support for Lowercase Equality implies that it is possible to check equality on
* byte by byte level, but only after calling "UTF8String.toLowerCase" on both arguments.
* This allows custom collation support for UTF8_BINARY_LCASE collation in various Spark
* This allows custom collation support for UTF8_LCASE collation in various Spark
* expressions, as this particular collation is not supported by the external ICU library.
*/
public final boolean supportsLowercaseEquality;
Expand Down Expand Up @@ -220,7 +220,7 @@ public Collation(
* ---
* Some illustrative examples of collation name to ID mapping:
* - UTF8_BINARY -> 0
* - UTF8_BINARY_LCASE -> 1
* - UTF8_LCASE -> 1
* - UNICODE -> 0x20000000
* - UNICODE_AI -> 0x20010000
* - UNICODE_CI -> 0x20020000
Expand Down Expand Up @@ -326,7 +326,7 @@ protected static SparkException collationInvalidNameException(String collationNa
private static int collationNameToId(String collationName) throws SparkException {
// Collation names provided by user are treated as case-insensitive.
String collationNameUpper = collationName.toUpperCase();
if (collationNameUpper.startsWith("UTF8_BINARY")) {
if (collationNameUpper.startsWith("UTF8_")) {
return CollationSpecUTF8Binary.collationNameToId(collationName, collationNameUpper);
} else {
return CollationSpecICU.collationNameToId(collationName, collationNameUpper);
Expand All @@ -339,7 +339,7 @@ private static int collationNameToId(String collationName) throws SparkException
private static class CollationSpecUTF8Binary extends CollationSpec {

/**
* Bit 0 in collation ID having value 0 for plain UTF8_BINARY and 1 for UTF8_BINARY_LCASE
* Bit 0 in collation ID having value 0 for plain UTF8_BINARY and 1 for UTF8_LCASE
* collation.
*/
private enum CaseSensitivity {
Expand All @@ -358,11 +358,11 @@ private enum CaseSensitivity {

private static final int UTF8_BINARY_COLLATION_ID =
new CollationSpecUTF8Binary(CaseSensitivity.UNSPECIFIED).collationId;
private static final int UTF8_BINARY_LCASE_COLLATION_ID =
private static final int UTF8_LCASE_COLLATION_ID =
new CollationSpecUTF8Binary(CaseSensitivity.LCASE).collationId;
protected static Collation UTF8_BINARY_COLLATION =
new CollationSpecUTF8Binary(CaseSensitivity.UNSPECIFIED).buildCollation();
protected static Collation UTF8_BINARY_LCASE_COLLATION =
protected static Collation UTF8_LCASE_COLLATION =
new CollationSpecUTF8Binary(CaseSensitivity.LCASE).buildCollation();

private final int collationId;
Expand All @@ -376,8 +376,8 @@ private static int collationNameToId(String originalName, String collationName)
throws SparkException {
if (UTF8_BINARY_COLLATION.collationName.equals(collationName)) {
return UTF8_BINARY_COLLATION_ID;
} else if (UTF8_BINARY_LCASE_COLLATION.collationName.equals(collationName)) {
return UTF8_BINARY_LCASE_COLLATION_ID;
} else if (UTF8_LCASE_COLLATION.collationName.equals(collationName)) {
return UTF8_LCASE_COLLATION_ID;
} else {
// Throw exception with original (before case conversion) collation name.
throw collationInvalidNameException(originalName);
Expand Down Expand Up @@ -409,7 +409,7 @@ protected Collation buildCollation() {
/* supportsLowercaseEquality = */ false);
} else {
return new Collation(
"UTF8_BINARY_LCASE",
"UTF8_LCASE",
PROVIDER_SPARK,
null,
CollationAwareUTF8String::compareLowerCase,
Expand Down Expand Up @@ -633,7 +633,7 @@ private static CollationSpecICU fromCollationId(int collationId) {
// Locale ID remains after removing all other specifiers.
int localeId = collationId;
// Verify locale ID is valid against `ICULocaleNames` array.
assert (localeId < ICULocaleNames.length);
assert(localeId >= 0 && localeId < ICULocaleNames.length);
CaseSensitivity caseSensitivity = CaseSensitivity.values()[caseSensitivityOrdinal];
AccentSensitivity accentSensitivity = AccentSensitivity.values()[accentSensitivityOrdinal];
String locale = ICULocaleNames[localeId];
Expand Down Expand Up @@ -728,8 +728,8 @@ public CollationIdentifier identifier() {

public static final int UTF8_BINARY_COLLATION_ID =
Collation.CollationSpecUTF8Binary.UTF8_BINARY_COLLATION_ID;
public static final int UTF8_BINARY_LCASE_COLLATION_ID =
Collation.CollationSpecUTF8Binary.UTF8_BINARY_LCASE_COLLATION_ID;
public static final int UTF8_LCASE_COLLATION_ID =
Collation.CollationSpecUTF8Binary.UTF8_LCASE_COLLATION_ID;
public static final int UNICODE_COLLATION_ID =
Collation.CollationSpecICU.UNICODE_COLLATION_ID;
public static final int UNICODE_CI_COLLATION_ID =
Expand Down Expand Up @@ -766,7 +766,7 @@ public static StringSearch getStringSearch(
/**
* Returns a collation-unaware StringSearch object for the given pattern and target strings.
* While this object does not respect collation, it can be used to find occurrences of the pattern
* in the target string for UTF8_BINARY or UTF8_BINARY_LCASE (if arguments are lowercased).
* in the target string for UTF8_BINARY or UTF8_LCASE (if arguments are lowercased).
*/
public static StringSearch getStringSearch(
final UTF8String targetUTF8String,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ public static UTF8String execLowercase(

/**
 * Reports whether regex expressions should use lowercase-based (case-insensitive) matching
 * for the collation with the given ID.
 *
 * @param collationId ID of the collation, looked up via CollationFactory.fetchCollation
 * @return the supportsLowercaseEquality flag of the fetched collation
 */
public static boolean supportsLowercaseRegex(final int collationId) {
// for regex, only Unicode case-insensitive matching is possible,
// so UTF8_BINARY_LCASE is treated as UNICODE_CI in this context
// so UTF8_LCASE is treated as UNICODE_CI in this context
return CollationFactory.fetchCollation(collationId).supportsLowercaseEquality;
}

Expand Down
Loading