Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-49911][SQL] Fix semantic of support binary equality #48472

Original file line number Diff line number Diff line change
Expand Up @@ -1363,9 +1363,9 @@ public static UTF8String trimRight(

public static UTF8String[] splitSQL(final UTF8String input, final UTF8String delim,
final int limit, final int collationId) {
if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
if (CollationFactory.fetchCollation(collationId).isUtf8BinaryType) {
return input.split(delim, limit);
} else if (CollationFactory.fetchCollation(collationId).supportsLowercaseEquality) {
} else if (CollationFactory.fetchCollation(collationId).isUtf8LcaseType) {
return lowercaseSplitSQL(input, delim, limit);
} else {
return icuSplitSQL(input, delim, limit, collationId);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,18 @@ public static class Collation {
*/
public final boolean supportsSpaceTrimming;

/**
* Is Utf8 binary type as indicator if collation base type is UTF8 binary. Note currently only
* collations Utf8_Binary and Utf8_Binary_RTRIM are considered as Utf8 binary type.
*/
public final boolean isUtf8BinaryType;

/**
* Is Utf8 lcase type as indicator if collation base type is UTF8 lcase. Note currently only
* collations Utf8_Lcase and Utf8_Lcase_RTRIM are considered as Utf8 Lcase type.
*/
public final boolean isUtf8LcaseType;

public Collation(
String collationName,
String provider,
Expand All @@ -168,24 +180,22 @@ public Collation(
String version,
ToLongFunction<UTF8String> hashFunction,
BiFunction<UTF8String, UTF8String, Boolean> equalsFunction,
boolean supportsBinaryEquality,
boolean supportsBinaryOrdering,
boolean supportsLowercaseEquality,
boolean isUtf8BinaryType,
boolean isUtf8LcaseType,
boolean supportsSpaceTrimming) {
this.collationName = collationName;
this.provider = provider;
this.collator = collator;
this.comparator = comparator;
this.version = version;
this.hashFunction = hashFunction;
this.supportsBinaryEquality = supportsBinaryEquality;
this.supportsBinaryOrdering = supportsBinaryOrdering;
this.supportsLowercaseEquality = supportsLowercaseEquality;
this.isUtf8BinaryType = isUtf8BinaryType;
this.isUtf8LcaseType = isUtf8LcaseType;
this.equalsFunction = equalsFunction;
this.supportsSpaceTrimming = supportsSpaceTrimming;

// De Morgan's Law to check supportsBinaryOrdering => supportsBinaryEquality
assert(!supportsBinaryOrdering || supportsBinaryEquality);
this.supportsBinaryEquality = !supportsSpaceTrimming && isUtf8BinaryType;
this.supportsBinaryOrdering = !supportsSpaceTrimming && isUtf8BinaryType;
this.supportsLowercaseEquality = !supportsSpaceTrimming && isUtf8LcaseType;
jovanpavl-db marked this conversation as resolved.
Show resolved Hide resolved
// No Collation can simultaneously support binary equality and lowercase equality
assert(!supportsBinaryEquality || !supportsLowercaseEquality);

Expand Down Expand Up @@ -567,9 +577,8 @@ protected Collation buildCollation() {
"1.0",
hashFunction,
equalsFunction,
/* supportsBinaryEquality = */ true,
/* supportsBinaryOrdering = */ true,
/* supportsLowercaseEquality = */ false,
/* isUtf8BinaryType = */ true,
/* isUtf8LcaseType = */ false,
spaceTrimming != SpaceTrimming.NONE);
} else {
Comparator<UTF8String> comparator;
Expand All @@ -595,9 +604,8 @@ protected Collation buildCollation() {
"1.0",
hashFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* supportsBinaryEquality = */ false,
/* supportsBinaryOrdering = */ false,
/* supportsLowercaseEquality = */ true,
/* isUtf8BinaryType = */ false,
/* isUtf8LcaseType = */ true,
spaceTrimming != SpaceTrimming.NONE);
}
}
Expand Down Expand Up @@ -982,9 +990,8 @@ protected Collation buildCollation() {
ICU_COLLATOR_VERSION,
hashFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* supportsBinaryEquality = */ false,
/* supportsBinaryOrdering = */ false,
/* supportsLowercaseEquality = */ false,
/* isUtf8BinaryType = */ false,
/* isUtf8LcaseType = */ false,
spaceTrimming != SpaceTrimming.NONE);
}

Expand Down Expand Up @@ -1191,9 +1198,9 @@ public static UTF8String getCollationKey(UTF8String input, int collationId) {
if (collation.supportsSpaceTrimming) {
input = Collation.CollationSpec.applyTrimmingPolicy(input, collationId);
}
if (collation.supportsBinaryEquality) {
if (collation.isUtf8BinaryType) {
return input;
} else if (collation.supportsLowercaseEquality) {
} else if (collation.isUtf8LcaseType) {
return CollationAwareUTF8String.lowerCaseCodePoints(input);
} else {
CollationKey collationKey = collation.collator.getCollationKey(
Expand All @@ -1207,9 +1214,9 @@ public static byte[] getCollationKeyBytes(UTF8String input, int collationId) {
if (collation.supportsSpaceTrimming) {
input = Collation.CollationSpec.applyTrimmingPolicy(input, collationId);
}
if (collation.supportsBinaryEquality) {
if (collation.isUtf8BinaryType) {
return input.getBytes();
} else if (collation.supportsLowercaseEquality) {
} else if (collation.isUtf8LcaseType) {
return CollationAwareUTF8String.lowerCaseCodePoints(input).getBytes();
} else {
return collation.collator.getCollationKey(
Expand Down
Loading