Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-49911][SQL] Fix semantic of support binary equality #48472

Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,24 @@ public static class Collation {
* expressions, as this particular collation is not supported by the external ICU library.
*/
public final boolean supportsLowercaseEquality;

jovanpavl-db marked this conversation as resolved.
Show resolved Hide resolved
/**
* Support for Space Trimming implies that, based on the specifier (for now only right trim),
* leading, trailing or both spaces are removed from the input string before comparison.
*/
public final boolean supportsSpaceTrimming;

/**
* Indicates whether the collation's base type is UTF8 binary. Note that currently only the
* collations Utf8_Binary and Utf8_Binary_RTRIM are considered to be of UTF8 binary type.
*/
public final boolean isUtf8BinaryType;

/**
* Indicates whether the collation's base type is UTF8 lcase. Note that currently only the
* collations Utf8_Lcase and Utf8_Lcase_RTRIM are considered to be of UTF8 lcase type.
*/
public final boolean isUtf8LcaseType;

public Collation(
String collationName,
String provider,
Expand All @@ -168,24 +179,22 @@ public Collation(
String version,
ToLongFunction<UTF8String> hashFunction,
BiFunction<UTF8String, UTF8String, Boolean> equalsFunction,
boolean supportsBinaryEquality,
boolean supportsBinaryOrdering,
boolean supportsLowercaseEquality,
boolean isUtf8BinaryType,
boolean isUtf8LcaseType,
boolean supportsSpaceTrimming) {
this.collationName = collationName;
this.provider = provider;
this.collator = collator;
this.comparator = comparator;
this.version = version;
this.hashFunction = hashFunction;
this.supportsBinaryEquality = supportsBinaryEquality;
this.supportsBinaryOrdering = supportsBinaryOrdering;
this.supportsLowercaseEquality = supportsLowercaseEquality;
this.isUtf8BinaryType = isUtf8BinaryType;
this.isUtf8LcaseType = isUtf8LcaseType;
this.equalsFunction = equalsFunction;
this.supportsSpaceTrimming = supportsSpaceTrimming;

// De Morgan's Law to check supportsBinaryOrdering => supportsBinaryEquality
assert(!supportsBinaryOrdering || supportsBinaryEquality);
this.supportsBinaryEquality = !supportsSpaceTrimming && isUtf8BinaryType;
this.supportsBinaryOrdering = !supportsSpaceTrimming && isUtf8BinaryType;
this.supportsLowercaseEquality = !supportsSpaceTrimming && isUtf8LcaseType;
jovanpavl-db marked this conversation as resolved.
Show resolved Hide resolved
// No Collation can simultaneously support binary equality and lowercase equality
assert(!supportsBinaryEquality || !supportsLowercaseEquality);

Expand Down Expand Up @@ -567,9 +576,8 @@ protected Collation buildCollation() {
"1.0",
hashFunction,
equalsFunction,
/* supportsBinaryEquality = */ true,
/* supportsBinaryOrdering = */ true,
/* supportsLowercaseEquality = */ false,
/* isUtf8BinaryType = */ true,
/* isUtf8LcaseType = */ false,
spaceTrimming != SpaceTrimming.NONE);
} else {
Comparator<UTF8String> comparator;
Expand All @@ -595,9 +603,8 @@ protected Collation buildCollation() {
"1.0",
hashFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* supportsBinaryEquality = */ false,
/* supportsBinaryOrdering = */ false,
/* supportsLowercaseEquality = */ true,
/* isUtf8BinaryType = */ false,
/* isUtf8LcaseType = */ true,
spaceTrimming != SpaceTrimming.NONE);
}
}
Expand Down Expand Up @@ -982,9 +989,8 @@ protected Collation buildCollation() {
ICU_COLLATOR_VERSION,
hashFunction,
(s1, s2) -> comparator.compare(s1, s2) == 0,
/* supportsBinaryEquality = */ false,
/* supportsBinaryOrdering = */ false,
/* supportsLowercaseEquality = */ false,
/* isUtf8BinaryType = */ false,
/* isUtf8LcaseType = */ false,
spaceTrimming != SpaceTrimming.NONE);
}

Expand Down Expand Up @@ -1191,9 +1197,9 @@ public static UTF8String getCollationKey(UTF8String input, int collationId) {
if (collation.supportsSpaceTrimming) {
input = Collation.CollationSpec.applyTrimmingPolicy(input, collationId);
}
if (collation.supportsBinaryEquality) {
if (collation.isUtf8BinaryType) {
return input;
} else if (collation.supportsLowercaseEquality) {
} else if (collation.isUtf8LcaseType) {
return CollationAwareUTF8String.lowerCaseCodePoints(input);
} else {
CollationKey collationKey = collation.collator.getCollationKey(
Expand All @@ -1207,9 +1213,9 @@ public static byte[] getCollationKeyBytes(UTF8String input, int collationId) {
if (collation.supportsSpaceTrimming) {
input = Collation.CollationSpec.applyTrimmingPolicy(input, collationId);
}
if (collation.supportsBinaryEquality) {
if (collation.isUtf8BinaryType) {
return input.getBytes();
} else if (collation.supportsLowercaseEquality) {
} else if (collation.isUtf8LcaseType) {
return CollationAwareUTF8String.lowerCaseCodePoints(input).getBytes();
} else {
return collation.collator.getCollationKey(
Expand Down
Loading