Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48410][SQL] Fix InitCap expression for UTF8_BINARY_LCASE & ICU collations #46732

Closed
wants to merge 11 commits into from
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,24 @@ public static String lowerCaseCodePoints(final String target) {
return sb.toString();
}

/**
 * Titlecases a UTF8String by decoding it to a java.lang.String, delegating to the
 * String overload of toTitleCase, and re-encoding the result as a UTF8String.
 */
public static UTF8String toTitleCase(final UTF8String target) {
  final String decoded = target.toString();
  final String titleCased = toTitleCase(decoded);
  return UTF8String.fromString(titleCased);
}

/**
 * Converts the input string to titlecase using locale-independent (root) rules.
 *
 * Both the case mapping and the word-boundary detection are pinned to the root locale.
 * The two-argument UCharacter.toTitleCase(String, BreakIterator) overload and the
 * no-argument BreakIterator.getWordInstance() both fall back to the JVM default locale,
 * which would make the result depend on the host configuration (for example, a Turkish
 * default locale titlecases "i" to "İ").
 *
 * @param target the string to convert
 * @return the titlecased string, with word starts determined by a root-locale word iterator
 */
public static String toTitleCase(final String target) {
  // Fully qualified so this method does not depend on the file's import list.
  final java.util.Locale root = java.util.Locale.ROOT;
  return UCharacter.toTitleCase(root, target, BreakIterator.getWordInstance(root));
}

/**
 * Titlecases a UTF8String for the given collation by decoding it to a java.lang.String,
 * delegating to the (String, int) overload of toTitleCase with the same collation id,
 * and re-encoding the result as a UTF8String.
 */
public static UTF8String toTitleCase(final UTF8String target, final int collationId) {
  final String decoded = target.toString();
  final String titleCased = toTitleCase(decoded, collationId);
  return UTF8String.fromString(titleCased);
}

public static String toTitleCase(final String target, final int collationId) {
ULocale locale = CollationFactory.fetchCollation(collationId)
.collator.getLocale(ULocale.ACTUAL_LOCALE);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,10 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
public static class InitCap {
public static UTF8String exec(final UTF8String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return execUTF8(v);
if (collation.supportsBinaryEquality) {
return execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
Expand All @@ -270,25 +272,22 @@ public static UTF8String exec(final UTF8String v, final int collationId) {
public static String genCode(final String v, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.InitCap.exec";
if (collation.supportsBinaryEquality || collation.supportsLowercaseEquality) {
return String.format(expr + "UTF8(%s)", v);
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
return String.format(expr + "ICU(%s, %d)", v, collationId);
}
}

public static UTF8String execUTF8(final UTF8String v) {
public static UTF8String execBinary(final UTF8String v) {
uros-db marked this conversation as resolved.
Show resolved Hide resolved
return v.toLowerCase().toTitleCase();
}

/**
 * InitCap implementation that exec selects for collations reporting
 * supportsLowercaseEquality; delegates to the single-argument
 * CollationAwareUTF8String.toTitleCase.
 */
public static UTF8String execLowercase(final UTF8String v) {
  final UTF8String titleCased = CollationAwareUTF8String.toTitleCase(v);
  return titleCased;
}
public static UTF8String execICU(final UTF8String v, final int collationId) {
return UTF8String.fromString(
CollationAwareUTF8String.toTitleCase(
CollationAwareUTF8String.toLowerCase(
v.toString(),
collationId
),
collationId));
return CollationAwareUTF8String.toTitleCase(v, collationId);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -745,10 +745,48 @@ public void testInitCap() throws SparkException {
assertInitCap("aB 世 de", "UNICODE_CI", "Ab 世 De");
assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe");
// Case-variable character length
assertInitCap("İo", "UTF8_BINARY", "İo");
assertInitCap("İo", "UTF8_BINARY_LCASE", "İo");
assertInitCap("İo", "UNICODE", "İo");
assertInitCap("İo", "UNICODE_CI", "İo");
assertInitCap("İo", "UTF8_BINARY", "I\u0307o");
assertInitCap("İo", "UTF8_BINARY_LCASE", "İo");
assertInitCap("İo", "UNICODE", "İo");
assertInitCap("İo", "UNICODE_CI", "İo");
assertInitCap("i\u0307o", "UTF8_BINARY", "I\u0307o");
assertInitCap("i\u0307o", "UTF8_BINARY_LCASE", "I\u0307o");
assertInitCap("i\u0307o", "UNICODE", "I\u0307o");
assertInitCap("i\u0307o", "UNICODE_CI", "I\u0307o");
// Different possible word boundaries
assertInitCap("a b c", "UTF8_BINARY", "A B C");
assertInitCap("a b c", "UNICODE", "A B C");
assertInitCap("a b c", "UTF8_BINARY_LCASE", "A B C");
assertInitCap("a b c", "UNICODE_CI", "A B C");
assertInitCap("a.b,c", "UTF8_BINARY", "A.b,c");
assertInitCap("a.b,c", "UNICODE", "A.b,C");
assertInitCap("a.b,c", "UTF8_BINARY_LCASE", "A.b,C");
assertInitCap("a.b,c", "UNICODE_CI", "A.b,C");
assertInitCap("a. b-c", "UTF8_BINARY", "A. B-c");
assertInitCap("a. b-c", "UNICODE", "A. B-C");
assertInitCap("a. b-c", "UTF8_BINARY_LCASE", "A. B-C");
assertInitCap("a. b-c", "UNICODE_CI", "A. B-C");
assertInitCap("a?b世c", "UTF8_BINARY", "A?b世c");
assertInitCap("a?b世c", "UNICODE", "A?B世C");
assertInitCap("a?b世c", "UTF8_BINARY_LCASE", "A?B世C");
assertInitCap("a?b世c", "UNICODE_CI", "A?B世C");
// Titlecase characters that are different from uppercase characters
assertInitCap("dzDZDz", "UTF8_BINARY", "Dzdzdz");
assertInitCap("dzDZDz", "UNICODE", "Dzdzdz");
assertInitCap("dzDZDz", "UTF8_BINARY_LCASE", "Dzdzdz");
assertInitCap("dzDZDz", "UNICODE_CI", "Dzdzdz");
assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY", "Džaba Ljubav Njegova");
assertInitCap("džaba Ljubav NJegova", "UNICODE", "Džaba Ljubav Njegova");
assertInitCap("džaba Ljubav NJegova", "UTF8_BINARY_LCASE", "Džaba Ljubav Njegova");
assertInitCap("džaba Ljubav NJegova", "UNICODE_CI", "Džaba Ljubav Njegova");
uros-db marked this conversation as resolved.
Show resolved Hide resolved
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY",
"ß fi ffi ff st Σημερινος Ασημενιος I\u0307ota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UTF8_BINARY_LCASE",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
assertInitCap("ß fi ffi ff st ΣΗΜΕΡΙΝΟΣ ΑΣΗΜΕΝΙΟΣ İOTA", "UNICODE_CI",
"Ss Fi Ffi Ff St Σημερινος Ασημενιος İota");
}

private void assertStringInstr(String string, String substring, String collationName,
Expand Down