Skip to content

Commit

Permalink
[SPARK-48681][SQL] Use ICU in Lower/Upper expressions for UTF8_BINARY…
Browse files Browse the repository at this point in the history
… strings

### What changes were proposed in this pull request?
Update `Lower` & `Upper` Spark expressions to use ICU case mappings for UTF8_BINARY collation, instead of the currently used JVM case mappings. This behaviour is put under the `ICU_CASE_MAPPINGS_ENABLED` flag in SQLConf, which is `true` by default.

### Why are the changes needed?
To keep the consistency between collations - all collations shouls use ICU-based case mappings, including the UTF8_BINARY collation.

### Does this PR introduce _any_ user-facing change?
Yes, the behaviour of `lower` & `upper` string functions for UTF8_BINARY will now rely on ICU-based case mappings. However, by turning the `ICU_CASE_MAPPINGS_ENABLED` flag off, users can get the old JVM-based case mappings. Note that the difference between the two is really subtle.

### How was this patch tested?
Existing tests, with extended `CollationSupport` unit tests for Lower/Upper to verify both ICU and JVM behaviour.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #47043 from uros-db/change-lower-upper.

Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
uros-db authored and cloud-fan committed Jun 24, 2024
1 parent 4663b84 commit a7dc020
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -206,21 +206,22 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
}

public static class Upper {
public static UTF8String exec(final UTF8String v, final int collationId) {
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(v);
return useICU ? execBinaryICU(v) : execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
public static String genCode(final String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
String funcName = useICU ? "BinaryICU" : "Binary";
return String.format(expr + "%s(%s)", funcName, v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
Expand All @@ -230,6 +231,9 @@ public static String genCode(final String v, final int collationId) {
public static UTF8String execBinary(final UTF8String v) {
return v.toUpperCase();
}
public static UTF8String execBinaryICU(final UTF8String v) {
return CollationAwareUTF8String.toUpperCase(v);
}
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toUpperCase(v);
}
Expand All @@ -239,21 +243,22 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
}

public static class Lower {
public static UTF8String exec(final UTF8String v, final int collationId) {
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(v);
return useICU ? execBinaryICU(v) : execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
public static String genCode(final String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Lower.exec";
String expr = "CollationSupport.Lower.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
String funcName = useICU ? "BinaryICU" : "Binary";
return String.format(expr + "%s(%s)", funcName, v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
Expand All @@ -263,6 +268,9 @@ public static String genCode(final String v, final int collationId) {
public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase();
}
public static UTF8String execBinaryICU(final UTF8String v) {
return CollationAwareUTF8String.toLowerCase(v);
}
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toLowerCase(v);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,11 @@ private void assertUpper(String target, String collationName, String expected)
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId));
// Testing the new ICU-based implementation of the Upper function.
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, true));
// Testing the old JVM-based implementation of the Upper function.
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, false));
// Note: results should be the same in these tests for both ICU and JVM-based implementations.
}

@Test
Expand Down Expand Up @@ -660,7 +664,11 @@ private void assertLower(String target, String collationName, String expected)
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId));
// Testing the new ICU-based implementation of the Lower function.
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, true));
// Testing the old JVM-based implementation of the Lower function.
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, false));
// Note: results should be the same in these tests for both ICU and JVM-based implementations.
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -455,12 +455,16 @@ case class Upper(child: Expression)

final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId

override def convert(v: UTF8String): UTF8String = CollationSupport.Upper.exec(v, collationId)
// Flag to indicate whether to use ICU instead of JVM case mappings for UTF8_BINARY collation.
private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)

override def convert(v: UTF8String): UTF8String =
CollationSupport.Upper.exec(v, collationId, useICU)

final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId))
defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId, useICU))
}

override protected def withNewChildInternal(newChild: Expression): Upper = copy(child = newChild)
Expand All @@ -483,12 +487,16 @@ case class Lower(child: Expression)

final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId

override def convert(v: UTF8String): UTF8String = CollationSupport.Lower.exec(v, collationId)
// Flag to indicate whether to use ICU instead of JVM case mappings for UTF8_BINARY collation.
private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)

override def convert(v: UTF8String): UTF8String =
CollationSupport.Lower.exec(v, collationId, useICU)

final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId))
defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId, useICU))
}

override def prettyName: String =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,14 @@ object SQLConf {
_ => Map())
.createWithDefault("UTF8_BINARY")

val ICU_CASE_MAPPINGS_ENABLED =
buildConf("spark.sql.icu.caseMappings.enabled")
.doc("When enabled we use the ICU library (instead of the JVM) to implement case mappings" +
" for strings under UTF8_BINARY collation.")
.version("4.0.0")
.booleanConf
.createWithDefault(true)

val FETCH_SHUFFLE_BLOCKS_IN_BATCH =
buildConf("spark.sql.adaptive.fetchShuffleBlocksInBatch")
.internal()
Expand Down

0 comments on commit a7dc020

Please sign in to comment.