Skip to content

Commit

Permalink
Use ICU
Browse files Browse the repository at this point in the history
  • Loading branch information
uros-db committed Jun 20, 2024
1 parent 714699b commit 194177d
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -206,21 +206,22 @@ public static boolean execICU(final UTF8String l, final UTF8String r,
}

public static class Upper {
public static UTF8String exec(final UTF8String v, final int collationId) {
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(v);
return useICU ? execBinaryICU(v) : execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
public static String genCode(final String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Upper.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
String funcName = useICU ? "BinaryICU" : "Binary";
return String.format(expr + "%s(%s)", funcName, v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
Expand All @@ -230,6 +231,9 @@ public static String genCode(final String v, final int collationId) {
public static UTF8String execBinary(final UTF8String v) {
return v.toUpperCase();
}
public static UTF8String execBinaryICU(final UTF8String v) {
return CollationAwareUTF8String.toUpperCase(v);
}
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toUpperCase(v);
}
Expand All @@ -239,21 +243,22 @@ public static UTF8String execICU(final UTF8String v, final int collationId) {
}

public static class Lower {
public static UTF8String exec(final UTF8String v, final int collationId) {
public static UTF8String exec(final UTF8String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(v);
return useICU ? execBinaryICU(v) : execBinary(v);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(v);
} else {
return execICU(v, collationId);
}
}
public static String genCode(final String v, final int collationId) {
public static String genCode(final String v, final int collationId, boolean useICU) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Lower.exec";
String expr = "CollationSupport.Lower.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s)", v);
String funcName = useICU ? "BinaryICU" : "Binary";
return String.format(expr + "%s(%s)", funcName, v);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s)", v);
} else {
Expand All @@ -263,6 +268,9 @@ public static String genCode(final String v, final int collationId) {
public static UTF8String execBinary(final UTF8String v) {
return v.toLowerCase();
}
public static UTF8String execBinaryICU(final UTF8String v) {
return CollationAwareUTF8String.toLowerCase(v);
}
public static UTF8String execLowercase(final UTF8String v) {
return CollationAwareUTF8String.toLowerCase(v);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -606,7 +606,8 @@ private void assertUpper(String target, String collationName, String expected)
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId));
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, true));
assertEquals(expected_utf8, CollationSupport.Upper.exec(target_utf8, collationId, false));
}

@Test
Expand Down Expand Up @@ -660,7 +661,8 @@ private void assertLower(String target, String collationName, String expected)
UTF8String target_utf8 = UTF8String.fromString(target);
UTF8String expected_utf8 = UTF8String.fromString(expected);
int collationId = CollationFactory.collationNameToId(collationName);
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId));
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, true));
assertEquals(expected_utf8, CollationSupport.Lower.exec(target_utf8, collationId, false));
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -455,12 +455,15 @@ case class Upper(child: Expression)

final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId

override def convert(v: UTF8String): UTF8String = CollationSupport.Upper.exec(v, collationId)
private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)

override def convert(v: UTF8String): UTF8String =
CollationSupport.Upper.exec(v, collationId, useICU)

final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId))
defineCodeGen(ctx, ev, c => CollationSupport.Upper.genCode(c, collationId, useICU))
}

override protected def withNewChildInternal(newChild: Expression): Upper = copy(child = newChild)
Expand All @@ -483,12 +486,15 @@ case class Lower(child: Expression)

final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId

override def convert(v: UTF8String): UTF8String = CollationSupport.Lower.exec(v, collationId)
private final lazy val useICU = SQLConf.get.getConf(SQLConf.ICU_CASE_MAPPINGS_ENABLED)

override def convert(v: UTF8String): UTF8String =
CollationSupport.Lower.exec(v, collationId, useICU)

final override val nodePatterns: Seq[TreePattern] = Seq(UPPER_OR_LOWER)

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId))
defineCodeGen(ctx, ev, c => CollationSupport.Lower.genCode(c, collationId, useICU))
}

override def prettyName: String =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,14 @@ object SQLConf {
_ => Map())
.createWithDefault("UTF8_BINARY")

val ICU_CASE_MAPPINGS_ENABLED =
buildConf("spark.sql.icu.caseMappings.enabled")
.doc("When enabled we use the ICU library (instead of the JVM) to implement case mappings" +
" for strings under UTF8_BINARY collation.")
.version("4.0.0")
.booleanConf
.createWithDefault(true)

val FETCH_SHUFFLE_BLOCKS_IN_BATCH =
buildConf("spark.sql.adaptive.fetchShuffleBlocksInBatch")
.internal()
Expand Down

0 comments on commit 194177d

Please sign in to comment.