From b4624bf4be28974e8df175670f89cf96858b7b81 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 25 Apr 2024 12:53:17 +0800 Subject: [PATCH] [SPARK-47414][SQL] Lowercase collation support for regexp expressions ### What changes were proposed in this pull request? Introduce collation awareness for regexp expressions: like, ilike, like all, not like all, like any, not like any, rlike, split, regexp_replace, regexp_extract, regexp_extract_all, regexp_count, regexp_substr, regexp_instr. Note: collation support is only enabled for binary (UTF8_BINARY, UNICODE) & lowercase (UTF8_BINARY_LCASE) collations. ### Why are the changes needed? Add collation support for built-in regexp functions in Spark. ### Does this PR introduce _any_ user-facing change? Yes, users should now be able to use collated strings within arguments for built-in regexp functions: like, ilike, like all, not like all, like any, not like any, rlike, split, regexp_replace, regexp_extract, regexp_extract_all, regexp_count, regexp_substr, regexp_instr. ### How was this patch tested? Unit regexp expression tests and end-to-end SQL tests. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46077 from uros-db/SPARK-47414. 
Authored-by: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Signed-off-by: Wenchen Fan --- .../sql/catalyst/util/CollationSupport.java | 21 +- .../analysis/CollationTypeCasts.scala | 7 +- .../expressions/regexpExpressions.scala | 92 ++++-- .../CollationRegexpExpressionsSuite.scala | 170 ++++++++++ ...te.scala => CollationSQLRegexpSuite.scala} | 292 ++++++++++++------ 5 files changed, 456 insertions(+), 126 deletions(-) create mode 100644 sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala rename sql/core/src/test/scala/org/apache/spark/sql/{CollationRegexpExpressionsSuite.scala => CollationSQLRegexpSuite.scala} (50%) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 3e4973f5c187e..70a3f5bd61362 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -23,6 +23,8 @@ import org.apache.spark.unsafe.types.UTF8String; +import java.util.regex.Pattern; + /** * Static entry point for collation-aware expressions (StringExpressions, RegexpExpressions, and * other expressions that require custom collation support), as well as private utility methods for @@ -310,7 +312,24 @@ public static int execICU(final UTF8String string, final UTF8String substring, * Collation-aware regexp expressions. */ - // TODO: Add more collation-aware regexp expressions. 
+ public static boolean supportsLowercaseRegex(final int collationId) { + // for regex, only Unicode case-insensitive matching is possible, + // so UTF8_BINARY_LCASE is treated as UNICODE_CI in this context + return CollationFactory.fetchCollation(collationId).supportsLowercaseEquality; + } + + private static final int lowercaseRegexFlags = Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE; + public static int collationAwareRegexFlags(final int collationId) { + return supportsLowercaseRegex(collationId) ? lowercaseRegexFlags : 0; + } + + private static final UTF8String lowercaseRegexPrefix = UTF8String.fromString("(?ui)"); + public static UTF8String lowercaseRegex(final UTF8String regex) { + return UTF8String.concat(lowercaseRegexPrefix, regex); + } + public static UTF8String collationAwareRegex(final UTF8String regex, final int collationId) { + return supportsLowercaseRegex(collationId) ? lowercaseRegex(regex) : regex; + } /** * Other collation-aware expressions. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala index c6232a870dff7..7179d69f75bc8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala @@ -22,7 +22,7 @@ import javax.annotation.Nullable import scala.annotation.tailrec import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType, haveSameType} -import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, Greatest, If, In, InSubquery, Least, Literal, Overlay, StringLPad, StringRPad} +import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Elt, Expression, 
Greatest, If, In, InSubquery, Least, Literal, Overlay, RegExpReplace, StringLPad, StringRPad} import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @@ -52,6 +52,11 @@ object CollationTypeCasts extends TypeCoercionRule { overlayExpr.withNewChildren(collateToSingleType(Seq(overlayExpr.input, overlayExpr.replace)) ++ Seq(overlayExpr.pos, overlayExpr.len)) + case regExpReplace: RegExpReplace => + val Seq(subject, rep) = collateToSingleType(Seq(regExpReplace.subject, regExpReplace.rep)) + val newChildren = Seq(subject, regExpReplace.regexp, rep, regExpReplace.pos) + regExpReplace.withNewChildren(newChildren) + case stringPadExpr @ (_: StringRPad | _: StringLPad) => val Seq(str, len, pad) = stringPadExpr.children val Seq(newStr, newPad) = collateToSingleType(Seq(str, pad)) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala index b33de303b5d55..297c709c6d7d9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala @@ -33,8 +33,9 @@ import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.catalyst.trees.BinaryLike import org.apache.spark.sql.catalyst.trees.TreePattern.{LIKE_FAMLIY, REGEXP_EXTRACT_FAMILY, REGEXP_REPLACE, TreePattern} -import org.apache.spark.sql.catalyst.util.{GenericArrayData, StringUtils} +import org.apache.spark.sql.catalyst.util.{CollationSupport, GenericArrayData, StringUtils} import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.types.{StringTypeAnyCollation, StringTypeBinaryLcase} import 
org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @@ -44,7 +45,11 @@ abstract class StringRegexExpression extends BinaryExpression def escape(v: String): String def matches(regex: Pattern, str: String): Boolean - override def inputTypes: Seq[DataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) + + final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId + final lazy val collationRegexFlags: Int = CollationSupport.collationAwareRegexFlags(collationId) // try cache foldable pattern private lazy val cache: Pattern = right match { @@ -58,7 +63,7 @@ abstract class StringRegexExpression extends BinaryExpression } else { // Let it raise exception if couldn't compile the regex string try { - Pattern.compile(escape(str)) + Pattern.compile(escape(str), collationRegexFlags) } catch { case e: PatternSyntaxException => throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e) @@ -158,7 +163,8 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) val regexStr = StringEscapeUtils.escapeJava(escape(rVal.asInstanceOf[UTF8String].toString())) val pattern = ctx.addMutableState(patternClass, "patternLike", - v => s"""$v = $patternClass.compile("$regexStr");""") + v => + s"""$v = $patternClass.compile("$regexStr", $collationRegexFlags);""".stripMargin) // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. 
val eval = left.genCode(ctx) @@ -186,7 +192,7 @@ case class Like(left: Expression, right: Expression, escapeChar: Char) s""" String $rightStr = $eval2.toString(); $patternClass $pattern = $patternClass.compile( - $escapeFunc($rightStr, '$escapedEscapeChar')); + $escapeFunc($rightStr, '$escapedEscapeChar'), $collationRegexFlags); ${ev.value} = $pattern.matcher($eval1.toString()).matches(); """ }) @@ -258,7 +264,8 @@ case class ILike( def this(left: Expression, right: Expression) = this(left, right, '\\') - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) override protected def withNewChildrenInternal( newLeft: Expression, newRight: Expression): Expression = { @@ -273,7 +280,9 @@ sealed abstract class MultiLikeBase protected def isNotSpecified: Boolean - override def inputTypes: Seq[DataType] = StringType :: Nil + override def inputTypes: Seq[AbstractDataType] = StringTypeBinaryLcase :: Nil + final lazy val collationId: Int = child.dataType.asInstanceOf[StringType].collationId + final lazy val collationRegexFlags: Int = CollationSupport.collationAwareRegexFlags(collationId) override def nullable: Boolean = true @@ -281,8 +290,8 @@ sealed abstract class MultiLikeBase protected lazy val hasNull: Boolean = patterns.contains(null) - protected lazy val cache = patterns.filterNot(_ == null) - .map(s => Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'))) + protected lazy val cache = patterns.filterNot(_ == null).map(s => + Pattern.compile(StringUtils.escapeLikeRegex(s.toString, '\\'), collationRegexFlags)) protected lazy val matchFunc = if (isNotSpecified) { (p: Pattern, inputValue: String) => !p.matcher(inputValue).matches() @@ -475,7 +484,7 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress val regexStr = StringEscapeUtils.escapeJava(rVal.asInstanceOf[UTF8String].toString()) val pattern = 
ctx.addMutableState(patternClass, "patternRLike", - v => s"""$v = $patternClass.compile("$regexStr");""") + v => s"""$v = $patternClass.compile("$regexStr", $collationRegexFlags);""".stripMargin) // We don't use nullSafeCodeGen here because we don't want to re-evaluate right again. val eval = left.genCode(ctx) @@ -499,7 +508,7 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress nullSafeCodeGen(ctx, ev, (eval1, eval2) => { s""" String $rightStr = $eval2.toString(); - $patternClass $pattern = $patternClass.compile($rightStr); + $patternClass $pattern = $patternClass.compile($rightStr, $collationRegexFlags); ${ev.value} = $pattern.matcher($eval1.toString()).find(0); """ }) @@ -543,17 +552,20 @@ case class RLike(left: Expression, right: Expression) extends StringRegexExpress case class StringSplit(str: Expression, regex: Expression, limit: Expression) extends TernaryExpression with ImplicitCastInputTypes with NullIntolerant { - override def dataType: DataType = ArrayType(StringType, containsNull = false) - override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType) + override def dataType: DataType = ArrayType(str.dataType, containsNull = false) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType) override def first: Expression = str override def second: Expression = regex override def third: Expression = limit + final lazy val collationId: Int = str.dataType.asInstanceOf[StringType].collationId + def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1)) override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = { - val strings = string.asInstanceOf[UTF8String].split( - regex.asInstanceOf[UTF8String], limit.asInstanceOf[Int]) + val pattern = CollationSupport.collationAwareRegex(regex.asInstanceOf[UTF8String], collationId) + val strings = string.asInstanceOf[UTF8String].split(pattern, limit.asInstanceOf[Int]) new 
GenericArrayData(strings.asInstanceOf[Array[Any]]) } @@ -561,7 +573,8 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression) val arrayClass = classOf[GenericArrayData].getName nullSafeCodeGen(ctx, ev, (str, regex, limit) => { // Array in java is covariant, so we don't need to cast UTF8String[] to Object[]. - s"""${ev.value} = new $arrayClass($str.split($regex,$limit));""".stripMargin + s"""${ev.value} = new $arrayClass($str.split( + |CollationSupport.collationAwareRegex($regex, $collationId),$limit));""".stripMargin }) } @@ -658,7 +671,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio override def nullSafeEval(s: Any, p: Any, r: Any, i: Any): Any = { if (!p.equals(lastRegex)) { - val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName) + val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName, collationId) pattern = patternAndRegex._1 lastRegex = patternAndRegex._2 } @@ -683,9 +696,10 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio } } - override def dataType: DataType = StringType + override def dataType: DataType = subject.dataType override def inputTypes: Seq[AbstractDataType] = - Seq(StringType, StringType, StringType, IntegerType) + Seq(StringTypeBinaryLcase, StringTypeAnyCollation, StringTypeBinaryLcase, IntegerType) + final lazy val collationId: Int = subject.dataType.asInstanceOf[StringType].collationId override def prettyName: String = "regexp_replace" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -708,7 +722,7 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio nullSafeCodeGen(ctx, ev, (subject, regexp, rep, pos) => { s""" - ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName)} + ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, collationId)} if (!$rep.equals($termLastReplacementInUTF8)) { // replacement 
string changed $termLastReplacementInUTF8 = $rep.clone(); @@ -771,15 +785,18 @@ abstract class RegExpExtractBase final override val nodePatterns: Seq[TreePattern] = Seq(REGEXP_EXTRACT_FAMILY) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation, IntegerType) override def first: Expression = subject override def second: Expression = regexp override def third: Expression = idx + final lazy val collationId: Int = subject.dataType.asInstanceOf[StringType].collationId + protected def getLastMatcher(s: Any, p: Any): Matcher = { if (p != lastRegex) { // regex value changed - val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName) + val patternAndRegex = RegExpUtils.getPatternAndLastRegex(p, prettyName, collationId) pattern = patternAndRegex._1 lastRegex = patternAndRegex._2 } @@ -848,7 +865,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio } } - override def dataType: DataType = StringType + override def dataType: DataType = subject.dataType override def prettyName: String = "regexp_extract" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -863,7 +880,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { s""" - ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName)} + ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, collationId)} if ($matcher.find()) { java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); $classNameRegExpExtractBase.checkGroupIndex("$prettyName", $matchResult.groupCount(), $idx); @@ -947,7 +964,7 @@ case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expres new GenericArrayData(matchResults.toArray.asInstanceOf[Array[Any]]) } - override def dataType: DataType = 
ArrayType(StringType) + override def dataType: DataType = ArrayType(subject.dataType) override def prettyName: String = "regexp_extract_all" override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { @@ -963,7 +980,8 @@ case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expres } nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { s""" - | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName)} + | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, + collationId)} | java.util.ArrayList $matchResults = new java.util.ArrayList(); | while ($matcher.find()) { | java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); @@ -1020,7 +1038,8 @@ case class RegExpCount(left: Expression, right: Expression) override def children: Seq[Expression] = Seq(left, right) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): RegExpCount = @@ -1053,13 +1072,14 @@ case class RegExpSubStr(left: Expression, right: Expression) override lazy val replacement: Expression = new NullIf( RegExpExtract(subject = left, regexp = right, idx = Literal(0)), - Literal("")) + Literal.create("", left.dataType)) override def prettyName: String = "regexp_substr" override def children: Seq[Expression] = Seq(left, right) - override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType) + override def inputTypes: Seq[AbstractDataType] = + Seq(StringTypeBinaryLcase, StringTypeAnyCollation) override protected def withNewChildrenInternal( newChildren: IndexedSeq[Expression]): RegExpSubStr = @@ -1127,7 +1147,8 @@ case class RegExpInStr(subject: Expression, regexp: Expression, idx: Expression) s""" |try { | $setEvNotNull - | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, 
prettyName)} + | ${RegExpUtils.initLastMatcherCode(ctx, subject, regexp, matcher, prettyName, + collationId)} | if ($matcher.find()) { | ${ev.value} = $matcher.toMatchResult().start() + 1; | } else { @@ -1151,17 +1172,19 @@ object RegExpUtils { subject: String, regexp: String, matcher: String, - prettyName: String): String = { + prettyName: String, + collationId: Int): String = { val classNamePattern = classOf[Pattern].getCanonicalName val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex") val termPattern = ctx.addMutableState(classNamePattern, "pattern") + val collationRegexFlags = CollationSupport.collationAwareRegexFlags(collationId) s""" |if (!$regexp.equals($termLastRegex)) { | // regex value changed | try { | UTF8String r = $regexp.clone(); - | $termPattern = $classNamePattern.compile(r.toString()); + | $termPattern = $classNamePattern.compile(r.toString(), $collationRegexFlags); | $termLastRegex = r; | } catch (java.util.regex.PatternSyntaxException e) { | throw QueryExecutionErrors.invalidPatternError("$prettyName", e.getPattern(), e); @@ -1171,10 +1194,11 @@ object RegExpUtils { |""".stripMargin } - def getPatternAndLastRegex(p: Any, prettyName: String): (Pattern, UTF8String) = { + def getPatternAndLastRegex(p: Any, prettyName: String, collationId: Int): (Pattern, UTF8String) = + { val r = p.asInstanceOf[UTF8String].clone() val pattern = try { - Pattern.compile(r.toString) + Pattern.compile(r.toString, CollationSupport.collationAwareRegexFlags(collationId)) } catch { case e: PatternSyntaxException => throw QueryExecutionErrors.invalidPatternError(prettyName, e.getPattern, e) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala new file mode 100644 index 0000000000000..cc50aebf589e7 --- /dev/null +++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala @@ -0,0 +1,170 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.dsl.expressions._ +import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.types._ + +class CollationRegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { + + test("Like/ILike/RLike expressions with collated strings") { + case class LikeTestCase[R](l: String, regexLike: String, regexRLike: String, collation: String, + expectedLike: R, expectedILike: R, expectedRLike: R) + val testCases = Seq( + LikeTestCase("AbC", "%AbC%", ".b.", "UTF8_BINARY", true, true, true), + LikeTestCase("AbC", "%ABC%", ".B.", "UTF8_BINARY", false, true, false), + LikeTestCase("AbC", "%abc%", ".b.", "UTF8_BINARY_LCASE", true, true, true), + LikeTestCase("", "", "", "UTF8_BINARY_LCASE", true, true, true), + LikeTestCase("Foo", "", "", "UTF8_BINARY_LCASE", false, false, true), + LikeTestCase("", "%foo%", ".o.", "UTF8_BINARY_LCASE", false, false, false), + LikeTestCase("AbC", "%ABC%", ".B.", 
"UNICODE", false, true, false), + LikeTestCase(null, "%foo%", ".o.", "UNICODE", null, null, null), + LikeTestCase("Foo", null, null, "UNICODE", null, null, null), + LikeTestCase(null, null, null, "UNICODE", null, null, null) + ) + testCases.foreach(t => { + // Like + checkEvaluation(Like( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.regexLike, StringType), '\\'), t.expectedLike) + // ILike + checkEvaluation(ILike( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.regexLike, StringType), '\\').replacement, t.expectedILike) + // RLike + checkEvaluation(RLike( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.regexRLike, StringType)), t.expectedRLike) + }) + } + + test("StringSplit expression with collated strings") { + case class StringSplitTestCase[R](s: String, r: String, collation: String, expected: R) + val testCases = Seq( + StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_BINARY", Seq("1", "2", "3", "")), + StringSplitTestCase("1A2B3C", "[abc]", "UTF8_BINARY", Seq("1A2B3C")), + StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_BINARY_LCASE", Seq("1", "2", "3", "")), + StringSplitTestCase("1A2B3C", "[abc]", "UTF8_BINARY_LCASE", Seq("1", "2", "3", "")), + StringSplitTestCase("1A2B3C", "[1-9]+", "UNICODE", Seq("", "A", "B", "C")), + StringSplitTestCase("", "", "UNICODE", Seq("")), + StringSplitTestCase("1A2B3C", "", "UNICODE", Seq("1", "A", "2", "B", "3", "C")), + StringSplitTestCase("", "[1-9]+", "UNICODE", Seq("")), + StringSplitTestCase(null, "[1-9]+", "UNICODE", null), + StringSplitTestCase("1A2B3C", null, "UNICODE", null), + StringSplitTestCase(null, null, "UNICODE", null) + ) + testCases.foreach(t => { + // StringSplit + checkEvaluation(StringSplit( + Literal.create(t.s, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), -1), t.expected) + }) + } + + 
test("Regexp expressions with collated strings") { + case class RegexpTestCase[R](l: String, r: String, collation: String, + expectedExtract: R, expectedExtractAll: R, expectedCount: R) + val testCases = Seq( + RegexpTestCase("AbC-aBc", ".b.", "UTF8_BINARY", "AbC", Seq("AbC"), 1), + RegexpTestCase("AbC-abc", ".b.", "UTF8_BINARY", "AbC", Seq("AbC", "abc"), 2), + RegexpTestCase("AbC-aBc", ".b.", "UTF8_BINARY_LCASE", "AbC", Seq("AbC", "aBc"), 2), + RegexpTestCase("ABC-abc", ".b.", "UTF8_BINARY_LCASE", "ABC", Seq("ABC", "abc"), 2), + RegexpTestCase("", "", "UTF8_BINARY_LCASE", "", Seq(""), 1), + RegexpTestCase("Foo", "", "UTF8_BINARY_LCASE", "", Seq("", "", "", ""), 4), + RegexpTestCase("", ".o.", "UTF8_BINARY_LCASE", "", Seq(), 0), + RegexpTestCase("Foo", ".O.", "UNICODE", "", Seq(), 0), + RegexpTestCase(null, ".O.", "UNICODE", null, null, null), + RegexpTestCase("Foo", null, "UNICODE", null, null, null), + RegexpTestCase(null, null, "UNICODE", null, null, null) + ) + testCases.foreach(t => { + // RegExpExtract + checkEvaluation(RegExpExtract( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), 0), t.expectedExtract) + // RegExpExtractAll + checkEvaluation(RegExpExtractAll( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), 0), t.expectedExtractAll) + // RegExpCount + checkEvaluation(RegExpCount( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType)).replacement, t.expectedCount) + // RegExpInStr + def expectedInStr(count: Any): Any = count match { + case null => null + case 0 => 0 + case n: Int if n >= 1 => 1 + } + checkEvaluation(RegExpInStr( + Literal.create(t.l, StringType(CollationFactory.collationNameToId(t.collation))), + Literal.create(t.r, StringType), 0), expectedInStr(t.expectedCount)) + }) + } + + test("MultiLikeBase regexp expressions with collated strings") { 
+ val nullStr = Literal.create(null, StringType) + // Supported collations (StringTypeBinaryLcase) + val binaryCollation = StringType(CollationFactory.collationNameToId("UTF8_BINARY")) + val lowercaseCollation = StringType(CollationFactory.collationNameToId("UTF8_BINARY_LCASE")) + val unicodeCollation = StringType(CollationFactory.collationNameToId("UNICODE")) + // LikeAll + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%foo%", "%oo"), true) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%foo%", "%bar%"), false) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%foo%", nullStr), null) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAll("%feo%", nullStr), false) + checkEvaluation(Literal.create(null, unicodeCollation).likeAll("%foo%", "%oo"), null) + // NotLikeAll + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%foo%", "%oo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAll("%goo%", "%bar%"), true) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAll("%foo%", "%oo"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAll("%goo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%foo%", "%oo"), false) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%goo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%foo%", nullStr), false) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAll("%feo%", nullStr), null) + checkEvaluation(Literal.create(null, 
unicodeCollation).notLikeAll("%foo%", "%oo"), null) + // LikeAny + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%goo%", "%hoo"), false) + checkEvaluation(Literal.create("foo", binaryCollation).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAny("%goo%", "%hoo"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%goo%", "%hoo"), false) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%foo%", "%bar%"), true) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%foo%", nullStr), true) + checkEvaluation(Literal.create("foo", unicodeCollation).likeAny("%feo%", nullStr), null) + checkEvaluation(Literal.create(null, unicodeCollation).likeAny("%foo%", "%oo"), null) + // NotLikeAny + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%hoo"), true) + checkEvaluation(Literal.create("foo", binaryCollation).notLikeAny("%foo%", "%oo%"), false) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAny("%Foo%", "%hoo"), true) + checkEvaluation(Literal.create("Foo", lowercaseCollation).notLikeAny("%foo%", "%oo%"), false) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%Foo%", "%hoo"), true) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%foo%", "%oo%"), false) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%foo%", nullStr), null) + checkEvaluation(Literal.create("foo", unicodeCollation).notLikeAny("%feo%", nullStr), true) + checkEvaluation(Literal.create(null, unicodeCollation).notLikeAny("%foo%", "%oo"), null) + } + +} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationRegexpExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala similarity index 50% rename from 
sql/core/src/test/scala/org/apache/spark/sql/CollationRegexpExpressionsSuite.scala rename to sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala index bd7dfe1364d3b..739b000492c55 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationRegexpExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationSQLRegexpSuite.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql -import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.types.{ArrayType, BooleanType, IntegerType, StringType} -class CollationRegexpExpressionsSuite +// scalastyle:off nonascii +class CollationSQLRegexpSuite extends QueryTest with SharedSparkSession with ExpressionEvalHelper { @@ -32,296 +31,409 @@ class CollationRegexpExpressionsSuite // Supported collations case class LikeTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - LikeTestCase("ABC", "%B%", "UTF8_BINARY", true) + LikeTestCase("ABC", "%B%", "UTF8_BINARY", true), + LikeTestCase("AḂC", "%ḃ%", "UTF8_BINARY_LCASE", true), + LikeTestCase("ABC", "%b%", "UNICODE", false) ) testCases.foreach(t => { - val query = s"SELECT like(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT like(collate('${t.l}', '${t.c}'), '${t.r}')" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class LikeTestFail(l: String, r: String, c: String) val failCases = Seq( - LikeTestFail("ABC", "%b%", "UTF8_BINARY_LCASE"), - LikeTestFail("ABC", "%B%", "UNICODE"), LikeTestFail("ABC", "%b%", "UNICODE_CI") ) failCases.foreach(t => { - val query = s"SELECT like(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = 
intercept[AnalysisException] { sql(query) } + val query = s"SELECT like(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support ILike string expression with collation") { // Supported collations case class ILikeTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - ILikeTestCase("ABC", "%b%", "UTF8_BINARY", true) + ILikeTestCase("ABC", "%b%", "UTF8_BINARY", true), + ILikeTestCase("AḂC", "%ḃ%", "UTF8_BINARY_LCASE", true), + ILikeTestCase("ABC", "%b%", "UNICODE", true) ) testCases.foreach(t => { - val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), '${t.r}')" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class ILikeTestFail(l: String, r: String, c: String) val failCases = Seq( - ILikeTestFail("ABC", "%b%", "UTF8_BINARY_LCASE"), - ILikeTestFail("ABC", "%b%", "UNICODE"), ILikeTestFail("ABC", "%b%", "UNICODE_CI") ) failCases.foreach(t => { - val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + val query = s"SELECT ilike(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support LikeAll string expression with collation") { + // Supported collations + case class LikeAllTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + LikeAllTestCase("foo", Seq("%foo%", 
"%oo"), "UTF8_BINARY", true), + LikeAllTestCase("Foo", Seq("%foo%", "%oo"), "UTF8_BINARY_LCASE", true), + LikeAllTestCase("foo", Seq("%foo%", "%bar%"), "UNICODE", false) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ALL ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class LikeAllTestFail(s: String, p: Seq[String], c: String) + val failCases = Seq( + LikeAllTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ALL ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support NotLikeAll string expression with collation") { + // Supported collations + case class NotLikeAllTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + NotLikeAllTestCase("foo", Seq("%foo%", "%oo"), "UTF8_BINARY", false), + NotLikeAllTestCase("Foo", Seq("%foo%", "%oo"), "UTF8_BINARY_LCASE", false), + NotLikeAllTestCase("foo", Seq("%goo%", "%bar%"), "UNICODE", true) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ALL ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class NotLikeAllTestFail(s: String, p: Seq[String], c: String) + val failCases = Seq( + NotLikeAllTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ALL ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + 
assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support LikeAny string expression with collation") { + // Supported collations + case class LikeAnyTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + LikeAnyTestCase("foo", Seq("%foo%", "%bar"), "UTF8_BINARY", true), + LikeAnyTestCase("Foo", Seq("%foo%", "%bar"), "UTF8_BINARY_LCASE", true), + LikeAnyTestCase("foo", Seq("%goo%", "%hoo%"), "UNICODE", false) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ANY ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class LikeAnyTestFail(s: String, p: Seq[String], c: String) + val failCases = Seq( + LikeAnyTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') LIKE ANY ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } + assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") + }) + } + + test("Support NotLikeAny string expression with collation") { + // Supported collations + case class NotLikeAnyTestCase[R](s: String, p: Seq[String], c: String, result: R) + val testCases = Seq( + NotLikeAnyTestCase("foo", Seq("%foo%", "%hoo"), "UTF8_BINARY", true), + NotLikeAnyTestCase("Foo", Seq("%foo%", "%hoo"), "UTF8_BINARY_LCASE", true), + NotLikeAnyTestCase("foo", Seq("%foo%", "%oo%"), "UNICODE", false) + ) + testCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ANY ('${t.p.mkString("','")}')" + // Result & data type + checkAnswer(sql(query), Row(t.result)) + assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) + }) + // Unsupported collations + case class NotLikeAnyTestFail(s: String, p: 
Seq[String], c: String) + val failCases = Seq( + NotLikeAnyTestFail("Foo", Seq("%foo%", "%oo"), "UNICODE_CI") + ) + failCases.foreach(t => { + val query = s"SELECT collate('${t.s}', '${t.c}') NOT LIKE ANY ('${t.p.mkString("','")}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support RLike string expression with collation") { // Supported collations case class RLikeTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - RLikeTestCase("ABC", ".B.", "UTF8_BINARY", true) + RLikeTestCase("ABC", ".B.", "UTF8_BINARY", true), + RLikeTestCase("AḂC", ".ḃ.", "UTF8_BINARY_LCASE", true), + RLikeTestCase("ABC", ".b.", "UNICODE", false) ) testCases.foreach(t => { - val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), '${t.r}')" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(BooleanType)) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class RLikeTestFail(l: String, r: String, c: String) val failCases = Seq( - RLikeTestFail("ABC", ".b.", "UTF8_BINARY_LCASE"), - RLikeTestFail("ABC", ".B.", "UNICODE"), RLikeTestFail("ABC", ".b.", "UNICODE_CI") ) failCases.foreach(t => { - val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + val query = s"SELECT rlike(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support StringSplit string expression with collation") 
{ // Supported collations case class StringSplitTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - StringSplitTestCase("ABC", "[B]", "UTF8_BINARY", Seq("A", "C")) + StringSplitTestCase("ABC", "[B]", "UTF8_BINARY", Seq("A", "C")), + StringSplitTestCase("AḂC", "[ḃ]", "UTF8_BINARY_LCASE", Seq("A", "C")), + StringSplitTestCase("ABC", "[B]", "UNICODE", Seq("A", "C")) ) testCases.foreach(t => { - val query = s"SELECT split(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT split(collate('${t.l}', '${t.c}'), '${t.r}')" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(ArrayType(StringType(t.c)))) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class StringSplitTestFail(l: String, r: String, c: String) val failCases = Seq( - StringSplitTestFail("ABC", "[b]", "UTF8_BINARY_LCASE"), - StringSplitTestFail("ABC", "[B]", "UNICODE"), StringSplitTestFail("ABC", "[b]", "UNICODE_CI") ) failCases.foreach(t => { - val query = s"SELECT split(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + val query = s"SELECT split(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support RegExpReplace string expression with collation") { // Supported collations case class RegExpReplaceTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - RegExpReplaceTestCase("ABCDE", ".C.", "UTF8_BINARY", "AFFFE") + RegExpReplaceTestCase("ABCDE", ".C.", "UTF8_BINARY", "AFFFE"), + RegExpReplaceTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", "AFFFE"), + RegExpReplaceTestCase("ABCDE", ".c.", "UNICODE", "ABCDE") ) testCases.foreach(t 
=> { val query = - s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 'FFF')" + s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', collate('FFF', '${t.c}'))" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) - // TODO: Implicit casting (not currently supported) + // Implicit casting + checkAnswer(sql(s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', 'FFF')"), + Row(t.result)) + checkAnswer(sql(s"SELECT regexp_replace('${t.l}', '${t.r}', collate('FFF', '${t.c}'))"), + Row(t.result)) }) + // Collation mismatch + val collationMismatch = intercept[AnalysisException] { + sql("SELECT regexp_replace(collate('ABCDE','UTF8_BINARY'), '.c.', collate('FFF','UNICODE'))") + } + assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") // Unsupported collations case class RegExpReplaceTestFail(l: String, r: String, c: String) val failCases = Seq( - RegExpReplaceTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpReplaceTestFail("ABCDE", ".C.", "UNICODE"), RegExpReplaceTestFail("ABCDE", ".c.", "UNICODE_CI") ) failCases.foreach(t => { val query = - s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 'FFF')" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + s"SELECT regexp_replace(collate('${t.l}', '${t.c}'), '${t.r}', 'FFF')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support RegExpExtract string expression with collation") { // Supported collations case class RegExpExtractTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - RegExpExtractTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD") + RegExpExtractTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD"), + 
RegExpExtractTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", "BĆD"), + RegExpExtractTestCase("ABCDE", ".c.", "UNICODE", "") ) testCases.foreach(t => { val query = - s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" + s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), '${t.r}', 0)" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class RegExpExtractTestFail(l: String, r: String, c: String) val failCases = Seq( - RegExpExtractTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpExtractTestFail("ABCDE", ".C.", "UNICODE"), RegExpExtractTestFail("ABCDE", ".c.", "UNICODE_CI") ) failCases.foreach(t => { val query = - s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + s"SELECT regexp_extract(collate('${t.l}', '${t.c}'), '${t.r}', 0)" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support RegExpExtractAll string expression with collation") { // Supported collations case class RegExpExtractAllTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - RegExpExtractAllTestCase("ABCDE", ".C.", "UTF8_BINARY", Seq("BCD")) + RegExpExtractAllTestCase("ABCDE", ".C.", "UTF8_BINARY", Seq("BCD")), + RegExpExtractAllTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", Seq("BĆD")), + RegExpExtractAllTestCase("ABCDE", ".c.", "UNICODE", Seq()) ) testCases.foreach(t => { val query = - s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" + s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), '${t.r}', 0)" // Result & data type 
checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(ArrayType(StringType(t.c)))) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class RegExpExtractAllTestFail(l: String, r: String, c: String) val failCases = Seq( - RegExpExtractAllTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpExtractAllTestFail("ABCDE", ".C.", "UNICODE"), RegExpExtractAllTestFail("ABCDE", ".c.", "UNICODE_CI") ) failCases.foreach(t => { val query = - s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'), 0)" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + s"SELECT regexp_extract_all(collate('${t.l}', '${t.c}'), '${t.r}', 0)" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support RegExpCount string expression with collation") { // Supported collations case class RegExpCountTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - RegExpCountTestCase("ABCDE", ".C.", "UTF8_BINARY", 1) + RegExpCountTestCase("ABCDE", ".C.", "UTF8_BINARY", 1), + RegExpCountTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", 1), + RegExpCountTestCase("ABCDE", ".c.", "UNICODE", 0) ) testCases.foreach(t => { - val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), '${t.r}')" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class RegExpCountTestFail(l: String, r: String, c: String) val failCases = Seq( - RegExpCountTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpCountTestFail("ABCDE", ".C.", "UNICODE"), 
RegExpCountTestFail("ABCDE", ".c.", "UNICODE_CI") ) failCases.foreach(t => { - val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + val query = s"SELECT regexp_count(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support RegExpSubStr string expression with collation") { // Supported collations case class RegExpSubStrTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - RegExpSubStrTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD") + RegExpSubStrTestCase("ABCDE", ".C.", "UTF8_BINARY", "BCD"), + RegExpSubStrTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", "BĆD"), + RegExpSubStrTestCase("ABCDE", ".c.", "UNICODE", null) ) testCases.foreach(t => { - val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), '${t.r}')" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(StringType(t.c))) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class RegExpSubStrTestFail(l: String, r: String, c: String) val failCases = Seq( - RegExpSubStrTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpSubStrTestFail("ABCDE", ".C.", "UNICODE"), RegExpSubStrTestFail("ABCDE", ".c.", "UNICODE_CI") ) failCases.foreach(t => { - val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" - val unsupportedCollation = intercept[AnalysisException] { sql(query) } + val query = s"SELECT regexp_substr(collate('${t.l}', '${t.c}'), '${t.r}')" + val unsupportedCollation = intercept[AnalysisException] { + sql(query) + } 
assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } test("Support RegExpInStr string expression with collation") { // Supported collations case class RegExpInStrTestCase[R](l: String, r: String, c: String, result: R) val testCases = Seq( - RegExpInStrTestCase("ABCDE", ".C.", "UTF8_BINARY", 2) + RegExpInStrTestCase("ABCDE", ".C.", "UTF8_BINARY", 2), + RegExpInStrTestCase("ABĆDE", ".ć.", "UTF8_BINARY_LCASE", 2), + RegExpInStrTestCase("ABCDE", ".c.", "UNICODE", 0) ) testCases.foreach(t => { - val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), '${t.r}')" // Result & data type checkAnswer(sql(query), Row(t.result)) assert(sql(query).schema.fields.head.dataType.sameType(IntegerType)) - // TODO: Implicit casting (not currently supported) }) // Unsupported collations case class RegExpInStrTestFail(l: String, r: String, c: String) val failCases = Seq( - RegExpInStrTestFail("ABCDE", ".c.", "UTF8_BINARY_LCASE"), - RegExpInStrTestFail("ABCDE", ".C.", "UNICODE"), RegExpInStrTestFail("ABCDE", ".c.", "UNICODE_CI") ) failCases.foreach(t => { - val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), collate('${t.r}', '${t.c}'))" + val query = s"SELECT regexp_instr(collate('${t.l}', '${t.c}'), '${t.r}')" val unsupportedCollation = intercept[AnalysisException] { sql(query) } assert(unsupportedCollation.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE") }) - // TODO: Collation mismatch (not currently supported) } } - -class CollationRegexpExpressionsANSISuite extends CollationRegexpExpressionsSuite { - override protected def sparkConf: SparkConf = - super.sparkConf.set(SQLConf.ANSI_ENABLED, true) - - // TODO: If needed, add more tests for other regexp expressions (with ANSI mode enabled) - -} +// scalastyle:on nonascii