[SPARK-49355][SQL] levenshtein should check whether the collation values of all parameter types are the same

### What changes were proposed in this pull request?
Following the same principle as apache#47825 (review), the parameters (`left` and `right`) of the `levenshtein` expression are `collation-dependent` rather than `collation-unaware`.

### Why are the changes needed?
To strengthen the parameter data type check of the `levenshtein` expression and avoid potential issues.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- Added test cases to `collations.sql`.
- Pass GA.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#47847 from panbingkun/SPARK-49355.

Authored-by: panbingkun <panbingkun@baidu.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
IvanK-db · Sep 19, 2024 · 7afbb67 · 7afbb67
1 parent 4539505
commit 7afbb67
Show file tree

Hide file tree

Showing 4 changed files with 196 additions and 0 deletions.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CollationTypeCasts.scala
@@ -100,6 +100,11 @@ object CollationTypeCasts extends TypeCoercionRule {
       val Seq(newStr, newDelimiter) = collateToSingleType(Seq(str, delimiter))
       stringSplitSQL.withNewChildren(Seq(newStr, newDelimiter))
 
+    case levenshtein: Levenshtein =>
+      val Seq(left, right, threshold @ _*) = levenshtein.children
+      val Seq(newLeft, newRight) = collateToSingleType(Seq(left, right))
+      levenshtein.withNewChildren(Seq(newLeft, newRight) ++ threshold)
+
     case otherExpr @ (
       _: In | _: InSubquery | _: CreateArray | _: ArrayJoin | _: Concat | _: Greatest | _: Least |
       _: Coalesce | _: ArrayContains | _: ArrayExcept | _: ConcatWs | _: Mask | _: StringReplace |

diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/collations.sql.out
@@ -493,3 +493,86 @@ drop table t5
 -- !query analysis
 DropTable false, false
 +- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t5
+
+
+-- !query
+create table t6 (utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase, threshold int) using parquet
+-- !query analysis
+CreateDataSourceTableCommand `spark_catalog`.`default`.`t6`, false
+
+
+-- !query
+insert into t6 values('kitten', 'sitting', 2)
+-- !query analysis
+InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t6, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t6], Append, `spark_catalog`.`default`.`t6`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t6), [utf8_binary, utf8_lcase, threshold]
++- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x, cast(col3#x as int) AS threshold#x]
+   +- LocalRelation [col1#x, col2#x, col3#x]
+
+
+-- !query
+select levenshtein(utf8_binary, utf8_lcase) from t6
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.IMPLICIT",
+  "sqlState" : "42P21"
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase) from t6
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.EXPLICIT",
+  "sqlState" : "42P21",
+  "messageParameters" : {
+    "explicitTypes" : "`string`, `string collate UTF8_LCASE`"
+  }
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary) from t6
+-- !query analysis
+Project [levenshtein(collate(utf8_binary#x, utf8_binary), collate(utf8_lcase#x, utf8_binary), None) AS levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary))#x]
++- SubqueryAlias spark_catalog.default.t6
+   +- Relation spark_catalog.default.t6[utf8_binary#x,utf8_lcase#x,threshold#x] parquet
+
+
+-- !query
+select levenshtein(utf8_binary, utf8_lcase, threshold) from t6
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.IMPLICIT",
+  "sqlState" : "42P21"
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase, threshold) from t6
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.EXPLICIT",
+  "sqlState" : "42P21",
+  "messageParameters" : {
+    "explicitTypes" : "`string`, `string collate UTF8_LCASE`"
+  }
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary, threshold) from t6
+-- !query analysis
+Project [levenshtein(collate(utf8_binary#x, utf8_binary), collate(utf8_lcase#x, utf8_binary), Some(threshold#x)) AS levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary), threshold)#x]
++- SubqueryAlias spark_catalog.default.t6
+   +- Relation spark_catalog.default.t6[utf8_binary#x,utf8_lcase#x,threshold#x] parquet
+
+
+-- !query
+drop table t6
+-- !query analysis
+DropTable false, false
++- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t6
diff --git a/sql/core/src/test/resources/sql-tests/inputs/collations.sql b/sql/core/src/test/resources/sql-tests/inputs/collations.sql
@@ -112,3 +112,17 @@ select split_part(str collate utf8_binary, delimiter collate utf8_lcase, partNum
 select split_part(str collate utf8_binary, delimiter collate utf8_binary, partNum) from t5;
 
 drop table t5;
+
+-- create table for levenshtein
+create table t6 (utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase, threshold int) using parquet;
+
+insert into t6 values('kitten', 'sitting', 2);
+
+select levenshtein(utf8_binary, utf8_lcase) from t6;
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase) from t6;
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary) from t6;
+select levenshtein(utf8_binary, utf8_lcase, threshold) from t6;
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase, threshold) from t6;
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary, threshold) from t6;
+
+drop table t6;
diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -545,3 +545,97 @@ drop table t5
 struct<>
 -- !query output
 
+
+
+-- !query
+create table t6 (utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase, threshold int) using parquet
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+insert into t6 values('kitten', 'sitting', 2)
+-- !query schema
+struct<>
+-- !query output
+
+
+
+-- !query
+select levenshtein(utf8_binary, utf8_lcase) from t6
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.IMPLICIT",
+  "sqlState" : "42P21"
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase) from t6
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.EXPLICIT",
+  "sqlState" : "42P21",
+  "messageParameters" : {
+    "explicitTypes" : "`string`, `string collate UTF8_LCASE`"
+  }
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary) from t6
+-- !query schema
+struct<levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary)):int>
+-- !query output
+3
+
+
+-- !query
+select levenshtein(utf8_binary, utf8_lcase, threshold) from t6
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.IMPLICIT",
+  "sqlState" : "42P21"
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase, threshold) from t6
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "COLLATION_MISMATCH.EXPLICIT",
+  "sqlState" : "42P21",
+  "messageParameters" : {
+    "explicitTypes" : "`string`, `string collate UTF8_LCASE`"
+  }
+}
+
+
+-- !query
+select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary, threshold) from t6
+-- !query schema
+struct<levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary), threshold):int>
+-- !query output
+-1
+
+
+-- !query
+drop table t6
+-- !query schema
+struct<>
+-- !query output
+