Skip to content

Commit

Permalink
[SPARK-49355][SQL] levenshtein should check whether the collation
Browse files Browse the repository at this point in the history
… values of all parameter types are the same

### What changes were proposed in this pull request?
Following the same principle as apache#47825 (review), the parameters (`left` and `right`) of the `levenshtein` expression are `collation-dependent` rather than `collation-unaware`.

### Why are the changes needed?
Strengthen the parameter data type check of the `levenshtein` expression to avoid potential issues.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- Add test cases to `collations.sql`.
- Pass GA.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#47847 from panbingkun/SPARK-49355.

Authored-by: panbingkun <panbingkun@baidu.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
  • Loading branch information
panbingkun authored and IvanK-db committed Sep 19, 2024
1 parent 4539505 commit 7afbb67
Show file tree
Hide file tree
Showing 4 changed files with 196 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,11 @@ object CollationTypeCasts extends TypeCoercionRule {
val Seq(newStr, newDelimiter) = collateToSingleType(Seq(str, delimiter))
stringSplitSQL.withNewChildren(Seq(newStr, newDelimiter))

// Levenshtein: only the first two children (the string operands) are passed
// through collateToSingleType; any remaining children (bound here as
// `threshold`, zero or more) are re-attached unchanged.
// NOTE(review): collateToSingleType is defined elsewhere; presumably it is what
// raises COLLATION_MISMATCH when the two operands disagree - confirm.
case levenshtein: Levenshtein =>
val Seq(left, right, threshold @ _*) = levenshtein.children
val Seq(newLeft, newRight) = collateToSingleType(Seq(left, right))
levenshtein.withNewChildren(Seq(newLeft, newRight) ++ threshold)

case otherExpr @ (
_: In | _: InSubquery | _: CreateArray | _: ArrayJoin | _: Concat | _: Greatest | _: Least |
_: Coalesce | _: ArrayContains | _: ArrayExcept | _: ConcatWs | _: Mask | _: StringReplace |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -493,3 +493,86 @@ drop table t5
-- !query analysis
DropTable false, false
+- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t5


-- !query
create table t6 (utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase, threshold int) using parquet
-- !query analysis
CreateDataSourceTableCommand `spark_catalog`.`default`.`t6`, false


-- !query
insert into t6 values('kitten', 'sitting', 2)
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t6, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t6], Append, `spark_catalog`.`default`.`t6`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t6), [utf8_binary, utf8_lcase, threshold]
+- Project [cast(col1#x as string) AS utf8_binary#x, cast(col2#x as string collate UTF8_LCASE) AS utf8_lcase#x, cast(col3#x as int) AS threshold#x]
+- LocalRelation [col1#x, col2#x, col3#x]


-- !query
select levenshtein(utf8_binary, utf8_lcase) from t6
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase) from t6
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary) from t6
-- !query analysis
Project [levenshtein(collate(utf8_binary#x, utf8_binary), collate(utf8_lcase#x, utf8_binary), None) AS levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary))#x]
+- SubqueryAlias spark_catalog.default.t6
+- Relation spark_catalog.default.t6[utf8_binary#x,utf8_lcase#x,threshold#x] parquet


-- !query
select levenshtein(utf8_binary, utf8_lcase, threshold) from t6
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase, threshold) from t6
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary, threshold) from t6
-- !query analysis
Project [levenshtein(collate(utf8_binary#x, utf8_binary), collate(utf8_lcase#x, utf8_binary), Some(threshold#x)) AS levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary), threshold)#x]
+- SubqueryAlias spark_catalog.default.t6
+- Relation spark_catalog.default.t6[utf8_binary#x,utf8_lcase#x,threshold#x] parquet


-- !query
drop table t6
-- !query analysis
DropTable false, false
+- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t6
14 changes: 14 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/collations.sql
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,17 @@ select split_part(str collate utf8_binary, delimiter collate utf8_lcase, partNum
select split_part(str collate utf8_binary, delimiter collate utf8_binary, partNum) from t5;

drop table t5;

-- create table for levenshtein
-- t6 has one utf8_binary string column, one utf8_lcase string column, and an
-- int threshold column, so the two string operands carry different collations.
create table t6 (utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase, threshold int) using parquet;

insert into t6 values('kitten', 'sitting', 2);

-- Two-argument form.
-- Columns with different collations and no explicit collate: expected to fail with COLLATION_MISMATCH.IMPLICIT.
select levenshtein(utf8_binary, utf8_lcase) from t6;
-- Explicit but different collations on each operand: expected to fail with COLLATION_MISMATCH.EXPLICIT.
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase) from t6;
-- Both operands explicitly collated as utf8_binary: expected to succeed (recorded result is 3).
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary) from t6;
-- Three-argument form (with threshold): same three cases as above.
-- Expected to fail with COLLATION_MISMATCH.IMPLICIT.
select levenshtein(utf8_binary, utf8_lcase, threshold) from t6;
-- Expected to fail with COLLATION_MISMATCH.EXPLICIT.
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase, threshold) from t6;
-- Expected to succeed (recorded result is -1 with threshold 2).
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary, threshold) from t6;

drop table t6;
94 changes: 94 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/collations.sql.out
Original file line number Diff line number Diff line change
Expand Up @@ -545,3 +545,97 @@ drop table t5
struct<>
-- !query output



-- !query
create table t6 (utf8_binary string collate utf8_binary, utf8_lcase string collate utf8_lcase, threshold int) using parquet
-- !query schema
struct<>
-- !query output



-- !query
insert into t6 values('kitten', 'sitting', 2)
-- !query schema
struct<>
-- !query output



-- !query
select levenshtein(utf8_binary, utf8_lcase) from t6
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase) from t6
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary) from t6
-- !query schema
struct<levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary)):int>
-- !query output
3


-- !query
select levenshtein(utf8_binary, utf8_lcase, threshold) from t6
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_lcase, threshold) from t6
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select levenshtein(utf8_binary collate utf8_binary, utf8_lcase collate utf8_binary, threshold) from t6
-- !query schema
struct<levenshtein(collate(utf8_binary, utf8_binary), collate(utf8_lcase, utf8_binary), threshold):int>
-- !query output
-1


-- !query
drop table t6
-- !query schema
struct<>
-- !query output

0 comments on commit 7afbb67

Please sign in to comment.