[SPARK-49334][SQL] str_to_map should check whether the collation values of all parameter types are the same

### What changes were proposed in this pull request?
In the implementation of the expression `str_to_map`, only the collation of the first parameter, `text`, is extracted and used in the subsequent processing, as shown here:
https://github.com/apache/spark/blob/7f1a0c62a03c0d6498503987c4855327c0b6cae1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L588

https://github.com/apache/spark/blob/7f1a0c62a03c0d6498503987c4855327c0b6cae1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L594-L595

https://github.com/apache/spark/blob/7f1a0c62a03c0d6498503987c4855327c0b6cae1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L600-L601

I propose to add a consistency check on the collation values of all parameters to avoid potential issues.
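
To illustrate the gap, here is a minimal standalone sketch (hypothetical types and names, not Spark's actual classes): resolving the collation from only the first argument hides a mismatch, while checking all arguments surfaces it.

```scala
// Hypothetical model, not Spark internals: the issue and the proposed check.
case class CollatedString(collation: String)

object StrToMapCollationSketch {
  // Old-style resolution (sketch): only the first argument is consulted.
  def collationFromFirstArg(args: Seq[CollatedString]): String =
    args.head.collation

  // Proposed resolution (sketch): all arguments must share one collation.
  def collationChecked(args: Seq[CollatedString]): Either[String, String] = {
    val collations = args.map(_.collation).distinct
    if (collations.size == 1) Right(collations.head)
    else Left(s"COLLATION_MISMATCH: ${collations.mkString(", ")}")
  }

  def main(argv: Array[String]): Unit = {
    // text, pairDelim, keyValueDelim; pairDelim carries a different collation.
    val args = Seq(
      CollatedString("UTF8_BINARY"),
      CollatedString("UTF8_LCASE"),
      CollatedString("UTF8_BINARY"))
    println(collationFromFirstArg(args)) // UTF8_BINARY: the mismatch goes unnoticed
    println(collationChecked(args))      // Left(COLLATION_MISMATCH: UTF8_BINARY, UTF8_LCASE)
  }
}
```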

### Why are the changes needed?
This strengthens the parameter data type check of the expression `str_to_map` and avoids potential collation-mismatch issues.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- Added new unit tests.
- Passed GA (GitHub Actions).

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#47825 from panbingkun/SPARK-49334.

Authored-by: panbingkun <panbingkun@baidu.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
panbingkun authored and attilapiros committed Oct 4, 2024
1 parent 3ecdf34 commit 511a38a
Showing 4 changed files with 123 additions and 1 deletion.
@@ -97,7 +97,7 @@ object CollationTypeCasts extends TypeCoercionRule {
_: ArrayIntersect | _: ArrayPosition | _: ArrayRemove | _: ArrayUnion | _: ArraysOverlap |
_: Contains | _: EndsWith | _: EqualNullSafe | _: EqualTo | _: FindInSet | _: GreaterThan |
_: GreaterThanOrEqual | _: LessThan | _: LessThanOrEqual | _: StartsWith | _: StringInstr |
- _: ToNumber | _: TryToNumber) =>
+ _: ToNumber | _: TryToNumber | _: StringToMap) =>
val newChildren = collateToSingleType(otherExpr.children)
otherExpr.withNewChildren(newChildren)
}
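
For context, a rough standalone sketch of what routing `StringToMap` through this shared coercion conceptually does (an assumed model, not the actual `collateToSingleType` implementation): implicitly collated children must already agree, a single explicit collation is applied to all children, and conflicting explicit collations are rejected, mirroring the `COLLATION_MISMATCH.IMPLICIT` and `COLLATION_MISMATCH.EXPLICIT` errors in the tests below.

```scala
// Standalone sketch with assumed types; not Spark's CollationTypeCasts code.
sealed trait Collation { def name: String }
case class ImplicitCollation(name: String) extends Collation
case class ExplicitCollation(name: String) extends Collation

object CollateToSingleTypeSketch {
  def collateToSingleType(children: Seq[Collation]): Either[String, Seq[Collation]] =
    children.collect { case ExplicitCollation(n) => n }.distinct match {
      case Seq(single) =>
        // Exactly one explicit collation: cast every child to it.
        Right(children.map(_ => ExplicitCollation(single)))
      case Seq() =>
        // No explicit collations: the implicit ones must already agree.
        if (children.map(_.name).distinct.size <= 1) Right(children)
        else Left("COLLATION_MISMATCH.IMPLICIT")
      case conflicting =>
        // Two or more different explicit collations cannot be reconciled.
        Left(s"COLLATION_MISMATCH.EXPLICIT: ${conflicting.mkString(", ")}")
    }

  def main(argv: Array[String]): Unit = {
    println(collateToSingleType(Seq(
      ImplicitCollation("UTF8_BINARY"), ImplicitCollation("UTF8_LCASE"))))
    // Left(COLLATION_MISMATCH.IMPLICIT)
    println(collateToSingleType(Seq(
      ExplicitCollation("UTF8_BINARY"), ImplicitCollation("UTF8_LCASE"))))
    // Right(List(ExplicitCollation(UTF8_BINARY), ExplicitCollation(UTF8_BINARY)))
  }
}
```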
@@ -389,3 +389,55 @@ select 'I' collate tr_ci = 'ı'
-- !query analysis
Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I, tr_ci) = ı)#x]
+- OneRowRelation


-- !query
create table t4 (text string collate utf8_binary, pairDelim string collate utf8_lcase, keyValueDelim string collate utf8_binary) using parquet
-- !query analysis
CreateDataSourceTableCommand `spark_catalog`.`default`.`t4`, false


-- !query
insert into t4 values('a:1,b:2,c:3', ',', ':')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t4, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t4], Append, `spark_catalog`.`default`.`t4`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t4), [text, pairDelim, keyValueDelim]
+- Project [cast(col1#x as string) AS text#x, cast(col2#x as string collate UTF8_LCASE) AS pairDelim#x, cast(col3#x as string) AS keyValueDelim#x]
+- LocalRelation [col1#x, col2#x, col3#x]


-- !query
select str_to_map(text, pairDelim, keyValueDelim) from t4
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_lcase, keyValueDelim collate utf8_binary) from t4
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_binary, keyValueDelim collate utf8_binary) from t4
-- !query analysis
Project [str_to_map(collate(text#x, utf8_binary), collate(pairDelim#x, utf8_binary), collate(keyValueDelim#x, utf8_binary)) AS str_to_map(collate(text, utf8_binary), collate(pairDelim, utf8_binary), collate(keyValueDelim, utf8_binary))#x]
+- SubqueryAlias spark_catalog.default.t4
+- Relation spark_catalog.default.t4[text#x,pairDelim#x,keyValueDelim#x] parquet


-- !query
drop table t4
-- !query analysis
DropTable false, false
+- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t4
11 changes: 11 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/collations.sql
@@ -90,3 +90,14 @@ select 'a' collate en_ci_ai = 'Å';
select 'Kypper' collate sv < 'Köpfe';
select 'Kypper' collate de > 'Köpfe';
select 'I' collate tr_ci = 'ı';

-- create table for str_to_map
create table t4 (text string collate utf8_binary, pairDelim string collate utf8_lcase, keyValueDelim string collate utf8_binary) using parquet;

insert into t4 values('a:1,b:2,c:3', ',', ':');

select str_to_map(text, pairDelim, keyValueDelim) from t4;
select str_to_map(text collate utf8_binary, pairDelim collate utf8_lcase, keyValueDelim collate utf8_binary) from t4;
select str_to_map(text collate utf8_binary, pairDelim collate utf8_binary, keyValueDelim collate utf8_binary) from t4;

drop table t4;
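
As a usage note, the same check can be exercised from Scala rather than the SQL golden files; this is a hypothetical example assuming a local `SparkSession` on a build with collation support, and the error is expected to carry `COLLATION_MISMATCH.EXPLICIT` (SQLSTATE 42P21) as in the results file below.

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

object StrToMapCollationExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("str_to_map-collation-check")
      .master("local[*]")
      .getOrCreate()
    try {
      // Explicitly mixed collations across the three arguments should now be
      // rejected at analysis time instead of silently using the first one.
      spark.sql(
        """select str_to_map(
          |  'a:1,b:2,c:3' collate utf8_binary,
          |  ','           collate utf8_lcase,
          |  ':'           collate utf8_binary)""".stripMargin).show()
    } catch {
      case e: AnalysisException =>
        // Expected to report COLLATION_MISMATCH.EXPLICIT.
        println(e.getMessage)
    } finally {
      spark.stop()
    }
  }
}
```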
59 changes: 59 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -427,3 +427,62 @@ select 'I' collate tr_ci = 'ı'
struct<(collate(I, tr_ci) = ı):boolean>
-- !query output
true


-- !query
create table t4 (text string collate utf8_binary, pairDelim string collate utf8_lcase, keyValueDelim string collate utf8_binary) using parquet
-- !query schema
struct<>
-- !query output



-- !query
insert into t4 values('a:1,b:2,c:3', ',', ':')
-- !query schema
struct<>
-- !query output



-- !query
select str_to_map(text, pairDelim, keyValueDelim) from t4
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_lcase, keyValueDelim collate utf8_binary) from t4
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_binary, keyValueDelim collate utf8_binary) from t4
-- !query schema
struct<str_to_map(collate(text, utf8_binary), collate(pairDelim, utf8_binary), collate(keyValueDelim, utf8_binary)):map<string,string>>
-- !query output
{"a":"1","b":"2","c":"3"}


-- !query
drop table t4
-- !query schema
struct<>
-- !query output
