[SPARK-49334][SQL] str_to_map should check whether the collation values of all parameter types are the same

### What changes were proposed in this pull request?
In the implementation of the expression `str_to_map`, only the collation of the first parameter, `text`, is extracted and used in the subsequent processing, as shown here:
https://github.com/apache/spark/blob/7f1a0c62a03c0d6498503987c4855327c0b6cae1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L588

https://github.com/apache/spark/blob/7f1a0c62a03c0d6498503987c4855327c0b6cae1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L594-L595

https://github.com/apache/spark/blob/7f1a0c62a03c0d6498503987c4855327c0b6cae1/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala#L600-L601

I propose to add a consistency check on the collation values of all parameters to avoid potential issues.
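
To illustrate the gap, here is a minimal standalone sketch (hypothetical types and names, not Spark's actual classes): resolving the collation from only the first argument hides a mismatch, while checking all arguments surfaces it.

```scala
// Hypothetical model, not Spark internals: the issue and the proposed check.
case class CollatedString(collation: String)

object StrToMapCollationSketch {
  // Old-style resolution (sketch): only the first argument is consulted.
  def collationFromFirstArg(args: Seq[CollatedString]): String =
    args.head.collation

  // Proposed resolution (sketch): all arguments must share one collation.
  def collationChecked(args: Seq[CollatedString]): Either[String, String] = {
    val collations = args.map(_.collation).distinct
    if (collations.size == 1) Right(collations.head)
    else Left(s"COLLATION_MISMATCH: ${collations.mkString(", ")}")
  }

  def main(argv: Array[String]): Unit = {
    // text, pairDelim, keyValueDelim; pairDelim carries a different collation.
    val args = Seq(
      CollatedString("UTF8_BINARY"),
      CollatedString("UTF8_LCASE"),
      CollatedString("UTF8_BINARY"))
    println(collationFromFirstArg(args)) // UTF8_BINARY: the mismatch goes unnoticed
    println(collationChecked(args))      // Left(COLLATION_MISMATCH: UTF8_BINARY, UTF8_LCASE)
  }
}
```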

### Why are the changes needed?
This strengthens the parameter data type check of the expression `str_to_map` and avoids potential collation-mismatch issues.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
- Added new unit tests.
- Passed GA (GitHub Actions).

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes apache#47825 from panbingkun/SPARK-49334.

Authored-by: panbingkun <panbingkun@baidu.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
panbingkun authored and attilapiros committed Oct 4, 2024
1 parent 3ecdf34 commit 511a38a
Showing 4 changed files with 123 additions and 1 deletion.
@@ -97,7 +97,7 @@ object CollationTypeCasts extends TypeCoercionRule {
_: ArrayIntersect | _: ArrayPosition | _: ArrayRemove | _: ArrayUnion | _: ArraysOverlap |
_: Contains | _: EndsWith | _: EqualNullSafe | _: EqualTo | _: FindInSet | _: GreaterThan |
_: GreaterThanOrEqual | _: LessThan | _: LessThanOrEqual | _: StartsWith | _: StringInstr |
- _: ToNumber | _: TryToNumber) =>
+ _: ToNumber | _: TryToNumber | _: StringToMap) =>
val newChildren = collateToSingleType(otherExpr.children)
otherExpr.withNewChildren(newChildren)
}
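
For context, a rough standalone sketch of what routing `StringToMap` through this shared coercion conceptually does (an assumed model, not the actual `collateToSingleType` implementation): implicitly collated children must already agree, a single explicit collation is applied to all children, and conflicting explicit collations are rejected, mirroring the `COLLATION_MISMATCH.IMPLICIT` and `COLLATION_MISMATCH.EXPLICIT` errors in the tests below.

```scala
// Standalone sketch with assumed types; not Spark's CollationTypeCasts code.
sealed trait Collation { def name: String }
case class ImplicitCollation(name: String) extends Collation
case class ExplicitCollation(name: String) extends Collation

object CollateToSingleTypeSketch {
  def collateToSingleType(children: Seq[Collation]): Either[String, Seq[Collation]] =
    children.collect { case ExplicitCollation(n) => n }.distinct match {
      case Seq(single) =>
        // Exactly one explicit collation: cast every child to it.
        Right(children.map(_ => ExplicitCollation(single)))
      case Seq() =>
        // No explicit collations: the implicit ones must already agree.
        if (children.map(_.name).distinct.size <= 1) Right(children)
        else Left("COLLATION_MISMATCH.IMPLICIT")
      case conflicting =>
        // Two or more different explicit collations cannot be reconciled.
        Left(s"COLLATION_MISMATCH.EXPLICIT: ${conflicting.mkString(", ")}")
    }

  def main(argv: Array[String]): Unit = {
    println(collateToSingleType(Seq(
      ImplicitCollation("UTF8_BINARY"), ImplicitCollation("UTF8_LCASE"))))
    // Left(COLLATION_MISMATCH.IMPLICIT)
    println(collateToSingleType(Seq(
      ExplicitCollation("UTF8_BINARY"), ImplicitCollation("UTF8_LCASE"))))
    // Right(List(ExplicitCollation(UTF8_BINARY), ExplicitCollation(UTF8_BINARY)))
  }
}
```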
@@ -389,3 +389,55 @@ select 'I' collate tr_ci = 'ı'
-- !query analysis
Project [(collate(I, tr_ci) = cast(ı as string collate tr_CI)) AS (collate(I, tr_ci) = ı)#x]
+- OneRowRelation


-- !query
create table t4 (text string collate utf8_binary, pairDelim string collate utf8_lcase, keyValueDelim string collate utf8_binary) using parquet
-- !query analysis
CreateDataSourceTableCommand `spark_catalog`.`default`.`t4`, false


-- !query
insert into t4 values('a:1,b:2,c:3', ',', ':')
-- !query analysis
InsertIntoHadoopFsRelationCommand file:[not included in comparison]/{warehouse_dir}/t4, false, Parquet, [path=file:[not included in comparison]/{warehouse_dir}/t4], Append, `spark_catalog`.`default`.`t4`, org.apache.spark.sql.execution.datasources.InMemoryFileIndex(file:[not included in comparison]/{warehouse_dir}/t4), [text, pairDelim, keyValueDelim]
+- Project [cast(col1#x as string) AS text#x, cast(col2#x as string collate UTF8_LCASE) AS pairDelim#x, cast(col3#x as string) AS keyValueDelim#x]
+- LocalRelation [col1#x, col2#x, col3#x]


-- !query
select str_to_map(text, pairDelim, keyValueDelim) from t4
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_lcase, keyValueDelim collate utf8_binary) from t4
-- !query analysis
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_binary, keyValueDelim collate utf8_binary) from t4
-- !query analysis
Project [str_to_map(collate(text#x, utf8_binary), collate(pairDelim#x, utf8_binary), collate(keyValueDelim#x, utf8_binary)) AS str_to_map(collate(text, utf8_binary), collate(pairDelim, utf8_binary), collate(keyValueDelim, utf8_binary))#x]
+- SubqueryAlias spark_catalog.default.t4
+- Relation spark_catalog.default.t4[text#x,pairDelim#x,keyValueDelim#x] parquet


-- !query
drop table t4
-- !query analysis
DropTable false, false
+- ResolvedIdentifier V2SessionCatalog(spark_catalog), default.t4
11 changes: 11 additions & 0 deletions sql/core/src/test/resources/sql-tests/inputs/collations.sql
@@ -90,3 +90,14 @@ select 'a' collate en_ci_ai = 'Å';
select 'Kypper' collate sv < 'Köpfe';
select 'Kypper' collate de > 'Köpfe';
select 'I' collate tr_ci = 'ı';

-- create table for str_to_map
create table t4 (text string collate utf8_binary, pairDelim string collate utf8_lcase, keyValueDelim string collate utf8_binary) using parquet;

insert into t4 values('a:1,b:2,c:3', ',', ':');

select str_to_map(text, pairDelim, keyValueDelim) from t4;
select str_to_map(text collate utf8_binary, pairDelim collate utf8_lcase, keyValueDelim collate utf8_binary) from t4;
select str_to_map(text collate utf8_binary, pairDelim collate utf8_binary, keyValueDelim collate utf8_binary) from t4;

drop table t4;
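
As a usage note, the same check can be exercised from Scala rather than the SQL golden files; this is a hypothetical example assuming a local `SparkSession` on a build with collation support, and the error is expected to carry `COLLATION_MISMATCH.EXPLICIT` (SQLSTATE 42P21) as in the results file below.

```scala
import org.apache.spark.sql.{AnalysisException, SparkSession}

object StrToMapCollationExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("str_to_map-collation-check")
      .master("local[*]")
      .getOrCreate()
    try {
      // Explicitly mixed collations across the three arguments should now be
      // rejected at analysis time instead of silently using the first one.
      spark.sql(
        """select str_to_map(
          |  'a:1,b:2,c:3' collate utf8_binary,
          |  ','           collate utf8_lcase,
          |  ':'           collate utf8_binary)""".stripMargin).show()
    } catch {
      case e: AnalysisException =>
        // Expected to report COLLATION_MISMATCH.EXPLICIT.
        println(e.getMessage)
    } finally {
      spark.stop()
    }
  }
}
```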
59 changes: 59 additions & 0 deletions sql/core/src/test/resources/sql-tests/results/collations.sql.out
@@ -427,3 +427,62 @@ select 'I' collate tr_ci = 'ı'
struct<(collate(I, tr_ci) = ı):boolean>
-- !query output
true


-- !query
create table t4 (text string collate utf8_binary, pairDelim string collate utf8_lcase, keyValueDelim string collate utf8_binary) using parquet
-- !query schema
struct<>
-- !query output



-- !query
insert into t4 values('a:1,b:2,c:3', ',', ':')
-- !query schema
struct<>
-- !query output



-- !query
select str_to_map(text, pairDelim, keyValueDelim) from t4
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.IMPLICIT",
"sqlState" : "42P21"
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_lcase, keyValueDelim collate utf8_binary) from t4
-- !query schema
struct<>
-- !query output
org.apache.spark.sql.AnalysisException
{
"errorClass" : "COLLATION_MISMATCH.EXPLICIT",
"sqlState" : "42P21",
"messageParameters" : {
"explicitTypes" : "`string`, `string collate UTF8_LCASE`"
}
}


-- !query
select str_to_map(text collate utf8_binary, pairDelim collate utf8_binary, keyValueDelim collate utf8_binary) from t4
-- !query schema
struct<str_to_map(collate(text, utf8_binary), collate(pairDelim, utf8_binary), collate(keyValueDelim, utf8_binary)):map<string,string>>
-- !query output
{"a":"1","b":"2","c":"3"}


-- !query
drop table t4
-- !query schema
struct<>
-- !query output
