[SPARK-26402][SQL] Accessing nested fields with different cases in ca…

…se insensitive mode ## What changes were proposed in this pull request? GetStructField with different optional names should be semantically equal. We will use this as building block to compare the nested fields used in the plans to be optimized by catalyst optimizer. This PR also fixes a bug below that accessing nested fields with different cases in case insensitive mode will result `AnalysisException`. ``` sql("create table t (s struct<i: Int>) using json") sql("select s.I from t group by s.i") ``` which is currently failing ``` org.apache.spark.sql.AnalysisException: expression 'default.t.`s`' is neither present in the group by, nor is it an aggregate function ``` as cloud-fan pointed out. ## How was this patch tested? New tests are added. Closes apache#23353 from dbtsai/nestedEqual. Lead-authored-by: DB Tsai <d_tsai@apple.com> Co-authored-by: DB Tsai <dbtsai@dbtsai.com> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
holdenk · Jan 5, 2019 · c03498e · c03498e
1 parent ce19610
commit c03498e
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 1 deletion.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Canonicalize.scala
@@ -26,6 +26,7 @@ package org.apache.spark.sql.catalyst.expressions
  *
  * The following rules are applied:
  *  - Names and nullability hints for [[org.apache.spark.sql.types.DataType]]s are stripped.
+ *  - Names for [[GetStructField]] are stripped.
  *  - Commutative and associative operations ([[Add]] and [[Multiply]]) have their children ordered
  *    by `hashCode`.
  *  - [[EqualTo]] and [[EqualNullSafe]] are reordered by `hashCode`.
@@ -37,10 +38,11 @@ object Canonicalize {
     expressionReorder(ignoreNamesTypes(e))
   }
 
-  /** Remove names and nullability from types. */
+  /** Remove names and nullability from types, and names from `GetStructField`. */
   private[expressions] def ignoreNamesTypes(e: Expression): Expression = e match {
     case a: AttributeReference =>
       AttributeReference("none", a.dataType.asNullable)(exprId = a.exprId)
+    case GetStructField(child, ordinal, Some(_)) => GetStructField(child, ordinal, None)
     case _ => e
   }
 

diff --git a/...catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala b/...catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CanonicalizeSuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.sql.catalyst.expressions
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.sql.catalyst.dsl.plans._
 import org.apache.spark.sql.catalyst.plans.logical.Range
+import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
 
 class CanonicalizeSuite extends SparkFunSuite {
 
@@ -50,4 +51,32 @@ class CanonicalizeSuite extends SparkFunSuite {
     assert(range.where(arrays1).sameResult(range.where(arrays2)))
     assert(!range.where(arrays1).sameResult(range.where(arrays3)))
   }
+
+  test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
+    val expId = NamedExpression.newExprId
+    val qualifier = Seq.empty[String]
+    val structType = StructType(
+      StructField("a", StructType(StructField("b", IntegerType, false) :: Nil), false) :: Nil)
+
+    // GetStructField with different names are semantically equal
+    val fieldA1 = GetStructField(
+      AttributeReference("data1", structType, false)(expId, qualifier),
+      0, Some("a1"))
+    val fieldA2 = GetStructField(
+      AttributeReference("data2", structType, false)(expId, qualifier),
+      0, Some("a2"))
+    assert(fieldA1.semanticEquals(fieldA2))
+
+    val fieldB1 = GetStructField(
+      GetStructField(
+        AttributeReference("data1", structType, false)(expId, qualifier),
+        0, Some("a1")),
+      0, Some("b1"))
+    val fieldB2 = GetStructField(
+      GetStructField(
+        AttributeReference("data2", structType, false)(expId, qualifier),
+        0, Some("a2")),
+      0, Some("b2"))
+    assert(fieldB1.semanticEquals(fieldB2))
+  }
 }
diff --git a/...t/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala b/...t/scala/org/apache/spark/sql/catalyst/optimizer/BinaryComparisonSimplificationSuite.scala
@@ -25,6 +25,7 @@ import org.apache.spark.sql.catalyst.expressions.Literal.{FalseLiteral, TrueLite
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.rules._
+import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
 
 class BinaryComparisonSimplificationSuite extends PlanTest with PredicateHelper {
 
@@ -92,4 +93,33 @@ class BinaryComparisonSimplificationSuite extends PlanTest with PredicateHelper
     val correctAnswer = nonNullableRelation.analyze
     comparePlans(actual, correctAnswer)
   }
+
+  test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
+    val expId = NamedExpression.newExprId
+    val qualifier = Seq.empty[String]
+    val structType = StructType(
+      StructField("a", StructType(StructField("b", IntegerType, false) :: Nil), false) :: Nil)
+
+    val fieldA1 = GetStructField(
+      GetStructField(
+        AttributeReference("data1", structType, false)(expId, qualifier),
+        0, Some("a1")),
+      0, Some("b1"))
+    val fieldA2 = GetStructField(
+      GetStructField(
+        AttributeReference("data2", structType, false)(expId, qualifier),
+        0, Some("a2")),
+      0, Some("b2"))
+
+    // GetStructField with different names are semantically equal; thus, `EqualTo(fieldA1, fieldA2)`
+    // will be optimized to `TrueLiteral` by `SimplifyBinaryComparison`.
+    val originalQuery = nonNullableRelation
+        .where(EqualTo(fieldA1, fieldA2))
+        .analyze
+
+    val optimized = Optimize.execute(originalQuery)
+    val correctAnswer = nonNullableRelation.analyze
+
+    comparePlans(optimized, correctAnswer)
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -2937,6 +2937,25 @@ class SQLQuerySuite extends QueryTest with SharedSQLContext {
       }
     }
   }
+
+  test("SPARK-26402: accessing nested fields with different cases in case insensitive mode") {
+    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
+      val msg = intercept[AnalysisException] {
+        withTable("t") {
+          sql("create table t (s struct<i: Int>) using json")
+          checkAnswer(sql("select s.I from t group by s.i"), Nil)
+        }
+      }.message
+      assert(msg.contains("No such struct field I in i"))
+    }
+
+    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
+      withTable("t") {
+        sql("create table t (s struct<i: Int>) using json")
+        checkAnswer(sql("select s.I from t group by s.i"), Nil)
+      }
+    }
+  }
 }
 
 case class Foo(bar: Option[String])