From f72d2177882dc47b043fdc7dec9a46fe65df4ee9 Mon Sep 17 00:00:00 2001 From: Ryan Blue Date: Sat, 2 Feb 2019 09:17:52 -0800 Subject: [PATCH] [SPARK-26677][BUILD] Update Parquet to 1.10.1 with notEq pushdown fix. ## What changes were proposed in this pull request? Update to Parquet Java 1.10.1. ## How was this patch tested? Added a test from HyukjinKwon that validates the notEq case from SPARK-26677. Closes #23704 from rdblue/SPARK-26677-fix-noteq-parquet-bug. Lead-authored-by: Ryan Blue Co-authored-by: Hyukjin Kwon Co-authored-by: Ryan Blue Signed-off-by: Dongjoon Hyun --- dev/deps/spark-deps-hadoop-2.7 | 10 +++++----- dev/deps/spark-deps-hadoop-3.1 | 10 +++++----- pom.xml | 2 +- .../datasources/parquet/ParquetQuerySuite.scala | 15 +++++++++++++++ 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7 index 0154fd26586b7..d41be281a1926 100644 --- a/dev/deps/spark-deps-hadoop-2.7 +++ b/dev/deps/spark-deps-hadoop-2.7 @@ -161,13 +161,13 @@ orc-shims-1.5.4.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.10.0.jar -parquet-common-1.10.0.jar -parquet-encoding-1.10.0.jar +parquet-column-1.10.1.jar +parquet-common-1.10.1.jar +parquet-encoding-1.10.1.jar parquet-format-2.4.0.jar -parquet-hadoop-1.10.0.jar +parquet-hadoop-1.10.1.jar parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.10.0.jar +parquet-jackson-1.10.1.jar protobuf-java-2.5.0.jar py4j-0.10.8.1.jar pyrolite-4.13.jar diff --git a/dev/deps/spark-deps-hadoop-3.1 b/dev/deps/spark-deps-hadoop-3.1 index 7d5325c55e2e4..a6069c5f8ae88 100644 --- a/dev/deps/spark-deps-hadoop-3.1 +++ b/dev/deps/spark-deps-hadoop-3.1 @@ -178,13 +178,13 @@ orc-shims-1.5.4.jar oro-2.0.8.jar osgi-resource-locator-1.0.1.jar paranamer-2.8.jar -parquet-column-1.10.0.jar -parquet-common-1.10.0.jar -parquet-encoding-1.10.0.jar +parquet-column-1.10.1.jar +parquet-common-1.10.1.jar +parquet-encoding-1.10.1.jar parquet-format-2.4.0.jar -parquet-hadoop-1.10.0.jar +parquet-hadoop-1.10.1.jar parquet-hadoop-bundle-1.6.0.jar -parquet-jackson-1.10.0.jar +parquet-jackson-1.10.1.jar protobuf-java-2.5.0.jar py4j-0.10.8.1.jar pyrolite-4.13.jar diff --git a/pom.xml b/pom.xml index 6676c5dcf979c..cbac15f1dfad1 100644 --- a/pom.xml +++ b/pom.xml @@ -132,7 +132,7 @@ 2.1.0 10.12.1.1 - 1.10.0 + 1.10.1 1.5.4 nohive 1.6.0 diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala index ce1dc6e159c61..beb89d91c9266 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetQuerySuite.scala @@ -890,6 +890,21 @@ class ParquetQuerySuite extends QueryTest with ParquetTest with SharedSQLContext } } } + + test("SPARK-26677: negated null-safe equality comparison should not filter matched row groups") { + (true :: false :: Nil).foreach { vectorized => + withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> vectorized.toString) { + withTempPath { path => + // Repeated values for dictionary encoding. + Seq(Some("A"), Some("A"), None).toDF.repartition(1) + .write.parquet(path.getAbsolutePath) + val df = spark.read.parquet(path.getAbsolutePath) + checkAnswer(stripSparkFilter(df.where("NOT (value <=> 'A')")), df) + } + } + } + } + } object TestingUDT {