apache · fjh100456 · Sep 13, 2017 · Sep 14, 2017 · Sep 15, 2017 · Sep 15, 2017
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/execution/SaveAsHiveFile.scala
@@ -68,6 +68,30 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
         .get("mapreduce.output.fileoutputformat.compress.type"))
     }
 
+    fileSinkConf.tableInfo.getOutputFileFormatClassName match {
+      case formatName if formatName.toLowerCase.endsWith("parquetoutputformat") =>
+        val compressionConf = "parquet.compression"
+        val compressionCodec = getCompressionByPriority(
+          fileSinkConf,
+          compressionConf,
+          default = sparkSession.sessionState.conf.parquetCompressionCodec) match {
+          case "NONE" => "UNCOMPRESSED"
+          case _@x => x
+        }
+        hadoopConf.set(compressionConf, compressionCodec)
+      case formatName if formatName.endsWith("OrcOutputFormat") =>
+        val compressionConf = "orc.compress"
+        val compressionCodec = getCompressionByPriority(
+          fileSinkConf,
+          compressionConf,
+          default = sparkSession.sessionState.conf.orcCompressionCodec) match {
+          case "UNCOMPRESSED" => "NONE"
+          case _@x => x
+        }
+        hadoopConf.set(compressionConf, compressionCodec)
+      case _ =>
+    }
+
     val committer = FileCommitProtocol.instantiate(
       sparkSession.sessionState.conf.fileCommitProtocolClass,
       jobId = java.util.UUID.randomUUID().toString,
@@ -86,6 +110,19 @@ private[hive] trait SaveAsHiveFile extends DataWritingCommand {
       options = Map.empty)
   }
 
+  /**
+   * Resolves the compression codec name for a Hive write, returned in upper case.
+   *
+   * Compression can be configured at several levels, so the first setting found in this
+   * order is used:
+   * For parquet: `compression` > `parquet.compression` > `spark.sql.parquet.compression.codec`
+   * For orc: `compression` > `orc.compress` > `spark.sql.orc.compression.codec`
+   *
+   * @param fileSinkConf sink descriptor whose table properties are consulted first
+   * @param compressionConf format-specific table property, e.g. `parquet.compression`
+   * @param default fallback used when neither table property is set; callers are expected
+   *                to pass the matching Spark SQL conf value
+   */
+  private def getCompressionByPriority(
+      fileSinkConf: FileSinkDesc,
+      compressionConf: String,
+      default: String): String = {
+    val tableProps = fileSinkConf.tableInfo.getProperties
+    val tableLevelCodec = Seq("compression", compressionConf)
+      .flatMap(key => Option(tableProps.getProperty(key)))
+      .headOption
+    tableLevelCodec.getOrElse(default).toUpperCase(Locale.ROOT)
+  }
+
   protected def getExternalTmpPath(
       sparkSession: SparkSession,
       hadoopConf: Configuration,

diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/InsertSuite.scala
@@ -19,14 +19,18 @@ package org.apache.spark.sql.hive
 
 import java.io.File
 
+import scala.collection.JavaConverters._
+
+import org.apache.hadoop.fs.Path
 import org.scalatest.BeforeAndAfter
 
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{QueryTest, _}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.catalyst.plans.logical.InsertIntoTable
+import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
+import org.apache.spark.sql.hive.orc.OrcFileOperator
 import org.apache.spark.sql.hive.test.TestHiveSingleton
-import org.apache.spark.sql.test.SQLTestUtils
 import org.apache.spark.sql.types._
 import org.apache.spark.util.Utils
 
@@ -35,7 +39,7 @@ case class TestData(key: Int, value: String)
 case class ThreeCloumntable(key: Int, value: String, key1: String)
 
 class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
-    with SQLTestUtils {
+    with ParquetTest {
   import spark.implicits._
 
   override lazy val testData = spark.sparkContext.parallelize(
@@ -728,4 +732,254 @@ class InsertSuite extends QueryTest with TestHiveSingleton with BeforeAndAfter
       assert(e.contains("mismatched input 'ROW'"))
     }
   }
+
+  // Name of the SQL conf that toggles metastore conversion for `format`
+  // ("parquet" or "orc"; any other value fails with a MatchError).
+  private def getConvertMetastoreConfName(format: String): String = {
+    format match {
+      case "parquet" => "spark.sql.hive.convertMetastoreParquet"
+      case "orc" => "spark.sql.hive.convertMetastoreOrc"
+    }
+  }
+
+  // Name of the session-level SQL conf that selects the compression codec for `format`
+  // ("parquet" or "orc"; any other value fails with a MatchError).
+  private def getSparkCompressionConfName(format: String): String = {
+    format match {
+      case "parquet" => "spark.sql.parquet.compression.codec"
+      case "orc" => "spark.sql.orc.compression.codec"
+    }
+  }
+
+  // Name of the table property that selects the compression codec for `format`.
+  // The format name is matched case-insensitively ("parquet" or "orc"); any other value
+  // fails with a MatchError.
+  private def getTableCompressPropName(format: String): String = {
+    // Locale.ROOT keeps the case folding independent of the JVM's default locale.
+    format.toLowerCase(java.util.Locale.ROOT) match {
+      case "parquet" => "parquet.compression"
+      case "orc" => "orc.compress"
+    }
+  }
+
+  /**
+   * Reads back the compression codec recorded in the data files under `path`.
+   *
+   * For "parquet" the codec of every column of every block in every footer is collected;
+   * for "orc" the codec reported by each data file's reader is collected. The check fails
+   * unless exactly one distinct codec is found, and that codec's name is returned.
+   */
+  private def getTableCompressionCodec(path: String, format: String): String = {
+    val hadoopConf = spark.sessionState.newHadoopConf()
+    val codecs: Seq[String] = format match {
+      case "parquet" =>
+        readAllFootersWithoutSummaryFiles(new Path(path), hadoopConf).flatMap { footer =>
+          footer.getParquetMetadata.getBlocks.asScala.flatMap(_.getColumns.asScala)
+        }.map(_.getCodec.name())
+      case "orc" =>
+        // `listFiles` returns null when `path` is not a readable directory; treat that as
+        // "no files" so the assertion below reports it instead of a NullPointerException.
+        val entries = Option(new File(path).listFiles()).map(_.toSeq).getOrElse(Seq.empty)
+        // Skip checksum side files and the `_SUCCESS` marker; only data files are inspected.
+        val dataFiles = entries.filter { f =>
+          f.isFile && !f.getName.endsWith(".crc") && f.getName != "_SUCCESS"
+        }
+        dataFiles.map { f =>
+          val reader = OrcFileOperator.getFileReader(f.toPath.toString)
+            .getOrElse(fail(s"Could not open ORC reader for $f"))
+          reader.getCompression.toString
+        }
+    }
+
+    assert(codecs.distinct.length == 1,
+      s"Expected exactly one compression codec under $path but found: ${codecs.distinct}")
+    codecs.head
+  }
+
+  /**
+   * Creates the Hive table `tableName` under `rootDir` and overwrites it with the rows of
+   * the `table_source` view.
+   *
+   * @param rootDir directory under which the table location `rootDir/tableName` is placed
+   * @param tableName name of the table to create
+   * @param isPartitioned whether to partition the table by `p` and insert into `p=10000`
+   * @param format storage format used in `STORED AS`
+   * @param compressionCodec when defined, set as the format's table-level compression
+   *                         property via `TBLPROPERTIES`
+   */
+  private def writeDataToTable(
+      rootDir: File,
+      tableName: String,
+      isPartitioned: Boolean,
+      format: String,
+      compressionCodec: Option[String]): Unit = {
+    // No TBLPROPERTIES clause at all when the table has no table-level codec.
+    val tblProperties = compressionCodec.fold("") { prop =>
+      s"TBLPROPERTIES('${getTableCompressPropName(format)}'='$prop')"
+    }
+    val partitionCreate = if (isPartitioned) "PARTITIONED BY (p int)" else ""
+    sql(
+      s"""
+         |CREATE TABLE $tableName(a int)
+         |$partitionCreate
+         |STORED AS $format
+         |LOCATION '${rootDir.toURI.toString.stripSuffix("/")}/$tableName'
+         |$tblProperties
+       """.stripMargin)
+
+    val partitionInsert = if (isPartitioned) "partition (p=10000)" else ""
+    sql(
+      s"""
+         |INSERT OVERWRITE TABLE $tableName
+         |$partitionInsert
+         |SELECT * from table_source
+       """.stripMargin)
+  }
+
+  // Creates a table with the given layout and table-level codec inside a temporary
+  // directory, inserts into it, and passes the codec found in the written files to
+  // `assertion`.
+  private def checkCompressionCodecForTable(
+      format: String,
+      isPartitioned: Boolean,
+      compressionCodec: Option[String])(assertion: String => Unit): Unit = {
+    val tableName = s"tbl_$format$isPartitioned"
+    // Data files sit directly in the table directory unless the table is partitioned.
+    val partitionDir = if (isPartitioned) "p=10000" else ""
+    withTempDir { tmpDir =>
+      withTable(tableName) {
+        writeDataToTable(tmpDir, tableName, isPartitioned, format, compressionCodec)
+        val baseDir = tmpDir.getPath.stripSuffix("/")
+        val actualCodec = getTableCompressionCodec(s"$baseDir/$tableName/$partitionDir", format)
+        assertion(actualCodec)
+      }
+    }
+  }
+
+  // Runs `assertion` once for every pairing of a table-level codec with a session-level
+  // codec. `assertion` receives (table-level codec, session-level codec, codec actually
+  // found in the written files). A null entry in `tableCompressionCodecs` stands for a
+  // table created without a compression property and is passed on as None.
+  private def checkTableCompressionCodecForCodecs(
+      format: String,
+      isPartitioned: Boolean,
+      convertMetastore: Boolean,
+      compressionCodecs: List[String],
+      tableCompressionCodecs: List[String])(
+      assertion: (Option[String], String, String) => Unit): Unit = {
+    withSQLConf(getConvertMetastoreConfName(format) -> convertMetastore.toString) {
+      for {
+        tableCodec <- tableCompressionCodecs.map(Option(_))
+        sessionCodec <- compressionCodecs
+      } withSQLConf(getSparkCompressionConfName(format) -> sessionCodec) {
+        checkCompressionCodecForTable(format, isPartitioned, tableCodec) { actualCodec =>
+          assertion(tableCodec, sessionCodec, actualCodec)
+        }
+      }
+    }
+  }
+
+  // Registers a SPARK-21786 test named after `testCondition`; `f` runs while a temporary
+  // view `table_source` holding the integers 0 until 100000 (column `a`) is registered.
+  private def testCompressionCodec(testCondition: String)(f: => Unit): Unit = {
+    val testName = "[SPARK-21786] - Check the priority between table-level compression and " +
+      s"session-level compression $testCondition"
+    test(testName) {
+      withTempView("table_source") {
+        val source = (0 until 100000).toDF("a")
+        source.createOrReplaceTempView("table_source")
+        f
+      }
+    }
+  }
+
+  testCompressionCodec("when table-level and session-level compression are both configured and " +
+    "convertMetastore is false") {
+    // For tables with a table-level compression property, when
+    // 'spark.sql.hive.convertMetastore[Parquet|Orc]' is set to 'false', partitioned and
+    // non-partitioned tables are both expected to take the table-level compression
+    // configuration and ignore the session-level one.
+    def checkForTableWithCompressProp(format: String, compressCodecs: List[String]): Unit = {
+      // Partitioned table first, then non-partitioned; the expectation is the same.
+      Seq(true, false).foreach { isPartitioned =>
+        checkTableCompressionCodecForCodecs(
+          format = format,
+          isPartitioned = isPartitioned,
+          convertMetastore = false,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = compressCodecs) {
+          case (tableCodec, _, actualCodec) =>
+            // Expect the table-level codec to take effect. Comparing Options (rather than
+            // calling `.get`) makes a missing table codec an assertion failure.
+            assert(tableCodec == Some(actualCodec))
+        }
+      }
+    }
+
+    checkForTableWithCompressProp("parquet", List("UNCOMPRESSED", "SNAPPY", "GZIP"))
+    checkForTableWithCompressProp("orc", List("NONE", "SNAPPY", "ZLIB"))
+  }
+
+  testCompressionCodec("when there's no table-level compression and convertMetastore is false") {
+    // For tables without a table-level compression property, the session-level compression
+    // configuration is expected to take effect, with convertMetastore set to false.
+    def checkForTableWithoutCompressProp(format: String, compressCodecs: List[String]): Unit = {
+      // Partitioned table first, then non-partitioned; the expectation is the same.
+      for (isPartitioned <- Seq(true, false)) {
+        checkTableCompressionCodecForCodecs(
+          format = format,
+          isPartitioned = isPartitioned,
+          convertMetastore = false,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = List(null)) {
+          case (_, sessionCodec, actualCodec) =>
+            // Expect the session-level codec to take effect.
+            assert(sessionCodec == actualCodec)
+        }
+      }
+    }
+
+    checkForTableWithoutCompressProp("parquet", List("UNCOMPRESSED", "SNAPPY", "GZIP"))
+    checkForTableWithoutCompressProp("orc", List("NONE", "SNAPPY", "ZLIB"))
+  }
+
+  testCompressionCodec("when table-level and session-level compression are both configured and " +
+    "convertMetastore is true") {
+    // For tables with a table-level compression property, when
+    // 'spark.sql.hive.convertMetastore[Parquet|Orc]' is set to 'true', partitioned tables
+    // are expected to take the table-level compression configuration, while
+    // non-partitioned tables are expected to take the session-level one.
+    def checkForTableWithCompressProp(format: String, compressCodecs: List[String]): Unit = {
+      // Partitioned table first, then non-partitioned.
+      Seq(true, false).foreach { isPartitioned =>
+        checkTableCompressionCodecForCodecs(
+          format = format,
+          isPartitioned = isPartitioned,
+          convertMetastore = true,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = compressCodecs) {
+          case (tableCodec, sessionCodec, actualCodec) =>
+            // Comparing Options (rather than calling `.get`) makes a missing table codec
+            // an assertion failure.
+            val expectedCodec = if (isPartitioned) tableCodec else Some(sessionCodec)
+            assert(expectedCodec == Some(actualCodec))
+        }
+      }
+    }
+
+    checkForTableWithCompressProp("parquet", List("UNCOMPRESSED", "SNAPPY", "GZIP"))
+    checkForTableWithCompressProp("orc", List("NONE", "SNAPPY", "ZLIB"))
+  }
+
+  testCompressionCodec("when there's no table-level compression and convertMetastore is true") {
+    // For tables without a table-level compression property, the session-level compression
+    // configuration is expected to take effect, with convertMetastore set to true.
+    def checkForTableWithoutCompressProp(format: String, compressCodecs: List[String]): Unit = {
+      // Partitioned table first, then non-partitioned; the expectation is the same.
+      for (isPartitioned <- Seq(true, false)) {
+        checkTableCompressionCodecForCodecs(
+          format = format,
+          isPartitioned = isPartitioned,
+          convertMetastore = true,
+          compressionCodecs = compressCodecs,
+          tableCompressionCodecs = List(null)) {
+          case (_, sessionCodec, actualCodec) =>
+            // Expect the session-level codec to take effect.
+            assert(sessionCodec == actualCodec)
+        }
+      }
+    }
+
+    checkForTableWithoutCompressProp("parquet", List("UNCOMPRESSED", "SNAPPY", "GZIP"))
+    checkForTableWithoutCompressProp("orc", List("NONE", "SNAPPY", "ZLIB"))
+  }
 }