Add GpuDeltaParquetFileFormat

Signed-off-by: Jason Lowe <jlowe@nvidia.com>
NVIDIA · jlowe · Sep 22, 2023 · Sep 19, 2023 · Sep 19, 2023 · Sep 21, 2023
commit 5fa81d2f6f6b642568873334f596172fd931120b
diff --git a/...ke/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DeltaProviderImpl.scala b/...ke/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DeltaProviderImpl.scala
@@ -16,16 +16,20 @@
 
 package com.nvidia.spark.rapids.delta
 
-import com.databricks.sql.transaction.tahoe.DeltaLog
+import com.databricks.sql.transaction.tahoe.{DeltaLog, DeltaParquetFileFormat}
 import com.databricks.sql.transaction.tahoe.commands.{DeleteCommand, DeleteCommandEdge, MergeIntoCommand, MergeIntoCommandEdge, UpdateCommand, UpdateCommandEdge}
 import com.databricks.sql.transaction.tahoe.sources.DeltaDataSource
 import com.nvidia.spark.rapids._
 
+import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.connector.read.PartitionReaderFactory
+import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.command.RunnableCommand
-import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand
-import org.apache.spark.sql.rapids.ExternalSource
-import org.apache.spark.sql.sources.CreatableRelationProvider
+import org.apache.spark.sql.execution.datasources.{FileFormat, SaveIntoDataSourceCommand}
+import org.apache.spark.sql.rapids.{ExternalSource, GpuFileSourceScanExec}
+import org.apache.spark.sql.sources.{CreatableRelationProvider, Filter}
+import org.apache.spark.util.SerializableConfiguration
 
 /**
  * Implements the DeltaProvider interface for Databricks Delta Lake.
@@ -72,6 +76,34 @@ object DeltaProviderImpl extends DeltaProviderImplBase {
           .disabledByDefault("Delta Lake update support is experimental")
     ).map(r => (r.getClassFor.asSubclass(classOf[RunnableCommand]), r)).toMap
   }
+
+  override def isSupportedFormat(format: Class[_ <: FileFormat]): Boolean = {
+    format == classOf[DeltaParquetFileFormat] || format.isInstanceOf[GpuDeltaParquetFileFormat]
+  }
+
+  override def tagSupportForGpuFileSourceScan(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
+    val format = meta.wrapped.relation.fileFormat
+    if (format.getClass == classOf[DeltaParquetFileFormat]) {
+      GpuReadParquetFileFormat.tagSupport(meta)
+      GpuDeltaParquetFileFormat.tagSupportForGpuFileSourceScan(meta)
+    } else {
+      meta.willNotWorkOnGpu(s"format ${format.getClass} is not supported")
+    }
+  }
+
+  override def getReadFileFormat(format: FileFormat): FileFormat = {
+    val cpuFormat = format.asInstanceOf[DeltaParquetFileFormat]
+    GpuDeltaParquetFileFormat.convertToGpu(cpuFormat)
+  }
+
+  override def createMultiFileReaderFactory(
+      format: FileFormat,
+      broadcastedConf: Broadcast[SerializableConfiguration],
+      pushedFilters: Array[Filter],
+      fileScan: GpuFileSourceScanExec): PartitionReaderFactory = {
+    val gpuFormat = fileScan.relation.fileFormat.asInstanceOf[GpuDeltaParquetFileFormat]
+    gpuFormat.createMultiFileReaderFactory(broadcastedConf, pushedFilters, fileScan)
+  }
 }
 
 class DeltaCreatableRelationProviderMeta(

diff --git a/...c/main/databricks/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatBase.scala b/...c/main/databricks/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatBase.scala
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.delta
+
+import com.databricks.sql.transaction.tahoe.{DeltaColumnMapping, DeltaColumnMappingMode, NoMapping}
+import com.nvidia.spark.rapids.{GpuMetric, GpuParquetMultiFilePartitionReaderFactory, GpuReadParquetFileFormat}
+import org.apache.hadoop.conf.Configuration
+
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.connector.read.PartitionReaderFactory
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+import org.apache.spark.sql.rapids.GpuFileSourceScanExec
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.SerializableConfiguration
+
+abstract class GpuDeltaParquetFileFormatBase extends GpuReadParquetFileFormat {
+  val columnMappingMode: DeltaColumnMappingMode
+  val referenceSchema: StructType
+
+  def prepareSchema(inputSchema: StructType): StructType = {
+    DeltaColumnMapping.createPhysicalSchema(inputSchema, referenceSchema, columnMappingMode)
+  }
+
+  def createMultiFileReaderFactory(
+      broadcastedConf: Broadcast[SerializableConfiguration],
+      pushedFilters: Array[Filter],
+      fileScan: GpuFileSourceScanExec): PartitionReaderFactory = {
+    GpuParquetMultiFilePartitionReaderFactory(
+      fileScan.conf,
+      broadcastedConf,
+      prepareSchema(fileScan.relation.dataSchema),
+      prepareSchema(fileScan.requiredSchema),
+      prepareSchema(fileScan.readPartitionSchema),
+      pushedFilters,
+      fileScan.rapidsConf,
+      fileScan.allMetrics,
+      fileScan.queryUsesInputFile,
+      fileScan.alluxioPathsMap)
+  }
+
+  override def buildReaderWithPartitionValuesAndMetrics(
+      sparkSession: SparkSession,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      requiredSchema: StructType,
+      filters: Seq[Filter],
+      options: Map[String, String],
+      hadoopConf: Configuration,
+      metrics: Map[String, GpuMetric],
+      alluxioPathReplacementMap: Option[Map[String, String]])
+  : PartitionedFile => Iterator[InternalRow] = {
+    super.buildReaderWithPartitionValuesAndMetrics(
+      sparkSession,
+      prepareSchema(dataSchema),
+      prepareSchema(partitionSchema),
+      prepareSchema(requiredSchema),
+      filters,
+      options,
+      hadoopConf,
+      metrics,
+      alluxioPathReplacementMap)
+  }
+
+  override def supportFieldName(name: String): Boolean = {
+    if (columnMappingMode != NoMapping) true else super.supportFieldName(name)
+  }
+}
diff --git a/...mon/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala b/...mon/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormat.scala
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.delta
+
+import com.nvidia.spark.rapids.{GpuMetric, GpuParquetMultiFilePartitionReaderFactory, GpuReadParquetFileFormat}
+import org.apache.hadoop.conf.Configuration
+
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.catalyst.InternalRow
+import org.apache.spark.sql.connector.read.PartitionReaderFactory
+import org.apache.spark.sql.delta.{DeltaColumnMapping, DeltaColumnMappingMode, NoMapping}
+import org.apache.spark.sql.execution.datasources.PartitionedFile
+import org.apache.spark.sql.rapids.GpuFileSourceScanExec
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.util.SerializableConfiguration
+
+trait GpuDeltaParquetFileFormat extends GpuReadParquetFileFormat {
+  val columnMappingMode: DeltaColumnMappingMode
+  val referenceSchema: StructType
+
+  def prepareSchema(inputSchema: StructType): StructType = {
+    DeltaColumnMapping.createPhysicalSchema(inputSchema, referenceSchema, columnMappingMode)
+  }
+
+  def createMultiFileReaderFactory(
+      broadcastedConf: Broadcast[SerializableConfiguration],
+      pushedFilters: Array[Filter],
+      fileScan: GpuFileSourceScanExec): PartitionReaderFactory = {
+    GpuParquetMultiFilePartitionReaderFactory(
+      fileScan.conf,
+      broadcastedConf,
+      prepareSchema(fileScan.relation.dataSchema),
+      prepareSchema(fileScan.requiredSchema),
+      prepareSchema(fileScan.readPartitionSchema),
+      pushedFilters,
+      fileScan.rapidsConf,
+      fileScan.allMetrics,
+      fileScan.queryUsesInputFile,
+      fileScan.alluxioPathsMap)
+  }
+
+  override def buildReaderWithPartitionValuesAndMetrics(
+      sparkSession: SparkSession,
+      dataSchema: StructType,
+      partitionSchema: StructType,
+      requiredSchema: StructType,
+      filters: Seq[Filter],
+      options: Map[String, String],
+      hadoopConf: Configuration,
+      metrics: Map[String, GpuMetric],
+      alluxioPathReplacementMap: Option[Map[String, String]])
+  : PartitionedFile => Iterator[InternalRow] = {
+    super.buildReaderWithPartitionValuesAndMetrics(
+      sparkSession,
+      prepareSchema(dataSchema),
+      prepareSchema(partitionSchema),
+      prepareSchema(requiredSchema),
+      filters,
+      options,
+      hadoopConf,
+      metrics,
+      alluxioPathReplacementMap)
+  }
+
+  override def supportFieldName(name: String): Boolean = {
+    if (columnMappingMode != NoMapping) true else super.supportFieldName(name)
+  }
+}
diff --git a/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProviderImplBase.scala b/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/DeltaProviderImplBase.scala
@@ -16,10 +16,11 @@
 
 package com.nvidia.spark.rapids.delta
 
-import com.nvidia.spark.rapids.{ExecChecks, ExecRule, GpuOverrides, TypeSig}
+import com.nvidia.spark.rapids.{ExecChecks, ExecRule, GpuOverrides, RapidsConf, TypeSig}
 
 import org.apache.spark.sql.Strategy
 import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.datasources.FileFormat
 
 abstract class DeltaProviderImplBase extends DeltaProvider {
 
@@ -40,4 +41,8 @@ abstract class DeltaProviderImplBase extends DeltaProvider {
   override def getStrategyRules: Seq[Strategy] = Seq(
     RapidsProcessDeltaMergeJoinStrategy,
     RapidsDeltaWriteStrategy)
+
+  override def isPerFileReadEnabledForFormat(format: FileFormat, conf: RapidsConf): Boolean = {
+    conf.isParquetPerFileReadEnabled
+  }
 }
diff --git a/...ke/delta-20x/src/main/scala/com/nvidia/spark/rapids/delta/delta20x/Delta20xProvider.scala b/...ke/delta-20x/src/main/scala/com/nvidia/spark/rapids/delta/delta20x/Delta20xProvider.scala
@@ -16,11 +16,19 @@
 
 package com.nvidia.spark.rapids.delta.delta20x
 
-import com.nvidia.spark.rapids.{GpuOverrides, RunnableCommandRule}
-import com.nvidia.spark.rapids.delta.DeltaIOProvider
+import com.nvidia.spark.rapids.{GpuOverrides, GpuReadParquetFileFormat, RunnableCommandRule, SparkPlanMeta}
+import com.nvidia.spark.rapids.delta.{DeltaIOProvider, GpuDeltaParquetFileFormat}
 
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.sql.connector.read.PartitionReaderFactory
+import org.apache.spark.sql.delta.DeltaParquetFileFormat
 import org.apache.spark.sql.delta.commands.{DeleteCommand, MergeIntoCommand, UpdateCommand}
+import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.command.RunnableCommand
+import org.apache.spark.sql.execution.datasources.FileFormat
+import org.apache.spark.sql.rapids.GpuFileSourceScanExec
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.util.SerializableConfiguration
 
 object Delta20xProvider extends DeltaIOProvider {
 
@@ -41,4 +49,31 @@ object Delta20xProvider extends DeltaIOProvider {
           .disabledByDefault("Delta Lake update support is experimental")
     ).map(r => (r.getClassFor.asSubclass(classOf[RunnableCommand]), r)).toMap
   }
+
+  override def isSupportedFormat(format: Class[_ <: FileFormat]): Boolean = {
+    format == classOf[DeltaParquetFileFormat] || format == classOf[GpuDelta20xParquetFileFormat]
+  }
+
+  override def tagSupportForGpuFileSourceScan(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
+    val format = meta.wrapped.relation.fileFormat
+    if (format.getClass == classOf[DeltaParquetFileFormat]) {
+      GpuReadParquetFileFormat.tagSupport(meta)
+    } else {
+      meta.willNotWorkOnGpu(s"format ${format.getClass} is not supported")
+    }
+  }
+
+  override def getReadFileFormat(format: FileFormat): FileFormat = {
+    val cpuFormat = format.asInstanceOf[DeltaParquetFileFormat]
+    GpuDelta20xParquetFileFormat(cpuFormat.columnMappingMode, cpuFormat.referenceSchema)
+  }
+
+  override def createMultiFileReaderFactory(
+      format: FileFormat,
+      broadcastedConf: Broadcast[SerializableConfiguration],
+      pushedFilters: Array[Filter],
+      fileScan: GpuFileSourceScanExec): PartitionReaderFactory = {
+    val gpuFormat = fileScan.relation.fileFormat.asInstanceOf[GpuDeltaParquetFileFormat]
+    gpuFormat.createMultiFileReaderFactory(broadcastedConf, pushedFilters, fileScan)
+  }
 }
diff --git a/.../src/main/scala/com/nvidia/spark/rapids/delta/delta20x/GpuDelta20xParquetFileFormat.scala b/.../src/main/scala/com/nvidia/spark/rapids/delta/delta20x/GpuDelta20xParquetFileFormat.scala
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.delta.delta20x
+
+import com.nvidia.spark.rapids.delta.GpuDeltaParquetFileFormat
+
+import org.apache.spark.sql.delta.DeltaColumnMappingMode
+import org.apache.spark.sql.types.StructType
+
+case class GpuDelta20xParquetFileFormat(
+    override val columnMappingMode: DeltaColumnMappingMode,
+    override val referenceSchema: StructType) extends GpuDeltaParquetFileFormat {
+}
diff --git a/...ke/delta-21x/src/main/scala/com/nvidia/spark/rapids/delta/delta21x/Delta21xProvider.scala b/...ke/delta-21x/src/main/scala/com/nvidia/spark/rapids/delta/delta21x/Delta21xProvider.scala
@@ -16,11 +16,19 @@
 
 package com.nvidia.spark.rapids.delta.delta21x
 
-import com.nvidia.spark.rapids.{GpuOverrides, RunnableCommandRule}
-import com.nvidia.spark.rapids.delta.DeltaIOProvider
+import com.nvidia.spark.rapids.{GpuOverrides, GpuReadParquetFileFormat, RunnableCommandRule, SparkPlanMeta}
+import com.nvidia.spark.rapids.delta.{DeltaIOProvider, GpuDeltaParquetFileFormat}
 
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.sql.connector.read.PartitionReaderFactory
+import org.apache.spark.sql.delta.DeltaParquetFileFormat
 import org.apache.spark.sql.delta.commands.{DeleteCommand, MergeIntoCommand, UpdateCommand}
+import org.apache.spark.sql.execution.FileSourceScanExec
 import org.apache.spark.sql.execution.command.RunnableCommand
+import org.apache.spark.sql.execution.datasources.FileFormat
+import org.apache.spark.sql.rapids.GpuFileSourceScanExec
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.util.SerializableConfiguration
 
 object Delta21xProvider extends DeltaIOProvider {
 
@@ -41,4 +49,31 @@ object Delta21xProvider extends DeltaIOProvider {
           .disabledByDefault("Delta Lake update support is experimental")
     ).map(r => (r.getClassFor.asSubclass(classOf[RunnableCommand]), r)).toMap
   }
+
+  override def isSupportedFormat(format: Class[_ <: FileFormat]): Boolean = {
+    format == classOf[DeltaParquetFileFormat] || format == classOf[GpuDelta21xParquetFileFormat]
+  }
+
+  override def tagSupportForGpuFileSourceScan(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
+    val format = meta.wrapped.relation.fileFormat
+    if (format.getClass == classOf[DeltaParquetFileFormat]) {
+      GpuReadParquetFileFormat.tagSupport(meta)
+    } else {
+      meta.willNotWorkOnGpu(s"format ${format.getClass} is not supported")
+    }
+  }
+
+  override def getReadFileFormat(format: FileFormat): FileFormat = {
+    val cpuFormat = format.asInstanceOf[DeltaParquetFileFormat]
+    GpuDelta21xParquetFileFormat(cpuFormat.columnMappingMode, cpuFormat.referenceSchema)
+  }
+
+  override def createMultiFileReaderFactory(
+      format: FileFormat,
+      broadcastedConf: Broadcast[SerializableConfiguration],
+      pushedFilters: Array[Filter],
+      fileScan: GpuFileSourceScanExec): PartitionReaderFactory = {
+    val gpuFormat = fileScan.relation.fileFormat.asInstanceOf[GpuDeltaParquetFileFormat]
+    gpuFormat.createMultiFileReaderFactory(broadcastedConf, pushedFilters, fileScan)
+  }
 }