# [Spark] Add Preserving Row Tracking in Update (#2927)

#### Which Delta project/connector is this regarding?

- [X] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description
Preserve row IDs in UPDATE by reading the row ID metadata column and writing it back out to the physical (materialized) row ID column. Row commit versions are written as `null` in the update expressions so that updated rows pick up the version of the UPDATE commit.
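
For illustration only, a minimal sketch of the intended end-to-end behavior, assuming a Delta table with row tracking enabled; the table and `value` column are hypothetical, and `_metadata.row_id` is the row ID metadata column:

```scala
// Hypothetical table with row tracking enabled:
//   CREATE TABLE target (id LONG, value LONG) USING delta
//   TBLPROPERTIES ('delta.enableRowTracking' = 'true')
val before = spark.read.table("target")
  .selectExpr("id", "_metadata.row_id AS row_id")
  .collect().toSet

spark.sql("UPDATE target SET value = value + 1 WHERE id % 2 = 0")

// With this change, each logical row keeps its row ID across the UPDATE,
// because the metadata column is read and written to the physical column.
val after = spark.read.table("target")
  .selectExpr("id", "_metadata.row_id AS row_id")
  .collect().toSet
assert(before == after)
```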

## How was this patch tested?
Added unit tests in a new `RowTrackingUpdateSuite` (plus deletion vector, CDF, and column mapping variants), and ran the existing `UpdateSQLSuite`/`UpdateCDCSuite` with row tracking enabled.
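
In outline, the added tests assert row ID stability around the UPDATE, simplified here from the `checkAndExecuteUpdate` helper in the diff below (all names come from that helper's context):

```scala
// Collect (id, row ID) pairs before the update.
val expectedRowIds =
  spark.read.table(tableName).select("id", RowId.QUALIFIED_COLUMN_NAME).collect()

executeUpdate(tableName, condition, newVersion)

val actualRowIds =
  spark.read.table(tableName).select("id", RowId.QUALIFIED_COLUMN_NAME)
checkAnswer(actualRowIds, expectedRowIds) // row IDs survive the UPDATE
assertRowIdsAreValid(log)                 // and remain valid for the table
```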

## Does this PR introduce _any_ user-facing changes?
No.
longvu-db committed Apr 22, 2024
1 parent 4b835e9 commit 2878f61
Showing 3 changed files with 340 additions and 6 deletions.
UpdateCommand.scala:

@@ -20,7 +20,7 @@ package org.apache.spark.sql.delta.commands
 import java.util.concurrent.TimeUnit
 
 import org.apache.spark.sql.delta.metric.IncrementMetric
-import org.apache.spark.sql.delta.{DeltaConfigs, DeltaLog, DeltaOperations, DeltaTableUtils, OptimisticTransaction}
+import org.apache.spark.sql.delta._
 import org.apache.spark.sql.delta.actions.{AddCDCFile, AddFile, FileAction}
 import org.apache.spark.sql.delta.commands.cdc.CDCReader.{CDC_TYPE_COLUMN_NAME, CDC_TYPE_NOT_CDC, CDC_TYPE_UPDATE_POSTIMAGE, CDC_TYPE_UPDATE_PREIMAGE}
 import org.apache.spark.sql.delta.files.{TahoeBatchFileIndex, TahoeFileIndex}
@@ -295,7 +295,10 @@ case class UpdateCommand(
     txn.registerSQLMetrics(sparkSession, metrics)
 
     val finalActions = createSetTransaction(sparkSession, deltaLog).toSeq ++ totalActions
-    txn.commitIfNeeded(finalActions, DeltaOperations.Update(condition))
+    txn.commitIfNeeded(
+      actions = finalActions,
+      op = DeltaOperations.Update(condition),
+      tags = RowTracking.addPreservedRowTrackingTagIfNotSet(txn.snapshot))
     sendDriverMetrics(sparkSession, metrics)
 
     recordDeltaEvent(
@@ -342,7 +345,12 @@
     val baseRelation = buildBaseRelation(
       spark, txn, "update", rootPath, inputLeafFiles.map(_.path), nameToAddFileMap)
     val newTarget = DeltaTableUtils.replaceFileIndex(target, baseRelation.location)
-    val targetDf = Dataset.ofRows(spark, newTarget)
+    val (targetDf, finalOutput, finalUpdateExpressions) = UpdateCommand.preserveRowTrackingColumns(
+      targetDfWithoutRowTrackingColumns = Dataset.ofRows(spark, newTarget),
+      snapshot = txn.snapshot,
+      targetOutput = target.output,
+      updateExpressions)
+
     val targetDfWithEvaluatedCondition = {
       val evalDf = targetDf.withColumn(UpdateCommand.CONDITION_COLUMN_NAME, new Column(condition))
       val copyAndUpdateRowsDf = if (copyUnmodifiedRows) {
@@ -354,8 +362,8 @@
     }
 
     val updatedDataFrame = UpdateCommand.withUpdatedColumns(
-      target.output,
-      updateExpressions,
+      finalOutput,
+      finalUpdateExpressions,
       condition,
       targetDfWithEvaluatedCondition,
       UpdateCommand.shouldOutputCdc(txn))
@@ -469,6 +477,42 @@ object UpdateCommand {

     resultDf.drop(CONDITION_COLUMN_NAME)
   }
+
+  /**
+   * Preserve the row tracking columns when performing an UPDATE.
+   *
+   * @param targetDfWithoutRowTrackingColumns The target DataFrame on which the UPDATE
+   *                                          operation is to be performed.
+   * @param snapshot                          Snapshot of the Delta table at the start of
+   *                                          the transaction.
+   * @param targetOutput                      The output schema of the target DataFrame.
+   * @param updateExpressions                 The update transformation to perform on the
+   *                                          target DataFrame.
+   * @return
+   *         1. targetDf: The target DataFrame that includes the preserved row tracking columns.
+   *         2. finalOutput: The final output schema, including the preserved row tracking columns.
+   *         3. finalUpdateExpressions: The final update expressions, including transformations
+   *            for the preserved row tracking columns.
+   */
+  def preserveRowTrackingColumns(
+      targetDfWithoutRowTrackingColumns: DataFrame,
+      snapshot: Snapshot,
+      targetOutput: Seq[Attribute] = Seq.empty,
+      updateExpressions: Seq[Expression] = Seq.empty):
+    (DataFrame, Seq[Attribute], Seq[Expression]) = {
+    val targetDf = RowTracking.preserveRowTrackingColumns(
+      targetDfWithoutRowTrackingColumns, snapshot)
+
+    val rowIdAttributeOpt = MaterializedRowId.getAttribute(snapshot, targetDf)
+    val rowCommitVersionAttributeOpt =
+      MaterializedRowCommitVersion.getAttribute(snapshot, targetDf)
+    val finalOutput = targetOutput ++ rowIdAttributeOpt ++ rowCommitVersionAttributeOpt
+
+    val finalUpdateExpressions = updateExpressions ++
+      rowIdAttributeOpt ++
+      rowCommitVersionAttributeOpt.map(_ => Literal(null, LongType))
+    (targetDf, finalOutput, finalUpdateExpressions)
+  }
 }
 
 /**
RowIdTestUtils.scala:

@@ -234,5 +234,4 @@ trait RowIdTestUtils extends RowTrackingTestUtils with DeltaSQLCommandTest with
   def checkRowTrackingMarkedAsPreservedForCommit(log: DeltaLog)(operation: => Unit): Unit = {
     assert(rowTrackingMarkedAsPreservedForCommit(log)(operation))
   }
-
 }
RowTrackingUpdateSuite.scala (new file):

@@ -0,0 +1,291 @@
/*
 * Copyright (2021) The Delta Lake Project Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.delta.rowid

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.DeltaTestUtils.BOOLEAN_DOMAIN
import org.apache.spark.sql.delta.cdc.UpdateCDCSuite
import org.apache.spark.sql.delta.rowtracking.RowTrackingTestUtils
import org.apache.spark.sql.delta.sources.DeltaSQLConf

import org.apache.spark.SparkConf
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.functions.{col, lit}

trait RowTrackingUpdateSuiteBase
  extends RowIdTestUtils {

  protected def dvsEnabled: Boolean = false

  protected val numRowsTarget = 3000
  protected val numRowsPerFile = 250
  protected val numFiles: Int = numRowsTarget / numRowsPerFile

  protected val targetTableName = "target"

  override protected def sparkConf: SparkConf = {
    super.sparkConf
      .set(DeltaConfigs.ROW_TRACKING_ENABLED.defaultTablePropertyKey, value = "true")
      .set(DeltaConfigs.ENABLE_DELETION_VECTORS_CREATION.defaultTablePropertyKey,
        dvsEnabled.toString)
      .set(DeltaSQLConf.DELETE_USE_PERSISTENT_DELETION_VECTORS.key, dvsEnabled.toString)
      .set(DeltaSQLConf.UPDATE_USE_PERSISTENT_DELETION_VECTORS.key, dvsEnabled.toString)
      .set(DeltaSQLConf.MERGE_USE_PERSISTENT_DELETION_VECTORS.key, dvsEnabled.toString)
  }

  protected def writeTestTable(
      tableName: String,
      isPartitioned: Boolean,
      lastModifiedVersion: Long = 0L): Unit = {
    // Disable optimized writes to write out the specified number of files.
    withSQLConf(DeltaSQLConf.DELTA_OPTIMIZE_WRITE_ENABLED.key -> "false") {
      val df = spark.range(
          start = 0, end = numRowsTarget, step = 1, numPartitions = numFiles)
        .withColumn("last_modified_version", lit(lastModifiedVersion))
        .withColumn("partition", (col("id") / (numRowsTarget / 3)).cast("int"))
        .write.format("delta")
      if (isPartitioned) {
        df.partitionBy("partition").saveAsTable(tableName)
      } else {
        df.saveAsTable(tableName)
      }
      val (_, snapshot) = DeltaLog.forTableWithSnapshot(spark, TableIdentifier(tableName))
      assert(snapshot.allFiles.count() === numFiles)
    }
  }

  protected def withRowIdTestTable(isPartitioned: Boolean)(f: => Unit): Unit = {
    withTable(targetTableName) {
      writeTestTable(targetTableName, isPartitioned)
      f
    }
  }

  protected def checkAndExecuteUpdate(
      tableName: String, condition: Option[String], newVersion: Long = 1L): Unit = {
    val expectedRowIds =
      spark.read.table(tableName).select("id", RowId.QUALIFIED_COLUMN_NAME).collect()

    val log = DeltaLog.forTable(spark, TableIdentifier(targetTableName))
    checkRowTrackingMarkedAsPreservedForCommit(log) {
      checkFileActionInvariantBeforeAndAfterOperation(log) {
        executeUpdate(tableName, condition, newVersion)
      }
    }

    val actualRowIds = spark.read.table(tableName).select("id", RowId.QUALIFIED_COLUMN_NAME)
    checkAnswer(actualRowIds, expectedRowIds)
    assertRowIdsAreValid(log)

    val actualRowCommitVersions =
      spark.read.table(tableName).select("id", RowCommitVersion.QUALIFIED_COLUMN_NAME)
    val expectedRowCommitVersions =
      spark.read.table(tableName).select("id", "last_modified_version").collect()
    checkAnswer(actualRowCommitVersions, expectedRowCommitVersions)
  }

  protected def executeUpdate(tableName: String, where: Option[String], newVersion: Long): Unit = {
    val whereClause = where.map(c => s"WHERE $c").getOrElse("")
    sql(s"""UPDATE $tableName as t
           |SET last_modified_version = $newVersion
           |$whereClause""".stripMargin)
  }
}

trait RowTrackingUpdateCommonTests extends RowTrackingUpdateSuiteBase {

  for {
    isPartitioned <- BOOLEAN_DOMAIN
    whereClause <- Seq(
      Some(s"id < ${(numFiles / 2) * numRowsPerFile}"), // 50% of files match
      Some(s"id < ${numRowsPerFile / 2}"), // One file matches
      None // No condition, 100% of files match
    )
  } {
    test(s"Preserves row IDs, whereClause = $whereClause, isPartitioned = $isPartitioned") {
      withRowIdTestTable(isPartitioned = isPartitioned) {
        checkAndExecuteUpdate(tableName = targetTableName, condition = whereClause)
      }
    }
  }

  for (isPartitioned <- BOOLEAN_DOMAIN) {
    test(s"Preserves row IDs across multiple updates, isPartitioned = $isPartitioned") {
      // Use the partitioning under test rather than hardcoding it.
      withRowIdTestTable(isPartitioned = isPartitioned) {
        checkAndExecuteUpdate(targetTableName, condition = Some("id % 20 = 0"))

        checkAndExecuteUpdate(targetTableName, condition = Some("id % 10 = 0"), newVersion = 2L)
      }
    }
  }

  test("Preserves row IDs in update on partition column, whole file update") {
    withRowIdTestTable(isPartitioned = true) {
      checkAndExecuteUpdate(tableName = targetTableName, condition = Some("partition = 0"))
    }
  }

  test("Preserves row IDs on unpartitioned table with optimized writes") {
    withRowIdTestTable(isPartitioned = false) {
      val whereClause = Some(s"id = 0 OR id = $numRowsTarget - 1")
      withSQLConf(DeltaSQLConf.DELTA_OPTIMIZE_WRITE_ENABLED.key -> "true") {
        checkAndExecuteUpdate(targetTableName, condition = whereClause)
      }

      val (_, snapshot) = DeltaLog.forTableWithSnapshot(spark, TableIdentifier(targetTableName))

      // With DVs, the update adds one new file; without DVs, optimized writes
      // coalesce the two rewritten files into one.
      val expectedNumFiles = if (dvsEnabled) numFiles + 1 else numFiles - 1
      assert(snapshot.allFiles.count() === expectedNumFiles)
    }
  }

  test("Row tracking marked as not preserved when row tracking disabled") {
    withRowTrackingEnabled(enabled = false) {
      withRowIdTestTable(isPartitioned = false) {
        val log = DeltaLog.forTable(spark, TableIdentifier(targetTableName))
        assert(
          !rowTrackingMarkedAsPreservedForCommit(log)(executeUpdate(
            targetTableName, where = None, newVersion = -1L)))
      }
    }
  }

  test("Preserving Row Tracking - Subqueries are not supported in UPDATE") {
    withRowTrackingEnabled(enabled = true) {
      withRowIdTestTable(isPartitioned = false) {
        val ex = intercept[AnalysisException] {
          checkAndExecuteUpdate(
            tableName = targetTableName,
            condition = Some(
              s"""id in (SELECT id FROM $targetTableName s
                 WHERE s.id = 0 OR s.id = $numRowsPerFile)"""))
        }.getMessage
        assert(ex.contains("Subqueries are not supported in the UPDATE"))
      }
    }
  }
}

trait RowTrackingUpdateDVTests extends RowTrackingUpdateSuiteBase
  with DeletionVectorsTestUtils {

  override protected def dvsEnabled: Boolean = true
}

trait RowTrackingCDFTests extends RowTrackingUpdateSuiteBase {

  override protected def sparkConf: SparkConf = {
    super.sparkConf.set(DeltaConfigs.CHANGE_DATA_FEED.defaultTablePropertyKey, "true")
  }
}

class RowTrackingUpdateSuite extends RowTrackingUpdateCommonTests

class RowTrackingUpdateCDFSuite extends RowTrackingUpdateCommonTests with RowTrackingCDFTests

class RowTrackingUpdateDVSuite extends RowTrackingUpdateCommonTests
  with RowTrackingUpdateDVTests

class RowTrackingUpdateCDFDVSuite extends RowTrackingUpdateCommonTests
  with RowTrackingUpdateDVTests with RowTrackingCDFTests

class RowTrackingUpdateIdColumnMappingSuite extends RowTrackingUpdateCommonTests
  with DeltaColumnMappingEnableIdMode

class RowTrackingUpdateNameColumnMappingSuite extends RowTrackingUpdateCommonTests
  with DeltaColumnMappingEnableNameMode

class RowTrackingUpdateCDFDVIdColumnMappingSuite extends RowTrackingUpdateCommonTests
  with RowTrackingCDFTests with RowTrackingUpdateDVTests with DeltaColumnMappingEnableIdMode

class RowTrackingUpdateCDFDVNameColumnMappingSuite extends RowTrackingUpdateCommonTests
  with RowTrackingCDFTests with RowTrackingUpdateDVTests with DeltaColumnMappingEnableNameMode

class RowTrackingUpdateCDFIdColumnMappingSuite extends RowTrackingUpdateCommonTests
  with RowTrackingCDFTests with DeltaColumnMappingEnableIdMode

class RowTrackingUpdateCDFNameColumnMappingSuite extends RowTrackingUpdateCommonTests
  with RowTrackingCDFTests with DeltaColumnMappingEnableNameMode

// Base trait for UPDATE tests that will run post-merge only
trait UpdateWithRowTrackingTests extends UpdateSQLSuite with RowTrackingTestUtils {
  override protected def sparkConf: SparkConf = super.sparkConf
    .set(DeltaConfigs.ROW_TRACKING_ENABLED.defaultTablePropertyKey, "true")

  override def excluded: Seq[String] = super.excluded ++
    Seq(
      // TODO: UPDATE on views can't find metadata column
      "test update on temp view - view with too many internal aliases - Dataset TempView",
      "test update on temp view - view with too many internal aliases - SQL TempView",
      "test update on temp view - view with too many internal aliases " +
        "with write amplification reduction - Dataset TempView",
      "test update on temp view - view with too many internal aliases " +
        "with write amplification reduction - SQL TempView",
      "test update on temp view - basic - Partition=true - SQL TempView",
      "test update on temp view - basic - Partition=false - SQL TempView",
      "test update on temp view - superset cols - Dataset TempView",
      "test update on temp view - superset cols - SQL TempView",
      "test update on temp view - nontrivial projection - Dataset TempView",
      "test update on temp view - nontrivial projection - SQL TempView",
      "test update on temp view - nontrivial projection " +
        "with write amplification reduction - Dataset TempView",
      "test update on temp view - nontrivial projection " +
        "with write amplification reduction - SQL TempView",
      "update a SQL temp view",
      // Checks file size written out
      "usage metrics"
    )
}

// UPDATE + row tracking
class UpdateWithRowTrackingSuite extends UpdateSQLSuite with UpdateWithRowTrackingTests {
  override protected def sparkConf: SparkConf = super.sparkConf
    .set(DeltaConfigs.ENABLE_DELETION_VECTORS_CREATION.defaultTablePropertyKey, "false")
}

// UPDATE + CDC + row tracking
class UpdateWithRowTrackingCDCSuite extends UpdateCDCSuite with UpdateWithRowTrackingTests {
  override protected def sparkConf: SparkConf = super.sparkConf
    .set(DeltaConfigs.ENABLE_DELETION_VECTORS_CREATION.defaultTablePropertyKey, "false")
}

// Tests with only the table feature enabled. Should not break any tests, unless row count stats
// are missing.
trait UpdateWithRowTrackingTableFeatureTests extends UpdateSQLSuite with RowTrackingTestUtils {
  override protected def sparkConf: SparkConf = super.sparkConf
    .set(DeltaConfigs.ROW_TRACKING_ENABLED.defaultTablePropertyKey, "false")
    .set(defaultRowTrackingFeatureProperty, "supported")
}

// UPDATE + row tracking table feature
class UpdateWithRowTrackingTableFeatureSuite
  extends UpdateSQLSuite
  with UpdateWithRowTrackingTableFeatureTests {
  override protected def sparkConf: SparkConf = super.sparkConf
    .set(DeltaConfigs.ENABLE_DELETION_VECTORS_CREATION.defaultTablePropertyKey, "false")
}

// UPDATE + CDC + row tracking table feature
class UpdateWithRowTrackingTableFeatureCDCSuite
  extends UpdateCDCSuite
  with UpdateWithRowTrackingTableFeatureTests {
  override protected def sparkConf: SparkConf = super.sparkConf
    .set(DeltaConfigs.ENABLE_DELETION_VECTORS_CREATION.defaultTablePropertyKey, "false")
}
