Skip to content

Commit

Permalink
fix: properly deserialize percent-encoded file paths of Remove action…
Browse files Browse the repository at this point in the history
…s, to make sure tombstone and file paths match (delta-io#2035)

# Description
Percent-encoded file paths of Remove actions were not properly
deserialized, and when compared to active file paths, the paths didn't
match, which caused tombstones to be recognized as active files (be kept
in the state)

# Related Issue(s)
<!---
For example:

- closes delta-io#106
--->

# Documentation

<!---
Share links to useful documentation
--->

---------

Co-authored-by: Igor Borodin <igborodi@microsoft.com>
Co-authored-by: Ion Koutsouris <15728914+ion-elgreco@users.noreply.github.com>
Co-authored-by: R. Tyler Croy <rtyler@brokenco.de>
  • Loading branch information
4 people authored and RobinLin666 committed Feb 2, 2024
1 parent f0ca6cf commit 92c6ea0
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 0 deletions.
1 change: 1 addition & 0 deletions crates/deltalake-core/src/kernel/actions/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ pub struct Remove {
/// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path.
///
/// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt
#[serde(with = "serde_path")]
pub path: String,

/// When `false` the logical file must already be present in the table or the records
Expand Down
32 changes: 32 additions & 0 deletions crates/deltalake-core/src/table/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -363,9 +363,11 @@ impl DeltaTableState {

#[cfg(test)]
mod tests {

use super::*;
use crate::kernel::Txn;
use pretty_assertions::assert_eq;
use serde_json::json;

#[test]
fn state_round_trip() {
Expand Down Expand Up @@ -414,4 +416,34 @@ mod tests {
assert_eq!(2, *state.app_transaction_version().get("abc").unwrap());
assert_eq!(1, *state.app_transaction_version().get("xyz").unwrap());
}

#[test]
fn test_merging_deserialized_special_tombstones_and_files_paths() {
let add = serde_json::from_value(json!({
"path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet",
"partitionValues": {"x": "A/A"},
"size": 460,
"modificationTime": 1631873480,
"dataChange": true
}))
.unwrap();

let remove = serde_json::from_value(json!({
"path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet",
"deletionTimestamp": 1631873481,
"partitionValues": {"x": "A/A"},
"size": 460,
"modificationTime": 1631873481,
"dataChange": true
}))
.unwrap();

let state = DeltaTableState::from_actions(vec![Action::Add(add)], 0).unwrap();
let state_next = DeltaTableState::from_actions(vec![Action::Remove(remove)], 1).unwrap();

let mut merged_state = state.clone();
merged_state.merge(state_next, true, true);

assert_eq!(merged_state.files().len(), 0);
}
}

0 comments on commit 92c6ea0

Please sign in to comment.