From 4ead1f81edf5971426334c67d4a0369e25f4d5f0 Mon Sep 17 00:00:00 2001 From: Igor Borodin Date: Wed, 13 Dec 2023 17:58:34 +0200 Subject: [PATCH 1/2] fix: properly deserialize percent-encoded file paths of Remove actions, to make sure tombstone and file paths match --- crates/deltalake-core/src/kernel/actions/types.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/deltalake-core/src/kernel/actions/types.rs b/crates/deltalake-core/src/kernel/actions/types.rs index f64a5caa08..bb64393ca4 100644 --- a/crates/deltalake-core/src/kernel/actions/types.rs +++ b/crates/deltalake-core/src/kernel/actions/types.rs @@ -640,6 +640,7 @@ pub struct Remove { /// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path. /// /// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt + #[serde(with = "serde_path")] pub path: String, /// When `false` the logical file must already be present in the table or the records From fb18711aec7681e72431c2488285aa1bfbd93b09 Mon Sep 17 00:00:00 2001 From: Igor Borodin Date: Sun, 7 Jan 2024 11:14:59 +0200 Subject: [PATCH 2/2] Add test for merging JSON-deserialized actions with special paths --- crates/deltalake-core/src/table/state.rs | 32 ++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/crates/deltalake-core/src/table/state.rs b/crates/deltalake-core/src/table/state.rs index fa9078997c..f08bc181f6 100644 --- a/crates/deltalake-core/src/table/state.rs +++ b/crates/deltalake-core/src/table/state.rs @@ -387,9 +387,11 @@ impl DeltaTableState { #[cfg(test)] mod tests { + use super::*; use crate::kernel::Txn; use pretty_assertions::assert_eq; + use serde_json::json; #[test] fn state_round_trip() { @@ -438,4 +440,34 @@ mod tests { assert_eq!(2, *state.app_transaction_version().get("abc").unwrap()); assert_eq!(1, *state.app_transaction_version().get("xyz").unwrap()); } + + #[test] + fn test_merging_deserialized_special_tombstones_and_files_paths() { + let add = 
serde_json::from_value(json!({ + "path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet", + "partitionValues": {"x": "A/A"}, + "size": 460, + "modificationTime": 1631873480, + "dataChange": true + })) + .unwrap(); + + let remove = serde_json::from_value(json!({ + "path": "x=A%252FA/part-00016-94175338-2acc-40c2-a68a-d08ba677975f.c000.snappy.parquet", + "deletionTimestamp": 1631873481, + "partitionValues": {"x": "A/A"}, + "size": 460, + "modificationTime": 1631873481, + "dataChange": true + })) + .unwrap(); + + let state = DeltaTableState::from_actions(vec![Action::Add(add)], 0).unwrap(); + let state_next = DeltaTableState::from_actions(vec![Action::Remove(remove)], 1).unwrap(); + + let mut merged_state = state.clone(); + merged_state.merge(state_next, true, true); + + assert_eq!(merged_state.files().len(), 0); + } }