From 2f7d3c8ff3101a497ca8c45c7cdeb10703e1b774 Mon Sep 17 00:00:00 2001 From: xianwill Date: Wed, 2 Jun 2021 15:23:47 -0400 Subject: [PATCH 01/20] Add checkpoint writer (wip) --- Cargo.lock | 59 ++-- rust/src/action.rs | 11 +- rust/src/delta.rs | 317 +++++++++++++++++- rust/src/lib.rs | 2 +- rust/src/schema.rs | 283 +++++++++++++++- .../data/checkpoints/_delta_log/.gitignore | 3 + .../_delta_log/00000000000000000000.json | 4 + .../_delta_log/00000000000000000001.json | 2 + .../_delta_log/00000000000000000002.json | 2 + .../_delta_log/00000000000000000003.json | 2 + .../_delta_log/00000000000000000004.json | 2 + .../_delta_log/00000000000000000005.json | 2 + .../_delta_log/00000000000000000006.json | 2 + .../_delta_log/00000000000000000007.json | 2 + .../_delta_log/00000000000000000008.json | 2 + .../_delta_log/00000000000000000009.json | 2 + .../_delta_log/00000000000000000010.json | 2 + rust/tests/write_checkpoints.rs | 72 ++++ 18 files changed, 730 insertions(+), 41 deletions(-) create mode 100644 rust/tests/data/checkpoints/_delta_log/.gitignore create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000000.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000001.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000002.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000003.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000004.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000005.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000006.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000007.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000008.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000009.json create mode 100644 rust/tests/data/checkpoints/_delta_log/00000000000000000010.json create mode 100644 rust/tests/write_checkpoints.rs diff --git a/Cargo.lock b/Cargo.lock index d089d80016..f1719e359e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -96,9 +96,9 @@ checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" [[package]] name = "arrow" -version = "4.0.0" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "014c7490f839d9dd4ce28f8232d4731f8b1fb93fa477d69e6fe881814ac2b6bb" +checksum = "93811be1c0f60f4b29d80b34dad4e59fdc397a9e580f849df9e2635701498663" dependencies = [ "cfg_aliases", "chrono", @@ -108,6 +108,7 @@ dependencies = [ "indexmap", "lazy_static", "lexical-core", + "multiversion", "num", "prettytable-rs", "rand 0.7.3", @@ -1438,6 +1439,26 @@ dependencies = [ "winapi", ] +[[package]] +name = "multiversion" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" +dependencies = [ + "multiversion-macros", +] + +[[package]] +name = "multiversion-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "native-tls" version = "0.2.7" @@ -1479,9 +1500,9 @@ dependencies = [ [[package]] name = "num" -version = "0.3.1" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8b7a8e9be5e039e2ff869df49155f1c06bd01ade2117ec783e56ab0932b67a8f" +checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" dependencies = [ "num-bigint", "num-complex", @@ -1493,9 +1514,9 @@ dependencies = [ [[package]] name = "num-bigint" -version = "0.3.2" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d0a3d5e207573f948a9e5376662aa743a2ea13f7c50a554d7af443a73fbfeba" +checksum = "4e0d047c1062aa51e256408c560894e5251f08925980e53cf1aa5bd00eec6512" dependencies = [ "autocfg", "num-integer", @@ -1504,9 +1525,9 @@ dependencies = [ [[package]] name = "num-complex" -version = "0.3.1" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747d632c0c558b87dbabbe6a82f3b4ae03720d0646ac5b7b4dae89394be5f2c5" +checksum = "26873667bbbb7c5182d4a37c1add32cdf09f841af72da53318fdb81543c15085" dependencies = [ "num-traits", ] @@ -1534,9 +1555,9 @@ dependencies = [ [[package]] name = "num-rational" -version = "0.3.2" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12ac428b1cb17fce6f731001d307d351ec70a6d202fc2e60f7d4c5e42d8f4f07" +checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" dependencies = [ "autocfg", "num-bigint", @@ -1702,12 +1723,12 @@ dependencies = [ [[package]] name = "parquet" -version = "4.0.0" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32858ae16bd61fda406be4b76af617d2f632fed4f0093810245aa4f5316ac865" +checksum = "9275a7f8eab04e6ab6918b4fdd50e00aeba3c288e0f91bdc5da87a2c8ff288a6" dependencies = [ "arrow", - "base64 0.12.3", + "base64 0.13.0", "brotli", "byteorder", "chrono", @@ -3268,18 +3289,18 @@ checksum = "81a974bcdd357f0dca4d41677db03436324d45a4c9ed2d0b873a5a360ce41c36" [[package]] name = "zstd" -version = "0.7.0+zstd.1.4.9" +version = "0.8.2+zstd.1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9428752481d8372e15b1bf779ea518a179ad6c771cca2d2c60e4fbff3cc2cd52" +checksum = "c83508bcbbdc9c3abcf77e8e56773d3ffcd2479e0933caab2e7d6b5a9e183aae" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "3.1.0+zstd.1.4.9" +version = "4.1.0+zstd.1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5aa1926623ad7fe406e090555387daf73db555b948134b4d73eac5eb08fb666d" +checksum = "d30375f78e185ca4c91930f42ea2c0162f9aa29737032501f93b79266d985ae7" dependencies = [ "libc", "zstd-sys", @@ -3287,9 +3308,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "1.5.0+zstd.1.4.9" +version = "1.6.0+zstd.1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e6c094340240369025fc6b731b054ee2a834328fa584310ac96aa4baebdc465" +checksum = "2141bed8922b427761470e6bbfeff255da94fa20b0bbeab0d9297fcaf71e3aa7" dependencies = [ "cc", "libc", diff --git a/rust/src/action.rs b/rust/src/action.rs index 73a7715d3d..d8bae98ba3 100644 --- a/rust/src/action.rs +++ b/rust/src/action.rs @@ -441,6 +441,7 @@ impl MetaData { let configuration_map = record .get_map(i) .map_err(|_| gen_action_type_error("metaData", "configuration", "map"))?; + re.configuration = HashMap::new(); populate_hashmap_from_parquet_map(&mut re.configuration, configuration_map) .map_err(|estr| { ActionError::InvalidField(format!( @@ -612,7 +613,7 @@ pub struct Txn { /// An application-specific numeric identifier for this transaction. 
pub version: DeltaDataTypeVersion, /// The time when this transaction action was created in milliseconds since the Unix epoch. - pub last_updated: DeltaDataTypeTimestamp, + pub last_updated: Option, } impl Txn { @@ -635,9 +636,11 @@ impl Txn { .map_err(|_| gen_action_type_error("txn", "version", "long"))?; } "lastUpdated" => { - re.last_updated = record - .get_long(i) - .map_err(|_| gen_action_type_error("txn", "lastUpdated", "long"))?; + re.last_updated = Some( + record + .get_long(i) + .map_err(|_| gen_action_type_error("txn", "lastUpdated", "long"))?, + ); } _ => { log::warn!( diff --git a/rust/src/delta.rs b/rust/src/delta.rs index 4e558bff38..8b1e36cef1 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -8,11 +8,14 @@ use std::fmt; use std::io::{BufRead, BufReader, Cursor}; use arrow::error::ArrowError; +use arrow::json::reader::ReaderBuilder; use chrono::{DateTime, FixedOffset, Utc}; use futures::StreamExt; use lazy_static::lazy_static; -use log::debug; +use log::*; +use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; +use parquet::file::writer::InMemoryWriteableCursor; use parquet::file::{ reader::{FileReader, SerializedFileReader}, serialized_reader::SliceableCursor, @@ -21,6 +24,7 @@ use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::Value; use std::convert::TryFrom; +use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use super::action; @@ -196,6 +200,40 @@ impl fmt::Display for DeltaTableMetaData { } } +impl TryFrom<&action::MetaData> for DeltaTableMetaData { + type Error = serde_json::error::Error; + + fn try_from(action_metadata: &action::MetaData) -> Result { + Ok(Self { + id: action_metadata.id.clone(), + name: action_metadata.name.clone(), + description: action_metadata.description.clone(), + format: action_metadata.format.clone(), + schema: action_metadata.get_schema()?, + partition_columns: action_metadata.partition_columns.clone(), + created_time: action_metadata.created_time, + configuration: action_metadata.configuration.clone(), + }) + } +} + +impl TryFrom<&DeltaTableMetaData> for action::MetaData { + type Error = serde_json::error::Error; + + fn try_from(metadata: &DeltaTableMetaData) -> Result { + Ok(Self { + id: metadata.id.clone(), + name: metadata.name.clone(), + description: metadata.description.clone(), + format: metadata.format.clone(), + schema_string: serde_json::to_string(&metadata.schema)?, + partition_columns: metadata.partition_columns.clone(), + created_time: metadata.created_time, + configuration: metadata.configuration.clone(), + }) + } +} + /// Error related to Delta log application #[derive(thiserror::Error, Debug)] pub enum ApplyLogError { @@ -263,8 +301,9 @@ impl From for LoadCheckpointError { } } -#[derive(Default, Debug)] -struct DeltaTableState { +/// State snapshot currently held by the Delta Table instance. +#[derive(Default, Debug, Clone)] +pub struct DeltaTableState { // A remove action should remain in the state of the table as a tombstone until it has expired. // A tombstone expires when the creation timestamp of the delta file exceeds the expiration tombstones: Vec, @@ -276,6 +315,61 @@ struct DeltaTableState { current_metadata: Option, } +impl DeltaTableState { + /// Creates a new instance of DeltaTableState from the supplied components. 
+ pub fn new( + tombstones: Vec, + files: Vec, + commit_infos: Vec, + app_transaction_version: HashMap, + min_reader_version: i32, + min_writer_version: i32, + current_metadata: Option, + ) -> Self { + Self { + tombstones, + files, + commit_infos, + app_transaction_version, + min_reader_version, + min_writer_version, + current_metadata, + } + } + + /// Full list of tombstones (remove actions) representing files removed from table state). + pub fn tombstones(&self) -> &Vec { + self.tombstones.as_ref() + } + + /// Full list of add actions representing all parquet files that are part of the current + /// delta table state. + pub fn files(&self) -> &Vec { + self.files.as_ref() + } + + /// HashMap containing the last txn version stored for every app id writing txn + /// actions. + pub fn app_transaction_version(&self) -> &HashMap { + &self.app_transaction_version + } + + /// The min reader version required by the protocol. + pub fn min_reader_version(&self) -> i32 { + self.min_reader_version + } + + /// The min writer version required by the protocol. + pub fn min_writer_version(&self) -> i32 { + self.min_writer_version + } + + /// The most recent metadata of the table. + pub fn current_metadata(&self) -> Option<&DeltaTableMetaData> { + self.current_metadata.as_ref() + } +} + /// In memory representation of a Delta Table pub struct DeltaTable { /// The version of the table as of the most recent loaded Delta log entry. @@ -419,9 +513,11 @@ impl DeltaTable { async fn restore_checkpoint(&mut self, check_point: CheckPoint) -> Result<(), DeltaTableError> { let checkpoint_data_paths = self.get_checkpoint_data_paths(&check_point); + println!("{:?}", checkpoint_data_paths); // process actions from checkpoint self.state = DeltaTableState::default(); for f in &checkpoint_data_paths { + println!("{:?}", f); let obj = self.storage.get_obj(&f).await?; let preader = SerializedFileReader::new(SliceableCursor::new(obj))?; let schema = preader.metadata().file_metadata().schema(); @@ -486,7 +582,9 @@ impl DeltaTable { match self.get_last_checkpoint().await { Ok(last_check_point) => { self.last_check_point = Some(last_check_point); + println!("Restoring checkpoint {:?}", last_check_point); self.restore_checkpoint(last_check_point).await?; + println!("Checkpoint restored {}", last_check_point.version); self.version = last_check_point.version + 1; } Err(LoadCheckpointError::NotFound) => { @@ -690,6 +788,11 @@ impl DeltaTable { .collect() } + /// Returns the currently loaded state snapshot. + pub fn get_state(&self) -> &DeltaTableState { + &self.state + } + /// Returns the metadata associated with the loaded state. pub fn get_metadata(&self) -> Result<&DeltaTableMetaData, DeltaTableError> { self.state @@ -1296,6 +1399,199 @@ fn log_entry_from_actions(actions: &[Action]) -> Result, +} + +impl CheckPointWriter { + /// Creates a new CheckPointWriter. + pub fn new(table_path: &str, storage: Box) -> Self { + let delta_log_path = storage.join_path(table_path, "_delta_log"); + let last_checkpoint_path = storage.join_path(delta_log_path.as_str(), "_last_checkpoint"); + + Self { + delta_log_path, + last_checkpoint_path, + storage, + } + } + + /// Creates a new checkpoint at the specified version from the given DeltaTableState. + pub async fn create_checkpoint_from_state( + &self, + version: DeltaDataTypeVersion, + state: &DeltaTableState, + ) -> Result<(), CheckPointWriterError> { + // TODO: checkpoints _can_ be multi-part... 
haven't actually found a good reference for + // an appropriate split point yet though so only writing a single part currently. + + info!("Writing parquet bytes to checkpoint buffer."); + let parquet_bytes = parquet_bytes_from_state(state)?; + + let size = parquet_bytes.len() as i64; + + let checkpoint = CheckPoint { + version, + size, + parts: None, + }; + + let file_name = format!("{:020}.parquet", version); + let checkpoint_path = self.storage.join_path(&self.delta_log_path, &file_name); + + info!("Writing checkpoint to {:?}.", checkpoint_path); + self.storage + .put_obj(&checkpoint_path, &parquet_bytes) + .await?; + + let last_checkpoint_content: serde_json::Value = serde_json::to_value(&checkpoint)?; + let last_checkpoint_content = serde_json::to_string(&last_checkpoint_content)?; + + info!( + "Writing _last_checkpoint to {:?}.", + self.last_checkpoint_path + ); + self.storage + .put_obj( + self.last_checkpoint_path.as_str(), + last_checkpoint_content.as_bytes(), + ) + .await?; + + Ok(()) + } +} + +fn parquet_bytes_from_state(state: &DeltaTableState) -> Result, CheckPointWriterError> { + let mut json_buffer: Vec = Vec::new(); + + let protocol = action::Action::protocol(action::Protocol { + min_reader_version: state.min_reader_version(), + min_writer_version: state.min_writer_version(), + }); + + extend_json_byte_buffer(&mut json_buffer, &protocol)?; + + let metadata = state + .current_metadata() + .ok_or_else(|| CheckPointWriterError::MissingMetaData)?; + let metadata = action::Action::metaData(action::MetaData::try_from(metadata)?); + extend_json_byte_buffer(&mut json_buffer, &metadata)?; + + for add in state.files() { + let add = action::Action::add(add.clone()); + extend_json_byte_buffer(&mut json_buffer, &add)?; + } + + for remove in state.tombstones() { + let remove = action::Action::remove(remove.clone()); + extend_json_byte_buffer(&mut json_buffer, &remove)?; + } + + for (app_id, version) in state.app_transaction_version().iter() { + let txn = action::Action::txn(action::Txn { + app_id: app_id.clone(), + version: *version, + last_updated: None, + }); + extend_json_byte_buffer(&mut json_buffer, &txn)?; + } + + let arrow_schema = delta_log_arrow_schema()?; + let arrow_schema = Arc::new(arrow_schema); + + let cursor = Cursor::new(json_buffer); + + let mut json_reader = ReaderBuilder::new() + .with_schema(arrow_schema.clone()) + .build(cursor)?; + + debug!("Preparing checkpoint parquet buffer."); + + let writeable_cursor = InMemoryWriteableCursor::default(); + let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; + + debug!("Writing to checkpoint parquet buffer..."); + + while let Some(batch) = json_reader.next()? 
{ + writer.write(&batch)?; + } + + let _ = writer.close()?; + + debug!("Finsihed writing checkpoint file."); + + Ok(writeable_cursor.data()) +} + +fn extend_json_byte_buffer( + json_byte_buffer: &mut Vec, + json_value: &T, +) -> Result<(), serde_json::error::Error> +where + T: ?Sized + Serialize, +{ + json_byte_buffer.extend(serde_json::to_vec(json_value)?); + json_byte_buffer.push(b'\n'); + + Ok(()) +} + fn process_action( state: &mut DeltaTableState, action: &Action, @@ -1313,16 +1609,7 @@ fn process_action( state.min_writer_version = v.min_writer_version; } Action::metaData(v) => { - state.current_metadata = Some(DeltaTableMetaData { - id: v.id.clone(), - name: v.name.clone(), - description: v.description.clone(), - format: v.format.clone(), - schema: v.get_schema()?, - partition_columns: v.partition_columns.clone(), - created_time: v.created_time, - configuration: v.configuration.clone(), - }); + state.current_metadata = Some(DeltaTableMetaData::try_from(v)?); } Action::txn(v) => { *state @@ -1373,7 +1660,7 @@ pub async fn open_table_with_ds(table_path: &str, ds: &str) -> Result &'static str { env!("CARGO_PKG_VERSION") } @@ -1404,7 +1691,7 @@ mod tests { let txn_action = Action::txn(action::Txn { app_id: "abc".to_string(), version: 2, - last_updated: 0, + last_updated: Some(0), }); let _ = process_action(&mut state, &txn_action).unwrap(); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 226306fc7d..b4d5fb7507 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -32,7 +32,7 @@ //! "2020-05-02T23:47:31-07:00", //! ).await.unwrap(); //! let files = table.get_files(); -//! }; +//! } ; //! ``` //! //! # Optional cargo package features diff --git a/rust/src/schema.rs b/rust/src/schema.rs index 0a71ecdce1..db5da3b4dc 100644 --- a/rust/src/schema.rs +++ b/rust/src/schema.rs @@ -1,8 +1,12 @@ #![allow(non_snake_case, non_camel_case_types)] -use std::collections::HashMap; - +use arrow::datatypes::Schema as ArrowSchema; +use arrow::error::ArrowError; +use parquet::errors::ParquetError; use serde::{Deserialize, Serialize}; +use serde_json::{json, Value}; +use std::collections::HashMap; +use std::convert::TryFrom; /// Type alias for a string expected to match a GUID/UUID format pub type Guid = String; @@ -160,3 +164,278 @@ impl Schema { &self.fields } } + +/// Error representing a failure while training to create the delta log schema. +#[derive(thiserror::Error, Debug)] +pub enum DeltaLogSchemaError { + /// Error returned when reading the checkpoint failed. + #[error("Failed to read checkpoint: {}", .source)] + ParquetError { + /// Parquet error details returned when reading the checkpoint failed. + #[from] + source: ParquetError, + }, + /// Error returned when converting the schema in Arrow format failed. + #[error("Failed to convert into Arrow schema: {}", .source)] + ArrowError { + /// Arrow error details returned when converting the schema in Arrow format failed + #[from] + source: ArrowError, + }, + /// Passthrough error returned by serde_json. + #[error("serde_json::Error: {source}")] + JSONSerialization { + /// The source serde_json::Error. 
+ #[from] + source: serde_json::Error, + }, +} + +pub(crate) fn delta_log_arrow_schema() -> Result { + let delta_schema = delta_log_schema()?; + let arrow_schema: ArrowSchema = >::try_from(&delta_schema)?; + + Ok(arrow_schema) +} + +pub(crate) fn delta_log_schema() -> Result { + let field_map = delta_log_json_fields(); + + // TODO: receive a table schema parameter and merge into add.stats_parsed in the delta log schema + // TODO: also merge partition column schema fields under add.partitionValues_parsed + // Skipping this for now until I can get the maps to work. + + let json_fields: Vec = field_map.values().map(|v| v.to_owned()).collect(); + let mut json_schema = serde_json::Map::new(); + json_schema.insert("type".to_string(), Value::String("struct".to_string())); + json_schema.insert("fields".to_string(), Value::Array(json_fields)); + let json_schema = Value::Object(json_schema); + + let delta_schema: Schema = serde_json::from_value(json_schema)?; + + Ok(delta_schema) +} + +pub(crate) fn delta_log_json_fields() -> HashMap { + // TODO: Missing feature in arrow - string keys are not supported by Arrow Dictionary. + // Example: https://github.com/apache/arrow-rs/blob/master/arrow/src/json/reader.rs#L858-L898 + // There are many other code refs in arrow besides this one that limit dict keys to numeric + // keys. + let meta_data = json!({ + "name": "metaData", + "type": { + "type": "struct", + "fields": [{ + "name": "id", + "type": "string", + "nullable": true, + "metadata": {}, + },{ + "name": "name", + "type": "string", + "nullable": true, + "metadata": {}, + },{ + "name": "description", + "type": "string", + "nullable": true, + "metadata": {}, + },{ + "name": "schemaString", + "type": "string", + "nullable": true, + "metadata": {}, + },{ + "name": "createdTime", + "type": "long", + "nullable": true, + "metadata": {}, + },{ + "name": "partitionColumns", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true, + }, + "nullable": true, + "metadata": {}, + },{ + "name": "format", + "type": { + "type": "struct", + "fields": [{ + "name": "provider", + "type": "string", + "nullable": true, + "metadata": {}, + },/*{ + "name": "options", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true, + }, + "nullable": true, + "metadata": {} + }*/] + }, + "nullable": true, + "metadata": {} + },/*{ + "name": "configuration", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true, + }, + "nullable": true, + "metadata": {} + }*/] + }, + "nullable": true, + "metadata": {} + }); + + let protocol = json!({ + "name": "protocol", + "type": { + "type": "struct", + "fields": [{ + "name": "minReaderVersion", + "type": "integer", + "nullable": true, + "metadata": {}, + },{ + "name": "minWriterVersion", + "type": "integer", + "nullable": true, + "metadata": {}, + }] + }, + "nullable": true, + "metadata": {} + }); + + let txn = json!({ + "name": "txn", + "type": { + "type": "struct", + "fields": [{ + "name": "appId", + "type": "string", + "nullable": true, + "metadata": {}, + },{ + "name": "version", + "type": "long", + "nullable": true, + "metadata": {}, + }] + }, + "nullable": true, + "metadata": {} + }); + + let add = json!({ + "name": "add", + "type": { + "type": "struct", + "fields": [{ + "name": "path", + "type": "string", + "nullable": true, + "metadata": {}, + },{ + "name": "size", + "type": "long", + "nullable": true, + "metadata": {}, + },{ + "name": "modificationTime", + "type": "long", 
+ "nullable": true, + "metadata": {}, + },{ + "name": "dataChange", + "type": "boolean", + "nullable": true, + "metadata": {}, + },{ + "name": "stats", + "type": "string", + "nullable": true, + "metadata": {}, + },/*{ + "name": "partitionValues", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true, + }, + "nullable": true, + "metadata": {}, + }*/] + }, + "nullable": true, + "metadata": {} + }); + + let remove = json!({ + "name": "remove", + "type": { + "type": "struct", + "fields": [{ + "name": "path", + "type": "string", + "nullable": true, + "metadata": {}, + },{ + "name": "size", + "type": "long", + "nullable": true, + "metadata": {}, + },{ + "name": "modificationTime", + "type": "long", + "nullable": true, + "metadata": {}, + },{ + "name": "dataChange", + "type": "boolean", + "nullable": true, + "metadata": {}, + },{ + "name": "stats", + "type": "string", + "nullable": true, + "metadata": {}, + },/*{ + "name": "partitionValues", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true, + }, + "nullable": true, + "metadata": {}, + + }*/], + }, + "nullable": true, + "metadata": {} + }); + + let mut map = HashMap::new(); + + map.insert("metaData".to_string(), meta_data); + map.insert("protocol".to_string(), protocol); + map.insert("txn".to_string(), txn); + map.insert("add".to_string(), add); + map.insert("remove".to_string(), remove); + + map +} diff --git a/rust/tests/data/checkpoints/_delta_log/.gitignore b/rust/tests/data/checkpoints/_delta_log/.gitignore new file mode 100644 index 0000000000..8624856880 --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/.gitignore @@ -0,0 +1,3 @@ +*.parquet +_last_checkpoint + diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000000.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..90f4a993cf --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1615751699523,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"cf3741a3-5f93-434f-99ac-9a4bebcdf06c","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"version\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1615751699422}} +{"add":{"path":"part-00000-3810fbe0-9892-431d-bcfd-7de5788dfe8d-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751699515,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000001.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..66aa3a4d49 --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751700281,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":0,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-1abe25d3-0da6-46c5-98c1-7a69872fd797-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751700275,"dataChange":true}} diff --git 
a/rust/tests/data/checkpoints/_delta_log/00000000000000000002.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..c4430292c4 --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000002.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751701120,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":1,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-136c36f5-639d-4e95-bb0f-15cde3fb14eb-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751701112,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000003.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..e91353d69a --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000003.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751701854,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":2,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-8e7dc8c1-337b-40b8-a411-46d4295da531-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751701848,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000004.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000004.json new file mode 100644 index 0000000000..20fa6e4471 --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000004.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751702764,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":3,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-e93060ad-9c8c-4170-a9da-7c6f53f6406b-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751702758,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000005.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000005.json new file mode 100644 index 0000000000..12a3d009fe --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000005.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751703539,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":4,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-e9c6df9a-e585-4c70-bc1f-de9bd8ae025b-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751703532,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000006.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000006.json new file mode 100644 index 0000000000..97d497924e --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000006.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751704301,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":5,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-7d239c98-d74b-4b02-b3f6-9f256992c633-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751704295,"dataChange":true}} diff --git 
a/rust/tests/data/checkpoints/_delta_log/00000000000000000007.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000007.json new file mode 100644 index 0000000000..f34c112437 --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000007.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751705073,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":6,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-72ecc4d6-2e44-4df4-99e6-23f1ac2b7b7c-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751705065,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000008.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000008.json new file mode 100644 index 0000000000..61314a9df0 --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000008.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751705959,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":7,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-3fa65c69-4e55-4b18-a195-5f1ae583e553-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751705952,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000009.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000009.json new file mode 100644 index 0000000000..4a58463074 --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000009.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751706703,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":8,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-9afd9224-729f-4420-a05e-8032113a6568-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751706698,"dataChange":true}} diff --git a/rust/tests/data/checkpoints/_delta_log/00000000000000000010.json b/rust/tests/data/checkpoints/_delta_log/00000000000000000010.json new file mode 100644 index 0000000000..3f6aaf85df --- /dev/null +++ b/rust/tests/data/checkpoints/_delta_log/00000000000000000010.json @@ -0,0 +1,2 @@ +{"commitInfo":{"timestamp":1615751716705,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"readVersion":9,"isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputBytes":"442","numOutputRows":"1"}}} +{"add":{"path":"part-00000-f0e955c5-a1e3-4eec-834e-dcc098fc9005-c000.snappy.parquet","partitionValues":{},"size":442,"modificationTime":1615751716698,"dataChange":true}} diff --git a/rust/tests/write_checkpoints.rs b/rust/tests/write_checkpoints.rs new file mode 100644 index 0000000000..eecc343919 --- /dev/null +++ b/rust/tests/write_checkpoints.rs @@ -0,0 +1,72 @@ +extern crate deltalake; + +use deltalake::storage; +use deltalake::CheckPointWriter; +use std::fs; +use std::path::{Path, PathBuf}; + +#[tokio::test] +async fn write_simple_checkpoint() { + let table_location = "./tests/data/checkpoints"; + let table_path = PathBuf::from(table_location); + let log_path = table_path.join("_delta_log"); + + // Delete checkpoint files from previous runs + cleanup_checkpoint_files(log_path.as_path()); + + // Load the delta table at version 5 + let table = deltalake::open_table_with_version(table_location, 
5) + .await + .unwrap(); + + // Write a checkpoint + let storage_backend = storage::get_backend_for_uri(table_location).unwrap(); + let checkpoint_writer = CheckPointWriter::new(table_location, storage_backend); + let _ = checkpoint_writer + .create_checkpoint_from_state(table.version, table.get_state()) + .await + .unwrap(); + + // checkpoint should exist + let checkpoint_path = log_path.join("00000000000000000005.parquet"); + assert!(checkpoint_path.as_path().exists()); + + // _last_checkpoint should exist + let last_checkpoint_path = log_path.join("_last_checkpoint"); + assert!(last_checkpoint_path.as_path().exists()); + + // _last_checkpoint should point to checkpoint + let last_checkpoint_content = fs::read_to_string(last_checkpoint_path.as_path()).unwrap(); + let last_checkpoint_content: serde_json::Value = + serde_json::from_str(last_checkpoint_content.as_str()).unwrap(); + + println!("{:?}", last_checkpoint_content); + + // delta table should load just fine with the checkpoint in place + let table_result = deltalake::open_table(table_location).await.unwrap(); + let table = table_result; + let files = table.get_files(); + println!("{:?}", files); +} + +fn cleanup_checkpoint_files(log_path: &Path) { + let paths = fs::read_dir(log_path).unwrap(); + + for p in paths { + match p { + Ok(d) => { + let path = d.path(); + + println!("Checking path {:?}", path); + + if path.file_name().unwrap() == "_last_checkpoint" + || path.extension().unwrap() == "parquet" + { + println!("Deleting {:?}", path); + fs::remove_file(path).unwrap(); + } + } + _ => {} + } + } +} From c7caaec85cdf2299dd0a7bdc489b4f3f0a6ab709 Mon Sep 17 00:00:00 2001 From: xianwill Date: Wed, 2 Jun 2021 15:52:29 -0400 Subject: [PATCH 02/20] Remove unnecessary code line --- rust/src/action.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/rust/src/action.rs b/rust/src/action.rs index d8bae98ba3..3b50b8b76c 100644 --- a/rust/src/action.rs +++ b/rust/src/action.rs @@ -441,7 +441,6 @@ impl MetaData { let configuration_map = record .get_map(i) .map_err(|_| gen_action_type_error("metaData", "configuration", "map"))?; - re.configuration = HashMap::new(); populate_hashmap_from_parquet_map(&mut re.configuration, configuration_map) .map_err(|estr| { ActionError::InvalidField(format!( From 5c0186de2780757c225e14402464c47fc2ef0b5f Mon Sep 17 00:00:00 2001 From: xianwill Date: Wed, 2 Jun 2021 18:13:39 -0400 Subject: [PATCH 03/20] fix checkpoint file name --- rust/src/delta.rs | 7 ++++--- rust/tests/write_checkpoints.rs | 12 +++++++++--- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/rust/src/delta.rs b/rust/src/delta.rs index 8b1e36cef1..fbad15d1bf 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -427,7 +427,9 @@ impl DeltaTable { async fn get_last_checkpoint(&self) -> Result { let last_checkpoint_path = self.storage.join_path(&self.log_path, "_last_checkpoint"); + println!("Last checkpoint path {:?}", last_checkpoint_path); let data = self.storage.get_obj(&last_checkpoint_path).await?; + println!("Loaded last checkpoint."); Ok(serde_json::from_slice(&data)?) 
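        // For reference, `_delta_log/_last_checkpoint` is a single JSON object along the
        // lines of {"version":5,"size":10} (illustrative values), optionally carrying a
        // "parts" count when the checkpoint is split into multiple files; per PROTOCOL.md,
        // multi-part checkpoint data files follow the pattern
        // <version>.checkpoint.<part>.<parts>.parquet, e.g.
        // 00000000000000000010.checkpoint.0000000001.0000000003.parquet.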
} @@ -513,11 +515,10 @@ impl DeltaTable { async fn restore_checkpoint(&mut self, check_point: CheckPoint) -> Result<(), DeltaTableError> { let checkpoint_data_paths = self.get_checkpoint_data_paths(&check_point); - println!("{:?}", checkpoint_data_paths); // process actions from checkpoint self.state = DeltaTableState::default(); for f in &checkpoint_data_paths { - println!("{:?}", f); + println!("Checkpoint data path {:?}", f); let obj = self.storage.get_obj(&f).await?; let preader = SerializedFileReader::new(SliceableCursor::new(obj))?; let schema = preader.metadata().file_metadata().schema(); @@ -1491,7 +1492,7 @@ impl CheckPointWriter { parts: None, }; - let file_name = format!("{:020}.parquet", version); + let file_name = format!("{:020}.checkpoint.parquet", version); let checkpoint_path = self.storage.join_path(&self.delta_log_path, &file_name); info!("Writing checkpoint to {:?}.", checkpoint_path); diff --git a/rust/tests/write_checkpoints.rs b/rust/tests/write_checkpoints.rs index eecc343919..d425a7aa62 100644 --- a/rust/tests/write_checkpoints.rs +++ b/rust/tests/write_checkpoints.rs @@ -28,17 +28,23 @@ async fn write_simple_checkpoint() { .unwrap(); // checkpoint should exist - let checkpoint_path = log_path.join("00000000000000000005.parquet"); + let checkpoint_path = log_path.join("00000000000000000005.checkpoint.parquet"); assert!(checkpoint_path.as_path().exists()); + // HACK: seems like fs backend is eventually consistent :/ + std::thread::sleep(std::time::Duration::from_secs(1)); + // _last_checkpoint should exist let last_checkpoint_path = log_path.join("_last_checkpoint"); assert!(last_checkpoint_path.as_path().exists()); // _last_checkpoint should point to checkpoint let last_checkpoint_content = fs::read_to_string(last_checkpoint_path.as_path()).unwrap(); + println!("{:?}", last_checkpoint_content); + let last_checkpoint_content = last_checkpoint_content.trim(); + println!("{:?}", last_checkpoint_content); let last_checkpoint_content: serde_json::Value = - serde_json::from_str(last_checkpoint_content.as_str()).unwrap(); + serde_json::from_str(last_checkpoint_content).unwrap(); println!("{:?}", last_checkpoint_content); @@ -60,7 +66,7 @@ fn cleanup_checkpoint_files(log_path: &Path) { println!("Checking path {:?}", path); if path.file_name().unwrap() == "_last_checkpoint" - || path.extension().unwrap() == "parquet" + || (path.extension().is_some() && path.extension().unwrap() == "parquet") { println!("Deleting {:?}", path); fs::remove_file(path).unwrap(); From a5c05e904c22b2eb1c9bab7b2728a244c177b20b Mon Sep 17 00:00:00 2001 From: xianwill Date: Sat, 5 Jun 2021 16:12:33 -0400 Subject: [PATCH 04/20] Add DeltaLogSchemaFactory --- rust/src/delta.rs | 132 +++---- rust/src/delta_arrow.rs | 5 + rust/src/lib.rs | 2 +- rust/src/schema.rs | 359 +++++++++--------- ...eckpoints.rs => checkpoint_writer_test.rs} | 26 +- 5 files changed, 259 insertions(+), 265 deletions(-) rename rust/tests/{write_checkpoints.rs => checkpoint_writer_test.rs} (75%) diff --git a/rust/src/delta.rs b/rust/src/delta.rs index fbad15d1bf..ef9cb591a5 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -1,12 +1,8 @@ //! 
Delta Table read and write implementation // Reference: https://github.com/delta-io/delta/blob/master/PROTOCOL.md - -use std::cmp::Ordering; -use std::collections::HashMap; -use std::fmt; -use std::io::{BufRead, BufReader, Cursor}; - +// +use arrow::datatypes::Schema as ArrowSchema; use arrow::error::ArrowError; use arrow::json::reader::ReaderBuilder; use chrono::{DateTime, FixedOffset, Utc}; @@ -23,9 +19,14 @@ use parquet::file::{ use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::Value; +use std::cmp::Ordering; +use std::collections::HashMap; use std::convert::TryFrom; +use std::fmt; +use std::io::{BufRead, BufReader, Cursor}; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use uuid::Uuid; use super::action; use super::action::{Action, DeltaOperation}; @@ -33,7 +34,6 @@ use super::partitions::{DeltaTablePartition, PartitionFilter}; use super::schema::*; use super::storage; use super::storage::{StorageBackend, StorageError, UriError}; -use uuid::Uuid; /// Metadata for a checkpoint file #[derive(Serialize, Deserialize, Debug, Default, Clone, Copy)] @@ -427,9 +427,7 @@ impl DeltaTable { async fn get_last_checkpoint(&self) -> Result { let last_checkpoint_path = self.storage.join_path(&self.log_path, "_last_checkpoint"); - println!("Last checkpoint path {:?}", last_checkpoint_path); let data = self.storage.get_obj(&last_checkpoint_path).await?; - println!("Loaded last checkpoint."); Ok(serde_json::from_slice(&data)?) } @@ -518,7 +516,6 @@ impl DeltaTable { // process actions from checkpoint self.state = DeltaTableState::default(); for f in &checkpoint_data_paths { - println!("Checkpoint data path {:?}", f); let obj = self.storage.get_obj(&f).await?; let preader = SerializedFileReader::new(SliceableCursor::new(obj))?; let schema = preader.metadata().file_metadata().schema(); @@ -583,9 +580,7 @@ impl DeltaTable { match self.get_last_checkpoint().await { Ok(last_check_point) => { self.last_check_point = Some(last_check_point); - println!("Restoring checkpoint {:?}", last_check_point); self.restore_checkpoint(last_check_point).await?; - println!("Checkpoint restored {}", last_check_point.version); self.version = last_check_point.version + 1; } Err(LoadCheckpointError::NotFound) => { @@ -1406,7 +1401,6 @@ pub enum CheckPointWriterError { /// Error returned when the DeltaTableState does not contain a metadata action. #[error("DeltaTableMetadata not present in DeltaTableState")] MissingMetaData, - /// Error returned when creating the checkpoint schema. #[error("DeltaLogSchemaError: {source}")] DeltaLogSchema { @@ -1414,7 +1408,6 @@ pub enum CheckPointWriterError { #[from] source: DeltaLogSchemaError, }, - /// Passthrough error returned when calling DeltaTable. #[error("DeltaTableError: {source}")] DeltaTable { @@ -1422,14 +1415,14 @@ pub enum CheckPointWriterError { #[from] source: DeltaTableError, }, - /// Error returned when reading the checkpoint failed. - #[error("Failed to read checkpoint: {}", .source)] + /// Error returned when the parquet writer fails while writing the checkpoint. + #[error("Failed to write parquet: {}", .source)] ParquetError { - /// Parquet error details returned when reading the checkpoint failed. + /// Parquet error details returned when writing the checkpoint failed. #[from] source: ParquetError, }, - /// Error returned when converting the schema in Arrow format failed. + /// Error returned when converting the schema to Arrow format failed. 
#[error("Failed to convert into Arrow schema: {}", .source)] ArrowError { /// Arrow error details returned when converting the schema in Arrow format failed @@ -1457,6 +1450,7 @@ pub struct CheckPointWriter { delta_log_path: String, last_checkpoint_path: String, storage: Box, + schema_factory: DeltaLogSchemaFactory, } impl CheckPointWriter { @@ -1464,11 +1458,13 @@ impl CheckPointWriter { pub fn new(table_path: &str, storage: Box) -> Self { let delta_log_path = storage.join_path(table_path, "_delta_log"); let last_checkpoint_path = storage.join_path(delta_log_path.as_str(), "_last_checkpoint"); + let schema_factory = DeltaLogSchemaFactory::new(); Self { delta_log_path, last_checkpoint_path, storage, + schema_factory, } } @@ -1482,7 +1478,7 @@ impl CheckPointWriter { // an appropriate split point yet though so only writing a single part currently. info!("Writing parquet bytes to checkpoint buffer."); - let parquet_bytes = parquet_bytes_from_state(state)?; + let parquet_bytes = self.parquet_bytes_from_state(state)?; let size = parquet_bytes.len() as i64; @@ -1516,68 +1512,76 @@ impl CheckPointWriter { Ok(()) } -} -fn parquet_bytes_from_state(state: &DeltaTableState) -> Result, CheckPointWriterError> { - let mut json_buffer: Vec = Vec::new(); + fn parquet_bytes_from_state( + &self, + state: &DeltaTableState, + ) -> Result, CheckPointWriterError> { + let current_metadata = state + .current_metadata() + .ok_or_else(|| CheckPointWriterError::MissingMetaData)?; - let protocol = action::Action::protocol(action::Protocol { - min_reader_version: state.min_reader_version(), - min_writer_version: state.min_writer_version(), - }); + let mut json_buffer: Vec = Vec::new(); - extend_json_byte_buffer(&mut json_buffer, &protocol)?; + let protocol = action::Action::protocol(action::Protocol { + min_reader_version: state.min_reader_version(), + min_writer_version: state.min_writer_version(), + }); + extend_json_byte_buffer(&mut json_buffer, &protocol)?; - let metadata = state - .current_metadata() - .ok_or_else(|| CheckPointWriterError::MissingMetaData)?; - let metadata = action::Action::metaData(action::MetaData::try_from(metadata)?); - extend_json_byte_buffer(&mut json_buffer, &metadata)?; + let metadata = action::Action::metaData(action::MetaData::try_from(current_metadata)?); + extend_json_byte_buffer(&mut json_buffer, &metadata)?; - for add in state.files() { - let add = action::Action::add(add.clone()); - extend_json_byte_buffer(&mut json_buffer, &add)?; - } + for add in state.files() { + let add = action::Action::add(add.clone()); + extend_json_byte_buffer(&mut json_buffer, &add)?; + } - for remove in state.tombstones() { - let remove = action::Action::remove(remove.clone()); - extend_json_byte_buffer(&mut json_buffer, &remove)?; - } + for remove in state.tombstones() { + let remove = action::Action::remove(remove.clone()); + extend_json_byte_buffer(&mut json_buffer, &remove)?; + } - for (app_id, version) in state.app_transaction_version().iter() { - let txn = action::Action::txn(action::Txn { - app_id: app_id.clone(), - version: *version, - last_updated: None, - }); - extend_json_byte_buffer(&mut json_buffer, &txn)?; - } + for (app_id, version) in state.app_transaction_version().iter() { + let txn = action::Action::txn(action::Txn { + app_id: app_id.clone(), + version: *version, + last_updated: None, + }); + extend_json_byte_buffer(&mut json_buffer, &txn)?; + } - let arrow_schema = delta_log_arrow_schema()?; - let arrow_schema = Arc::new(arrow_schema); + let checkpoint_schema = 
self.schema_factory.delta_log_schema_for_table( + ¤t_metadata.schema, + current_metadata.partition_columns.as_slice(), + )?; + let arrow_checkpoint_schema: ArrowSchema = + >::try_from(&checkpoint_schema)?; + let arrow_schema = Arc::new(arrow_checkpoint_schema); - let cursor = Cursor::new(json_buffer); + let cursor = Cursor::new(json_buffer); - let mut json_reader = ReaderBuilder::new() - .with_schema(arrow_schema.clone()) - .build(cursor)?; + let mut json_reader = ReaderBuilder::new() + .with_schema(arrow_schema.clone()) + .build(cursor)?; - debug!("Preparing checkpoint parquet buffer."); + debug!("Preparing checkpoint parquet buffer."); - let writeable_cursor = InMemoryWriteableCursor::default(); - let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; + let writeable_cursor = InMemoryWriteableCursor::default(); + let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; - debug!("Writing to checkpoint parquet buffer..."); + debug!("Writing to checkpoint parquet buffer..."); - while let Some(batch) = json_reader.next()? { - writer.write(&batch)?; - } + while let Some(batch) = json_reader.next()? { + writer.write(&batch)?; + } - let _ = writer.close()?; + let _ = writer.close()?; - debug!("Finsihed writing checkpoint file."); + info!("Finished writing checkpoint file."); - Ok(writeable_cursor.data()) + Ok(writeable_cursor.data()) + } } fn extend_json_byte_buffer( diff --git a/rust/src/delta_arrow.rs b/rust/src/delta_arrow.rs index b654e2aa57..d5cb045aae 100644 --- a/rust/src/delta_arrow.rs +++ b/rust/src/delta_arrow.rs @@ -116,6 +116,11 @@ impl TryFrom<&schema::SchemaDataType> for ArrowDataType { a )?))) } + // NOTE: this doesn't currently support maps with string keys + // See below arrow-rs issues for adding arrow::datatypes::DataType::Map to support a + // more general map type: + // https://github.com/apache/arrow-rs/issues/395 + // https://github.com/apache/arrow-rs/issues/396 schema::SchemaDataType::map(m) => Ok(ArrowDataType::Dictionary( Box::new( >::try_from( diff --git a/rust/src/lib.rs b/rust/src/lib.rs index b4d5fb7507..226306fc7d 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -32,7 +32,7 @@ //! "2020-05-02T23:47:31-07:00", //! ).await.unwrap(); //! let files = table.get_files(); -//! } ; +//! }; //! ``` //! //! # Optional cargo package features diff --git a/rust/src/schema.rs b/rust/src/schema.rs index db5da3b4dc..94e490ee82 100644 --- a/rust/src/schema.rs +++ b/rust/src/schema.rs @@ -1,12 +1,11 @@ #![allow(non_snake_case, non_camel_case_types)] -use arrow::datatypes::Schema as ArrowSchema; use arrow::error::ArrowError; +use lazy_static::lazy_static; use parquet::errors::ParquetError; use serde::{Deserialize, Serialize}; -use serde_json::{json, Value}; +use serde_json::json; use std::collections::HashMap; -use std::convert::TryFrom; /// Type alias for a string expected to match a GUID/UUID format pub type Guid = String; @@ -165,7 +164,7 @@ impl Schema { } } -/// Error representing a failure while training to create the delta log schema. +/// Error representing a failure while creating the delta log schema. #[derive(thiserror::Error, Debug)] pub enum DeltaLogSchemaError { /// Error returned when reading the checkpoint failed. @@ -182,7 +181,7 @@ pub enum DeltaLogSchemaError { #[from] source: ArrowError, }, - /// Passthrough error returned by serde_json. + /// Error returned when JSON de-serialization of schema components fails. 
#[error("serde_json::Error: {source}")] JSONSerialization { /// The source serde_json::Error. @@ -191,66 +190,28 @@ pub enum DeltaLogSchemaError { }, } -pub(crate) fn delta_log_arrow_schema() -> Result { - let delta_schema = delta_log_schema()?; - let arrow_schema: ArrowSchema = >::try_from(&delta_schema)?; - - Ok(arrow_schema) -} - -pub(crate) fn delta_log_schema() -> Result { - let field_map = delta_log_json_fields(); - - // TODO: receive a table schema parameter and merge into add.stats_parsed in the delta log schema - // TODO: also merge partition column schema fields under add.partitionValues_parsed - // Skipping this for now until I can get the maps to work. - - let json_fields: Vec = field_map.values().map(|v| v.to_owned()).collect(); - let mut json_schema = serde_json::Map::new(); - json_schema.insert("type".to_string(), Value::String("struct".to_string())); - json_schema.insert("fields".to_string(), Value::Array(json_fields)); - let json_schema = Value::Object(json_schema); - - let delta_schema: Schema = serde_json::from_value(json_schema)?; - - Ok(delta_schema) +/// Factory for creating a Delta log schema for a specific table schema. +/// REF: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-schema +pub struct DeltaLogSchemaFactory { + common_fields: HashMap>, } -pub(crate) fn delta_log_json_fields() -> HashMap { - // TODO: Missing feature in arrow - string keys are not supported by Arrow Dictionary. - // Example: https://github.com/apache/arrow-rs/blob/master/arrow/src/json/reader.rs#L858-L898 - // There are many other code refs in arrow besides this one that limit dict keys to numeric - // keys. - let meta_data = json!({ - "name": "metaData", - "type": { - "type": "struct", - "fields": [{ - "name": "id", - "type": "string", - "nullable": true, - "metadata": {}, - },{ - "name": "name", - "type": "string", - "nullable": true, - "metadata": {}, - },{ - "name": "description", - "type": "string", - "nullable": true, - "metadata": {}, - },{ - "name": "schemaString", - "type": "string", - "nullable": true, - "metadata": {}, - },{ - "name": "createdTime", - "type": "long", - "nullable": true, - "metadata": {}, - },{ +impl DeltaLogSchemaFactory { + /// Creates a new DeltaLogSchemaFactory which can be used to create Schema's representing the + /// Delta log for specific tables. + pub fn new() -> Self { + // TODO: map is not supported by arrow currently. 
+ // See: + // * https://github.com/apache/arrow-rs/issues/395 + // * https://github.com/apache/arrow-rs/issues/396 + + let meta_data_fields = json!([ + { "name": "id", "type": "string", "nullable": true, "metadata": {} }, + { "name": "name", "type": "string", "nullable": true, "metadata": {} }, + { "name": "description", "type": "string", "nullable": true, "metadata": {} }, + { "name": "schemaString", "type": "string", "nullable": true, "metadata": {} }, + { "name": "createdTime", "type": "long", "nullable": true, "metadata": {} }, + { "name": "partitionColumns", "type": { "type": "array", @@ -258,8 +219,8 @@ pub(crate) fn delta_log_json_fields() -> HashMap { "containsNull": true, }, "nullable": true, - "metadata": {}, - },{ + "metadata": {} }, + { "name": "format", "type": { "type": "struct", @@ -282,7 +243,8 @@ pub(crate) fn delta_log_json_fields() -> HashMap { }, "nullable": true, "metadata": {} - },/*{ + }, + /*{ "name": "configuration", "type": { "type": "map", @@ -292,82 +254,25 @@ pub(crate) fn delta_log_json_fields() -> HashMap { }, "nullable": true, "metadata": {} - }*/] - }, - "nullable": true, - "metadata": {} - }); - - let protocol = json!({ - "name": "protocol", - "type": { - "type": "struct", - "fields": [{ - "name": "minReaderVersion", - "type": "integer", - "nullable": true, - "metadata": {}, - },{ - "name": "minWriterVersion", - "type": "integer", - "nullable": true, - "metadata": {}, - }] - }, - "nullable": true, - "metadata": {} - }); - - let txn = json!({ - "name": "txn", - "type": { - "type": "struct", - "fields": [{ - "name": "appId", - "type": "string", - "nullable": true, - "metadata": {}, - },{ - "name": "version", - "type": "long", - "nullable": true, - "metadata": {}, - }] - }, - "nullable": true, - "metadata": {} - }); - - let add = json!({ - "name": "add", - "type": { - "type": "struct", - "fields": [{ - "name": "path", - "type": "string", - "nullable": true, - "metadata": {}, - },{ - "name": "size", - "type": "long", - "nullable": true, - "metadata": {}, - },{ - "name": "modificationTime", - "type": "long", - "nullable": true, - "metadata": {}, - },{ - "name": "dataChange", - "type": "boolean", - "nullable": true, - "metadata": {}, - },{ - "name": "stats", - "type": "string", - "nullable": true, - "metadata": {}, - },/*{ + }*/]); + + let protocol_fields = json!([ + { "name": "minReaderVersion", "type": "integer", "nullable": true, "metadata": {} }, + { "name": "minWriterVersion", "type": "integer", "nullable": true, "metadata": {} } + ]); + + let txn_fields = json!([ + { "name": "appId", "type": "string", "nullable": true, "metadata": {} }, + { "name": "version", "type": "long", "nullable": true, "metadata": {} } + ]); + + let add_fields = json!([ + { "name": "path", "type": "string", "nullable": true, "metadata": {} }, + { "name": "size", "type": "long", "nullable": true, "metadata": {} }, + { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, + { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {} }, + { "name": "stats", "type": "string", "nullable": true, "metadata": {} }, + /*{ "name": "partitionValues", "type": { "type": "map", @@ -377,41 +282,15 @@ pub(crate) fn delta_log_json_fields() -> HashMap { }, "nullable": true, "metadata": {}, - }*/] - }, - "nullable": true, - "metadata": {} - }); - - let remove = json!({ - "name": "remove", - "type": { - "type": "struct", - "fields": [{ - "name": "path", - "type": "string", - "nullable": true, - "metadata": {}, - },{ - "name": "size", - "type": "long", - 
"nullable": true, - "metadata": {}, - },{ - "name": "modificationTime", - "type": "long", - "nullable": true, - "metadata": {}, - },{ - "name": "dataChange", - "type": "boolean", - "nullable": true, - "metadata": {}, - },{ - "name": "stats", - "type": "string", - "nullable": true, - "metadata": {}, + }*/ + ]); + + let remove_fields = json!([ + { "name": "path", "type": "string", "nullable": true, "metadata": {} }, + { "name": "size", "type": "long", "nullable": true, "metadata": {} }, + { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, + { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {}, }, + { "name": "stats", "type": "string", "nullable": true, "metadata": {}, },/*{ "name": "partitionValues", "type": { @@ -423,19 +302,121 @@ pub(crate) fn delta_log_json_fields() -> HashMap { "nullable": true, "metadata": {}, - }*/], - }, - "nullable": true, - "metadata": {} - }); + }*/]); + + let mut map = HashMap::new(); + + map.insert( + "metaData".to_string(), + serde_json::from_value(meta_data_fields).unwrap(), + ); + map.insert( + "protocol".to_string(), + serde_json::from_value(protocol_fields).unwrap(), + ); + map.insert( + "txn".to_string(), + serde_json::from_value(txn_fields).unwrap(), + ); + map.insert( + "add".to_string(), + serde_json::from_value(add_fields).unwrap(), + ); + map.insert( + "remove".to_string(), + serde_json::from_value(remove_fields).unwrap(), + ); + + Self { common_fields: map } + } + + /// Creates a Schema representing the delta log for a specific delta table. + /// Merges fields from the table schema into the delta log schema. + pub fn delta_log_schema_for_table( + &self, + table_schema: &Schema, + partition_columns: &[String], + ) -> Result { + let (partition_fields, non_partition_fields): (Vec, Vec) = + table_schema + .fields + .iter() + .map(|f| f.to_owned()) + .partition(|field| partition_columns.contains(&field.name)); + + let fields: Vec = self + .common_fields + .iter() + .map(|(name, fields)| match name.as_str() { + "add" => { + let mut fields = fields.clone(); + + if partition_fields.len() > 0 { + let partition_values_parsed = SchemaField { + name: "partitionValues_parsed".to_string(), + nullable: true, + metadata: HashMap::new(), + r#type: SchemaDataType::r#struct(SchemaTypeStruct { + r#type: "struct".to_string(), + fields: partition_fields.clone(), + }), + }; + fields.push(partition_values_parsed); + } + + if non_partition_fields.len() > 0 { + let stats_parsed = SchemaField { + name: "stats_parsed".to_string(), + nullable: true, + metadata: HashMap::new(), + r#type: SchemaDataType::r#struct(SchemaTypeStruct { + r#type: "struct".to_string(), + fields: non_partition_fields.clone(), + }), + }; + + fields.push(stats_parsed); + } + + + SchemaField { + name: name.clone(), + nullable: true, + metadata: HashMap::new(), + r#type: SchemaDataType::r#struct(SchemaTypeStruct { + r#type: "struct".to_string(), + fields, + }), + } + } + _ => SchemaField { + name: name.clone(), + nullable: true, + metadata: HashMap::new(), + r#type: SchemaDataType::r#struct(SchemaTypeStruct { + r#type: "struct".to_string(), + fields: fields.clone(), + }), + }, + }) + .collect(); - let mut map = HashMap::new(); + Ok(Schema { + r#type: "struct".to_string(), + fields, + }) + } +} - map.insert("metaData".to_string(), meta_data); - map.insert("protocol".to_string(), protocol); - map.insert("txn".to_string(), txn); - map.insert("add".to_string(), add); - map.insert("remove".to_string(), remove); +#[cfg(test)] +mod tests { + use super::*; - map 
+ #[test] + fn delta_log_schema_factory_creates_schema() { + let _factory = DeltaLogSchemaFactory::new(); + + // TODO: + } } + diff --git a/rust/tests/write_checkpoints.rs b/rust/tests/checkpoint_writer_test.rs similarity index 75% rename from rust/tests/write_checkpoints.rs rename to rust/tests/checkpoint_writer_test.rs index d425a7aa62..58b06e2907 100644 --- a/rust/tests/write_checkpoints.rs +++ b/rust/tests/checkpoint_writer_test.rs @@ -5,6 +5,9 @@ use deltalake::CheckPointWriter; use std::fs; use std::path::{Path, PathBuf}; +// NOTE: The below is a useful external command for inspecting the written checkpoint schema visually: +// parquet-tools inspect tests/data/checkpoints/_delta_log/00000000000000000005.checkpoint.parquet + #[tokio::test] async fn write_simple_checkpoint() { let table_location = "./tests/data/checkpoints"; @@ -31,28 +34,32 @@ async fn write_simple_checkpoint() { let checkpoint_path = log_path.join("00000000000000000005.checkpoint.parquet"); assert!(checkpoint_path.as_path().exists()); - // HACK: seems like fs backend is eventually consistent :/ + // HACK: seems like a race condition exists reading the file back in. + // Without the sleep, frequently fails with: + // Error("EOF while parsing a value", line: 1, column: 0)' std::thread::sleep(std::time::Duration::from_secs(1)); // _last_checkpoint should exist let last_checkpoint_path = log_path.join("_last_checkpoint"); assert!(last_checkpoint_path.as_path().exists()); - // _last_checkpoint should point to checkpoint + // _last_checkpoint should point to the correct version let last_checkpoint_content = fs::read_to_string(last_checkpoint_path.as_path()).unwrap(); - println!("{:?}", last_checkpoint_content); - let last_checkpoint_content = last_checkpoint_content.trim(); - println!("{:?}", last_checkpoint_content); let last_checkpoint_content: serde_json::Value = - serde_json::from_str(last_checkpoint_content).unwrap(); + serde_json::from_str(last_checkpoint_content.trim()).unwrap(); - println!("{:?}", last_checkpoint_content); + let version = last_checkpoint_content + .get("version") + .unwrap() + .as_i64() + .unwrap(); + assert_eq!(5, version); // delta table should load just fine with the checkpoint in place let table_result = deltalake::open_table(table_location).await.unwrap(); let table = table_result; let files = table.get_files(); - println!("{:?}", files); + assert_eq!(11, files.len()); } fn cleanup_checkpoint_files(log_path: &Path) { @@ -63,12 +70,9 @@ fn cleanup_checkpoint_files(log_path: &Path) { Ok(d) => { let path = d.path(); - println!("Checking path {:?}", path); - if path.file_name().unwrap() == "_last_checkpoint" || (path.extension().is_some() && path.extension().unwrap() == "parquet") { - println!("Deleting {:?}", path); fs::remove_file(path).unwrap(); } } From 794bab3b35553b981d5f701ba35d9a924027f311 Mon Sep 17 00:00:00 2001 From: xianwill Date: Sat, 5 Jun 2021 19:20:19 -0400 Subject: [PATCH 05/20] Adding tests --- rust/src/schema.rs | 261 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 248 insertions(+), 13 deletions(-) diff --git a/rust/src/schema.rs b/rust/src/schema.rs index 3b413e7c44..f1bd3d2211 100644 --- a/rust/src/schema.rs +++ b/rust/src/schema.rs @@ -19,7 +19,7 @@ pub type DeltaDataTypeInt = i32; /// Represents a struct field defined in the Delta table schema. 
// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format -#[derive(Serialize, Deserialize, Debug, Default, Clone)] +#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] pub struct SchemaTypeStruct { // type field is always the string "struct", so we are ignoring it here r#type: String, @@ -34,7 +34,7 @@ impl SchemaTypeStruct { } /// Describes a specific field of the Delta table schema. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] pub struct SchemaField { // Name of this (possibly nested) column name: String, @@ -69,7 +69,7 @@ impl SchemaField { } /// Schema definition for array type fields. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] pub struct SchemaTypeArray { // type field is always the string "array", so we are ignoring it here r#type: String, @@ -93,7 +93,7 @@ impl SchemaTypeArray { } /// Schema definition for map type fields. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] pub struct SchemaTypeMap { r#type: String, keyType: Box, @@ -135,7 +135,7 @@ impl SchemaTypeMap { * timestamp: Microsecond precision timestamp without a timezone */ /// Enum with variants for each top level schema data type. -#[derive(Serialize, Deserialize, Debug, Clone)] +#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] #[serde(untagged)] pub enum SchemaDataType { /// Variant representing non-array, non-map, non-struct fields. Wrapped value will contain the @@ -210,7 +210,7 @@ impl DeltaLogSchemaFactory { { "name": "description", "type": "string", "nullable": true, "metadata": {} }, { "name": "schemaString", "type": "string", "nullable": true, "metadata": {} }, { "name": "createdTime", "type": "long", "nullable": true, "metadata": {} }, - { + { "name": "partitionColumns", "type": { "type": "array", @@ -219,7 +219,7 @@ impl DeltaLogSchemaFactory { }, "nullable": true, "metadata": {} }, - { + { "name": "format", "type": { "type": "struct", @@ -364,20 +364,49 @@ impl DeltaLogSchemaFactory { } if non_partition_fields.len() > 0 { + let min_values = SchemaField { + name: "minValues".to_string(), + nullable: true, + metadata: HashMap::new(), + r#type: SchemaDataType::r#struct(SchemaTypeStruct { + r#type: "struct".to_string(), + fields: non_partition_fields.clone(), + }), + }; + + let max_values = SchemaField { + name: "maxValues".to_string(), + nullable: true, + metadata: HashMap::new(), + r#type: SchemaDataType::r#struct(SchemaTypeStruct { + r#type: "struct".to_string(), + fields: non_partition_fields.clone(), + }), + }; + + let null_counts = SchemaField { + name: "nullCounts".to_string(), + nullable: true, + metadata: HashMap::new(), + r#type: SchemaDataType::r#struct(SchemaTypeStruct { + r#type: "struct".to_string(), + fields: non_partition_fields.clone(), + }), + }; + let stats_parsed = SchemaField { name: "stats_parsed".to_string(), nullable: true, metadata: HashMap::new(), r#type: SchemaDataType::r#struct(SchemaTypeStruct { r#type: "struct".to_string(), - fields: non_partition_fields.clone(), + fields: vec![min_values, max_values, null_counts], }), }; fields.push(stats_parsed); } - SchemaField { name: name.clone(), nullable: true, @@ -413,9 +442,215 @@ mod tests { #[test] fn delta_log_schema_factory_creates_schema() { - let _factory = DeltaLogSchemaFactory::new(); - - // TODO: + let factory = DeltaLogSchemaFactory::new(); + + let table_schema = json!({ + "type": "struct", 
+ "fields": [ + { "name": "pcol", "type": "integer", "nullable": true, "metadata": {} }, + { "name": "col1", "type": "integer", "nullable": true, "metadata": {} }, + ] + }); + let table_schema = serde_json::from_value(table_schema).unwrap(); + + let partition_columns = vec!["pcol".to_string()]; + + let log_schema = factory + .delta_log_schema_for_table(&table_schema, partition_columns.as_slice()) + .unwrap(); + + assert_eq!("struct", log_schema.r#type); + assert_eq!(5, log_schema.get_fields().len()); + + for f in log_schema.get_fields().iter() { + match f.get_name() { + "txn" => { + if let SchemaDataType::r#struct(txn) = f.get_type() { + assert_eq!(2, txn.get_fields().len()); + for f in txn.get_fields().iter() { + match f.get_name() { + "appId" => { + assert_eq!( + SchemaDataType::primitive("string".to_string()), + f.get_type().to_owned() + ); + } + "version" => { + assert_eq!( + SchemaDataType::primitive("long".to_string()), + f.get_type().to_owned() + ); + } + _ => panic!("Unhandled schema field name"), + } + } + } else { + panic!("txn must be a struct"); + } + } + "protocol" => { + if let SchemaDataType::r#struct(protocol) = f.get_type() { + assert_eq!(2, protocol.get_fields().len()); + for f in protocol.get_fields().iter() { + match f.get_name() { + "minReaderVersion" | "minWriterVersion" => { + assert_eq!( + SchemaDataType::primitive("integer".to_string()), + f.get_type().to_owned() + ); + } + _ => panic!("Unhandled schema field name"), + } + } + } else { + panic!("protocol must be a struct"); + } + } + "metaData" => { + if let SchemaDataType::r#struct(metadata) = f.get_type() { + assert_eq!(7, metadata.get_fields().len()); + for f in metadata.get_fields().iter() { + match f.get_name() { + "id" | "name" | "description" | "schemaString" => { + assert_eq!( + SchemaDataType::primitive("string".to_string()), + f.get_type().to_owned() + ); + } + "createdTime" => { + assert_eq!( + SchemaDataType::primitive("long".to_string()), + f.get_type().to_owned() + ); + } + "partitionColumns" => match f.get_type() { + SchemaDataType::array(partition_columns) => { + assert_eq!("array", partition_columns.r#type); + assert_eq!( + Box::new(SchemaDataType::primitive( + "string".to_string() + )), + partition_columns.elementType + ); + } + _ => panic!("partitionColumns should be an array"), + }, + "format" => { + // TODO + } + _ => panic!("Unhandled schema field name"), + } + } + } else { + panic!("metaData must be a struct"); + } + } + "add" => { + if let SchemaDataType::r#struct(add) = f.get_type() { + assert_eq!(7, add.get_fields().len()); + for f in add.get_fields().iter() { + match f.get_name() { + "path" | "stats" => { + assert_eq!( + SchemaDataType::primitive("string".to_string()), + f.r#type + ); + } + "size" | "modificationTime" => { + assert_eq!( + SchemaDataType::primitive("long".to_string()), + f.r#type + ); + } + "dataChange" => { + assert_eq!( + SchemaDataType::primitive("boolean".to_string()), + f.r#type + ); + } + "stats_parsed" => match f.get_type() { + SchemaDataType::r#struct(stats_parsed) => { + let expected_fields: Vec<&SchemaField> = table_schema + .get_fields() + .iter() + .filter(|f| !partition_columns.contains(&f.name)) + .collect(); + for stat_field in stats_parsed.get_fields() { + match stat_field.get_name() { + "minValues" | "maxValues" | "nullCounts" => { + if let SchemaDataType::r#struct(f) = + stat_field.get_type() + { + for (i, e) in + f.get_fields().iter().enumerate() + { + assert_eq!(e, expected_fields[i]); + } + } else { + panic!("Unexpected type for stat field"); + } + } + 
_ => panic!("Unhandled schema field name"), + } + } + } + _ => panic!("'stats_parsed' must be a struct"), + }, + "partitionValues_parsed" => match f.get_type() { + SchemaDataType::r#struct(partition_values_parsed) => { + let expected_fields: Vec<&SchemaField> = table_schema + .get_fields() + .iter() + .filter(|f| partition_columns.contains(&f.name)) + .collect(); + + for (i, e) in + partition_values_parsed.get_fields().iter().enumerate() + { + assert_eq!(e, expected_fields[i], "'partitionValues_parsed' should contain SchemaFields for all partition columns"); + } + } + _ => panic!("'partition_values_parsed' must be a struct"), + }, + _ => panic!("Unhandled schema field name"), + } + } + } else { + panic!("'add' must be a struct"); + } + } + "remove" => { + if let SchemaDataType::r#struct(remove) = f.get_type() { + assert_eq!(5, remove.get_fields().len()); + for f in remove.get_fields().iter() { + match f.get_name() { + "path" | "stats" => { + assert_eq!( + SchemaDataType::primitive("string".to_string()), + f.get_type().to_owned() + ); + } + "size" | "modificationTime" => { + assert_eq!( + SchemaDataType::primitive("long".to_string()), + f.get_type().to_owned() + ); + } + "dataChange" => { + assert_eq!( + SchemaDataType::primitive("boolean".to_string()), + f.get_type().to_owned() + ); + } + _ => panic!("Unhandled schema field name"), + } + } + } else { + panic!("'remove' must be a struct"); + } + } + _ => panic!("Unhandled schema field name"), + } + } } } - From 34d8c07c7346bfcb63dfda7ce06acab9ce0326d3 Mon Sep 17 00:00:00 2001 From: xianwill Date: Sat, 5 Jun 2021 19:52:20 -0400 Subject: [PATCH 06/20] Fix clippy errors --- rust/src/delta.rs | 2 +- rust/src/schema.rs | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/rust/src/delta.rs b/rust/src/delta.rs index e84d4d41ea..073d360917 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -1579,7 +1579,7 @@ impl CheckPointWriter { ) -> Result, CheckPointWriterError> { let current_metadata = state .current_metadata() - .ok_or_else(|| CheckPointWriterError::MissingMetaData)?; + .ok_or(CheckPointWriterError::MissingMetaData)?; let mut json_buffer: Vec = Vec::new(); diff --git a/rust/src/schema.rs b/rust/src/schema.rs index f1bd3d2211..225719d55b 100644 --- a/rust/src/schema.rs +++ b/rust/src/schema.rs @@ -350,7 +350,7 @@ impl DeltaLogSchemaFactory { "add" => { let mut fields = fields.clone(); - if partition_fields.len() > 0 { + if !partition_fields.is_empty() { let partition_values_parsed = SchemaField { name: "partitionValues_parsed".to_string(), nullable: true, @@ -363,7 +363,7 @@ impl DeltaLogSchemaFactory { fields.push(partition_values_parsed); } - if non_partition_fields.len() > 0 { + if !non_partition_fields.is_empty() { let min_values = SchemaField { name: "minValues".to_string(), nullable: true, @@ -436,6 +436,12 @@ impl DeltaLogSchemaFactory { } } +impl Default for DeltaLogSchemaFactory { + fn default() -> Self { + Self::new() + } +} + #[cfg(test)] mod tests { use super::*; From 8fff84cea760fa74132429a474f25a71e2c66e69 Mon Sep 17 00:00:00 2001 From: xianwill Date: Sat, 5 Jun 2021 19:55:07 -0400 Subject: [PATCH 07/20] Remove unnecessary error sources --- rust/src/schema.rs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/rust/src/schema.rs b/rust/src/schema.rs index 225719d55b..b45c73f7c6 100644 --- a/rust/src/schema.rs +++ b/rust/src/schema.rs @@ -1,7 +1,5 @@ #![allow(non_snake_case, non_camel_case_types)] -use arrow::error::ArrowError; -use parquet::errors::ParquetError; use 
serde::{Deserialize, Serialize}; use serde_json::json; use std::collections::HashMap; @@ -166,20 +164,6 @@ impl Schema { /// Error representing a failure while creating the delta log schema. #[derive(thiserror::Error, Debug)] pub enum DeltaLogSchemaError { - /// Error returned when reading the checkpoint failed. - #[error("Failed to read checkpoint: {}", .source)] - ParquetError { - /// Parquet error details returned when reading the checkpoint failed. - #[from] - source: ParquetError, - }, - /// Error returned when converting the schema in Arrow format failed. - #[error("Failed to convert into Arrow schema: {}", .source)] - ArrowError { - /// Arrow error details returned when converting the schema in Arrow format failed - #[from] - source: ArrowError, - }, /// Error returned when JSON de-serialization of schema components fails. #[error("serde_json::Error: {source}")] JSONSerialization { From ec9c5ffa5a95324e5ea552ea8415b6634ac6dfb6 Mon Sep 17 00:00:00 2001 From: xianwill Date: Sat, 5 Jun 2021 20:02:15 -0400 Subject: [PATCH 08/20] Add checkpoint_for_version method --- rust/src/delta.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/rust/src/delta.rs b/rust/src/delta.rs index 073d360917..4eb4b7cd61 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -1507,6 +1507,7 @@ pub enum CheckPointWriterError { /// Struct for writing checkpoints to the delta log. pub struct CheckPointWriter { + table_uri: String, delta_log_path: String, last_checkpoint_path: String, storage: Box, @@ -1515,12 +1516,13 @@ pub struct CheckPointWriter { impl CheckPointWriter { /// Creates a new CheckPointWriter. - pub fn new(table_path: &str, storage: Box) -> Self { - let delta_log_path = storage.join_path(table_path, "_delta_log"); + pub fn new(table_uri: &str, storage: Box) -> Self { + let delta_log_path = storage.join_path(table_uri, "_delta_log"); let last_checkpoint_path = storage.join_path(delta_log_path.as_str(), "_last_checkpoint"); let schema_factory = DeltaLogSchemaFactory::new(); Self { + table_uri: table_uri.to_string(), delta_log_path, last_checkpoint_path, storage, @@ -1528,6 +1530,19 @@ impl CheckPointWriter { } } + /// Creates a new checkpoint at the specified version. + /// NOTE: This method loads a new instance of delta table to determine the state to + /// checkpoint. + pub async fn create_checkpoint_for_version( + &self, + version: DeltaDataTypeVersion, + ) -> Result<(), CheckPointWriterError> { + let table = open_table_with_version(self.table_uri.as_str(), version).await?; + + self.create_checkpoint_from_state(version, table.get_state()) + .await + } + /// Creates a new checkpoint at the specified version from the given DeltaTableState. 
pub async fn create_checkpoint_from_state( &self, From 503cd61c2fa8eb18f1569dbed3680c132662e949 Mon Sep 17 00:00:00 2001 From: xianwill Date: Tue, 8 Jun 2021 08:17:08 -0400 Subject: [PATCH 09/20] Fix debug statement --- rust/src/delta.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/src/delta.rs b/rust/src/delta.rs index 4eb4b7cd61..39abf5078d 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -1653,7 +1653,7 @@ impl CheckPointWriter { let _ = writer.close()?; - info!("Finished writing checkpoint file."); + debug!("Finished writing checkpoint parquet buffer."); Ok(writeable_cursor.data()) } From ea1c442af11dd6da2d441cb2cfd6d1fcfef46480 Mon Sep 17 00:00:00 2001 From: Qingping Hou Date: Tue, 8 Jun 2021 14:02:14 -0700 Subject: [PATCH 10/20] avoid unnecessary clones --- rust/src/action.rs | 2 +- rust/src/delta.rs | 49 ++++++++++++++++++++++++---------------------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/rust/src/action.rs b/rust/src/action.rs index 3b50b8b76c..de5b9a10bb 100644 --- a/rust/src/action.rs +++ b/rust/src/action.rs @@ -364,7 +364,7 @@ pub struct Format { /// Action that describes the metadata of the table. /// This is a top-level action in Delta log entries. -#[derive(Serialize, Deserialize, Debug, Default)] +#[derive(Serialize, Deserialize, Debug, Default, Clone)] #[serde(rename_all = "camelCase")] pub struct MetaData { /// Unique identifier for this table diff --git a/rust/src/delta.rs b/rust/src/delta.rs index 39abf5078d..23a9a6687a 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -204,36 +204,38 @@ impl fmt::Display for DeltaTableMetaData { } } -impl TryFrom<&action::MetaData> for DeltaTableMetaData { +impl TryFrom for DeltaTableMetaData { type Error = serde_json::error::Error; - fn try_from(action_metadata: &action::MetaData) -> Result { + fn try_from(action_metadata: action::MetaData) -> Result { + let schema = action_metadata.get_schema()?; Ok(Self { - id: action_metadata.id.clone(), - name: action_metadata.name.clone(), - description: action_metadata.description.clone(), - format: action_metadata.format.clone(), - schema: action_metadata.get_schema()?, - partition_columns: action_metadata.partition_columns.clone(), + id: action_metadata.id, + name: action_metadata.name, + description: action_metadata.description, + format: action_metadata.format, + schema, + partition_columns: action_metadata.partition_columns, created_time: action_metadata.created_time, - configuration: action_metadata.configuration.clone(), + configuration: action_metadata.configuration, }) } } -impl TryFrom<&DeltaTableMetaData> for action::MetaData { +impl TryFrom for action::MetaData { type Error = serde_json::error::Error; - fn try_from(metadata: &DeltaTableMetaData) -> Result { + fn try_from(metadata: DeltaTableMetaData) -> Result { + let schema_string = serde_json::to_string(&metadata.schema)?; Ok(Self { - id: metadata.id.clone(), - name: metadata.name.clone(), - description: metadata.description.clone(), - format: metadata.format.clone(), - schema_string: serde_json::to_string(&metadata.schema)?, - partition_columns: metadata.partition_columns.clone(), + id: metadata.id, + name: metadata.name, + description: metadata.description, + format: metadata.format, + schema_string, + partition_columns: metadata.partition_columns, created_time: metadata.created_time, - configuration: metadata.configuration.clone(), + configuration: metadata.configuration, }) } } @@ -518,7 +520,7 @@ impl DeltaTable { ) -> Result<(), ApplyLogError> { for line in 
reader.lines() { let action: Action = serde_json::from_str(line?.as_str())?; - process_action(&mut self.state, &action)?; + process_action(&mut self.state, action)?; } Ok(()) @@ -548,7 +550,7 @@ impl DeltaTable { for record in preader.get_row_iter(None)? { process_action( &mut self.state, - &Action::from_parquet_record(&schema, &record)?, + Action::from_parquet_record(&schema, &record)?, )?; } } @@ -1604,7 +1606,8 @@ impl CheckPointWriter { }); extend_json_byte_buffer(&mut json_buffer, &protocol)?; - let metadata = action::Action::metaData(action::MetaData::try_from(current_metadata)?); + let metadata = + action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?); extend_json_byte_buffer(&mut json_buffer, &metadata)?; for add in state.files() { @@ -1674,7 +1677,7 @@ where fn process_action( state: &mut DeltaTableState, - action: &Action, + action: Action, ) -> Result<(), serde_json::error::Error> { match action { Action::add(v) => { @@ -1773,7 +1776,7 @@ mod tests { last_updated: Some(0), }); - let _ = process_action(&mut state, &txn_action).unwrap(); + let _ = process_action(&mut state, txn_action).unwrap(); assert_eq!(2, *state.app_transaction_version.get("abc").unwrap()); assert_eq!(1, *state.app_transaction_version.get("xyz").unwrap()); From 0f7dba53e7fb283d186216b21e4aaa018ceeeb1f Mon Sep 17 00:00:00 2001 From: xianwill Date: Wed, 9 Jun 2021 09:56:26 -0400 Subject: [PATCH 11/20] Remove redundant clones --- rust/src/delta.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rust/src/delta.rs b/rust/src/delta.rs index 23a9a6687a..d83381111d 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -1681,11 +1681,11 @@ fn process_action( ) -> Result<(), serde_json::error::Error> { match action { Action::add(v) => { - state.files.push(v.clone()); + state.files.push(v); } Action::remove(v) => { state.files.retain(|a| *a.path != v.path); - state.tombstones.push(v.clone()); + state.tombstones.push(v); } Action::protocol(v) => { state.min_reader_version = v.min_reader_version; @@ -1697,11 +1697,11 @@ fn process_action( Action::txn(v) => { *state .app_transaction_version - .entry(v.app_id.clone()) + .entry(v.app_id) .or_insert(v.version) = v.version; } Action::commitInfo(v) => { - state.commit_infos.push(v.clone()); + state.commit_infos.push(v); } } From 3bfb2ca7e4a0204f09ed545e803dd4045ad4d8a5 Mon Sep 17 00:00:00 2001 From: xianwill Date: Wed, 9 Jun 2021 11:43:42 -0400 Subject: [PATCH 12/20] Move CheckPointWriter to checkpoints mod --- rust/src/checkpoints.rs | 233 ++++++++++++++++++++++++ rust/src/delta.rs | 259 ++------------------------- rust/src/lib.rs | 1 + rust/tests/checkpoint_writer_test.rs | 2 +- 4 files changed, 250 insertions(+), 245 deletions(-) create mode 100644 rust/src/checkpoints.rs diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs new file mode 100644 index 0000000000..42c58e49b0 --- /dev/null +++ b/rust/src/checkpoints.rs @@ -0,0 +1,233 @@ +//! Implementation for writing delta checkpoints. 
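With the writer now in its own module, end-to-end usage looks roughly like the sketch below. The backend construction via storage::get_backend_for_uri and the error handling are assumptions rather than part of this patch; the integration test exercises the same table at ./tests/data/checkpoints.

    use deltalake::checkpoints::CheckPointWriter;
    use deltalake::storage;

    async fn checkpoint_at_version_5() -> Result<(), Box<dyn std::error::Error>> {
        let table_uri = "./tests/data/checkpoints";
        let backend = storage::get_backend_for_uri(table_uri)?;
        let writer = CheckPointWriter::new(table_uri, backend);

        // Loads the table at version 5, then writes
        // _delta_log/00000000000000000005.checkpoint.parquet and the _last_checkpoint
        // pointer, whose body is a one-line JSON object along the lines of
        // {"version":5,"size":<bytes written>}.
        writer.create_checkpoint_for_version(5).await?;
        Ok(())
    }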
+ +use arrow::datatypes::Schema as ArrowSchema; +use arrow::error::ArrowError; +use arrow::json::reader::ReaderBuilder; +use log::*; +use parquet::arrow::ArrowWriter; +use parquet::errors::ParquetError; +use parquet::file::writer::InMemoryWriteableCursor; +use serde::Serialize; +use std::convert::TryFrom; +use std::io::Cursor; +use std::sync::Arc; + +use super::action; +use super::open_table_with_version; +use super::schema::*; +use super::storage::{StorageBackend, StorageError}; +use super::{CheckPoint, DeltaTableError, DeltaTableState}; + +/// Error returned when the CheckPointWriter is unable to write a checkpoint. +#[derive(thiserror::Error, Debug)] +pub enum CheckPointWriterError { + /// Error returned when the DeltaTableState does not contain a metadata action. + #[error("DeltaTableMetadata not present in DeltaTableState")] + MissingMetaData, + /// Error returned when creating the checkpoint schema. + #[error("DeltaLogSchemaError: {source}")] + DeltaLogSchema { + /// The source DeltaLogSchemaError + #[from] + source: DeltaLogSchemaError, + }, + /// Passthrough error returned when calling DeltaTable. + #[error("DeltaTableError: {source}")] + DeltaTable { + /// The source DeltaTableError. + #[from] + source: DeltaTableError, + }, + /// Error returned when the parquet writer fails while writing the checkpoint. + #[error("Failed to write parquet: {}", .source)] + ParquetError { + /// Parquet error details returned when writing the checkpoint failed. + #[from] + source: ParquetError, + }, + /// Error returned when converting the schema to Arrow format failed. + #[error("Failed to convert into Arrow schema: {}", .source)] + ArrowError { + /// Arrow error details returned when converting the schema in Arrow format failed + #[from] + source: ArrowError, + }, + /// Passthrough error returned when calling StorageBackend. + #[error("StorageError: {source}")] + Storage { + /// The source StorageError. + #[from] + source: StorageError, + }, + /// Passthrough error returned by serde_json. + #[error("serde_json::Error: {source}")] + JSONSerialization { + /// The source serde_json::Error. + #[from] + source: serde_json::Error, + }, +} + +/// Struct for writing checkpoints to the delta log. +pub struct CheckPointWriter { + table_uri: String, + delta_log_path: String, + last_checkpoint_path: String, + storage: Box, + schema_factory: DeltaLogSchemaFactory, +} + +impl CheckPointWriter { + /// Creates a new CheckPointWriter. + pub fn new(table_uri: &str, storage: Box) -> Self { + let delta_log_path = storage.join_path(table_uri, "_delta_log"); + let last_checkpoint_path = storage.join_path(delta_log_path.as_str(), "_last_checkpoint"); + let schema_factory = DeltaLogSchemaFactory::new(); + + Self { + table_uri: table_uri.to_string(), + delta_log_path, + last_checkpoint_path, + storage, + schema_factory, + } + } + + /// Creates a new checkpoint at the specified version. + /// NOTE: This method loads a new instance of delta table to determine the state to + /// checkpoint. + pub async fn create_checkpoint_for_version( + &self, + version: DeltaDataTypeVersion, + ) -> Result<(), CheckPointWriterError> { + let table = open_table_with_version(self.table_uri.as_str(), version).await?; + + self.create_checkpoint_from_state(version, table.get_state()) + .await + } + + /// Creates a new checkpoint at the specified version from the given DeltaTableState. 
+ pub async fn create_checkpoint_from_state( + &self, + version: DeltaDataTypeVersion, + state: &DeltaTableState, + ) -> Result<(), CheckPointWriterError> { + // TODO: checkpoints _can_ be multi-part... haven't actually found a good reference for + // an appropriate split point yet though so only writing a single part currently. + + info!("Writing parquet bytes to checkpoint buffer."); + let parquet_bytes = self.parquet_bytes_from_state(state)?; + + let size = parquet_bytes.len() as i64; + + let checkpoint = CheckPoint::new(version, size, None); + + let file_name = format!("{:020}.checkpoint.parquet", version); + let checkpoint_path = self.storage.join_path(&self.delta_log_path, &file_name); + + info!("Writing checkpoint to {:?}.", checkpoint_path); + self.storage + .put_obj(&checkpoint_path, &parquet_bytes) + .await?; + + let last_checkpoint_content: serde_json::Value = serde_json::to_value(&checkpoint)?; + let last_checkpoint_content = serde_json::to_string(&last_checkpoint_content)?; + + info!( + "Writing _last_checkpoint to {:?}.", + self.last_checkpoint_path + ); + self.storage + .put_obj( + self.last_checkpoint_path.as_str(), + last_checkpoint_content.as_bytes(), + ) + .await?; + + Ok(()) + } + + fn parquet_bytes_from_state( + &self, + state: &DeltaTableState, + ) -> Result, CheckPointWriterError> { + let current_metadata = state + .current_metadata() + .ok_or(CheckPointWriterError::MissingMetaData)?; + + let mut json_buffer: Vec = Vec::new(); + + let protocol = action::Action::protocol(action::Protocol { + min_reader_version: state.min_reader_version(), + min_writer_version: state.min_writer_version(), + }); + extend_json_byte_buffer(&mut json_buffer, &protocol)?; + + let metadata = + action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?); + extend_json_byte_buffer(&mut json_buffer, &metadata)?; + + for add in state.files() { + let add = action::Action::add(add.clone()); + extend_json_byte_buffer(&mut json_buffer, &add)?; + } + + for remove in state.tombstones() { + let remove = action::Action::remove(remove.clone()); + extend_json_byte_buffer(&mut json_buffer, &remove)?; + } + + for (app_id, version) in state.app_transaction_version().iter() { + let txn = action::Action::txn(action::Txn { + app_id: app_id.clone(), + version: *version, + last_updated: None, + }); + extend_json_byte_buffer(&mut json_buffer, &txn)?; + } + + let checkpoint_schema = self.schema_factory.delta_log_schema_for_table( + ¤t_metadata.schema, + current_metadata.partition_columns.as_slice(), + )?; + let arrow_checkpoint_schema: ArrowSchema = + >::try_from(&checkpoint_schema)?; + let arrow_schema = Arc::new(arrow_checkpoint_schema); + + let cursor = Cursor::new(json_buffer); + + let mut json_reader = ReaderBuilder::new() + .with_schema(arrow_schema.clone()) + .build(cursor)?; + + debug!("Preparing checkpoint parquet buffer."); + + let writeable_cursor = InMemoryWriteableCursor::default(); + let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; + + debug!("Writing to checkpoint parquet buffer..."); + + while let Some(batch) = json_reader.next()? 
{ + writer.write(&batch)?; + } + + let _ = writer.close()?; + + debug!("Finished writing checkpoint parquet buffer."); + + Ok(writeable_cursor.data()) + } +} + +fn extend_json_byte_buffer( + json_byte_buffer: &mut Vec, + json_value: &T, +) -> Result<(), serde_json::error::Error> +where + T: ?Sized + Serialize, +{ + json_byte_buffer.extend(serde_json::to_vec(json_value)?); + json_byte_buffer.push(b'\n'); + + Ok(()) +} diff --git a/rust/src/delta.rs b/rust/src/delta.rs index d83381111d..b39d348d8a 100644 --- a/rust/src/delta.rs +++ b/rust/src/delta.rs @@ -3,16 +3,12 @@ // Reference: https://github.com/delta-io/delta/blob/master/PROTOCOL.md // -use arrow::datatypes::Schema as ArrowSchema; use arrow::error::ArrowError; -use arrow::json::reader::ReaderBuilder; use chrono::{DateTime, FixedOffset, Utc}; use futures::StreamExt; use lazy_static::lazy_static; use log::*; -use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; -use parquet::file::writer::InMemoryWriteableCursor; use parquet::file::{ reader::{FileReader, SerializedFileReader}, serialized_reader::SliceableCursor, @@ -24,7 +20,6 @@ use std::collections::HashMap; use std::convert::TryFrom; use std::fmt; use std::io::{BufRead, BufReader, Cursor}; -use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; use std::{cmp::Ordering, collections::HashSet}; use uuid::Uuid; @@ -45,6 +40,21 @@ pub struct CheckPoint { parts: Option, // 10 digits decimals } +impl CheckPoint { + /// Creates a new checkpoint from the given parameters. + pub(crate) fn new( + version: DeltaDataTypeVersion, + size: DeltaDataTypeLong, + parts: Option, + ) -> Self { + Self { + version, + size, + parts, + } + } +} + impl PartialEq for CheckPoint { fn eq(&self, other: &Self) -> bool { self.version == other.version @@ -322,27 +332,6 @@ pub struct DeltaTableState { } impl DeltaTableState { - /// Creates a new instance of DeltaTableState from the supplied components. - pub fn new( - tombstones: Vec, - files: Vec, - commit_infos: Vec, - app_transaction_version: HashMap, - min_reader_version: i32, - min_writer_version: i32, - current_metadata: Option, - ) -> Self { - Self { - tombstones, - files, - commit_infos, - app_transaction_version, - min_reader_version, - min_writer_version, - current_metadata, - } - } - /// Full list of tombstones (remove actions) representing files removed from table state). pub fn tombstones(&self) -> &Vec { self.tombstones.as_ref() @@ -1457,224 +1446,6 @@ fn log_entry_from_actions(actions: &[Action]) -> Result, - schema_factory: DeltaLogSchemaFactory, -} - -impl CheckPointWriter { - /// Creates a new CheckPointWriter. - pub fn new(table_uri: &str, storage: Box) -> Self { - let delta_log_path = storage.join_path(table_uri, "_delta_log"); - let last_checkpoint_path = storage.join_path(delta_log_path.as_str(), "_last_checkpoint"); - let schema_factory = DeltaLogSchemaFactory::new(); - - Self { - table_uri: table_uri.to_string(), - delta_log_path, - last_checkpoint_path, - storage, - schema_factory, - } - } - - /// Creates a new checkpoint at the specified version. - /// NOTE: This method loads a new instance of delta table to determine the state to - /// checkpoint. 
- pub async fn create_checkpoint_for_version( - &self, - version: DeltaDataTypeVersion, - ) -> Result<(), CheckPointWriterError> { - let table = open_table_with_version(self.table_uri.as_str(), version).await?; - - self.create_checkpoint_from_state(version, table.get_state()) - .await - } - - /// Creates a new checkpoint at the specified version from the given DeltaTableState. - pub async fn create_checkpoint_from_state( - &self, - version: DeltaDataTypeVersion, - state: &DeltaTableState, - ) -> Result<(), CheckPointWriterError> { - // TODO: checkpoints _can_ be multi-part... haven't actually found a good reference for - // an appropriate split point yet though so only writing a single part currently. - - info!("Writing parquet bytes to checkpoint buffer."); - let parquet_bytes = self.parquet_bytes_from_state(state)?; - - let size = parquet_bytes.len() as i64; - - let checkpoint = CheckPoint { - version, - size, - parts: None, - }; - - let file_name = format!("{:020}.checkpoint.parquet", version); - let checkpoint_path = self.storage.join_path(&self.delta_log_path, &file_name); - - info!("Writing checkpoint to {:?}.", checkpoint_path); - self.storage - .put_obj(&checkpoint_path, &parquet_bytes) - .await?; - - let last_checkpoint_content: serde_json::Value = serde_json::to_value(&checkpoint)?; - let last_checkpoint_content = serde_json::to_string(&last_checkpoint_content)?; - - info!( - "Writing _last_checkpoint to {:?}.", - self.last_checkpoint_path - ); - self.storage - .put_obj( - self.last_checkpoint_path.as_str(), - last_checkpoint_content.as_bytes(), - ) - .await?; - - Ok(()) - } - - fn parquet_bytes_from_state( - &self, - state: &DeltaTableState, - ) -> Result, CheckPointWriterError> { - let current_metadata = state - .current_metadata() - .ok_or(CheckPointWriterError::MissingMetaData)?; - - let mut json_buffer: Vec = Vec::new(); - - let protocol = action::Action::protocol(action::Protocol { - min_reader_version: state.min_reader_version(), - min_writer_version: state.min_writer_version(), - }); - extend_json_byte_buffer(&mut json_buffer, &protocol)?; - - let metadata = - action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?); - extend_json_byte_buffer(&mut json_buffer, &metadata)?; - - for add in state.files() { - let add = action::Action::add(add.clone()); - extend_json_byte_buffer(&mut json_buffer, &add)?; - } - - for remove in state.tombstones() { - let remove = action::Action::remove(remove.clone()); - extend_json_byte_buffer(&mut json_buffer, &remove)?; - } - - for (app_id, version) in state.app_transaction_version().iter() { - let txn = action::Action::txn(action::Txn { - app_id: app_id.clone(), - version: *version, - last_updated: None, - }); - extend_json_byte_buffer(&mut json_buffer, &txn)?; - } - - let checkpoint_schema = self.schema_factory.delta_log_schema_for_table( - ¤t_metadata.schema, - current_metadata.partition_columns.as_slice(), - )?; - let arrow_checkpoint_schema: ArrowSchema = - >::try_from(&checkpoint_schema)?; - let arrow_schema = Arc::new(arrow_checkpoint_schema); - - let cursor = Cursor::new(json_buffer); - - let mut json_reader = ReaderBuilder::new() - .with_schema(arrow_schema.clone()) - .build(cursor)?; - - debug!("Preparing checkpoint parquet buffer."); - - let writeable_cursor = InMemoryWriteableCursor::default(); - let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; - - debug!("Writing to checkpoint parquet buffer..."); - - while let Some(batch) = json_reader.next()? 
{ - writer.write(&batch)?; - } - - let _ = writer.close()?; - - debug!("Finished writing checkpoint parquet buffer."); - - Ok(writeable_cursor.data()) - } -} - -fn extend_json_byte_buffer( - json_byte_buffer: &mut Vec, - json_value: &T, -) -> Result<(), serde_json::error::Error> -where - T: ?Sized + Serialize, -{ - json_byte_buffer.extend(serde_json::to_vec(json_value)?); - json_byte_buffer.push(b'\n'); - - Ok(()) -} - fn process_action( state: &mut DeltaTableState, action: Action, diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 226306fc7d..0bc128e73a 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -58,6 +58,7 @@ extern crate serde_json; extern crate thiserror; pub mod action; +pub mod checkpoints; mod delta; pub mod delta_arrow; pub mod partitions; diff --git a/rust/tests/checkpoint_writer_test.rs b/rust/tests/checkpoint_writer_test.rs index 58b06e2907..a6e8edf584 100644 --- a/rust/tests/checkpoint_writer_test.rs +++ b/rust/tests/checkpoint_writer_test.rs @@ -1,7 +1,7 @@ extern crate deltalake; +use deltalake::checkpoints::CheckPointWriter; use deltalake::storage; -use deltalake::CheckPointWriter; use std::fs; use std::path::{Path, PathBuf}; From 5d5d26d6c786d6827b437a9775093ecd59d1ce31 Mon Sep 17 00:00:00 2001 From: xianwill Date: Wed, 9 Jun 2021 12:46:06 -0400 Subject: [PATCH 13/20] Stop calling URIs as paths in checkpoint writer --- rust/src/checkpoints.rs | 69 ++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs index 42c58e49b0..f9311d6ff3 100644 --- a/rust/src/checkpoints.rs +++ b/rust/src/checkpoints.rs @@ -71,8 +71,8 @@ pub enum CheckPointWriterError { /// Struct for writing checkpoints to the delta log. pub struct CheckPointWriter { table_uri: String, - delta_log_path: String, - last_checkpoint_path: String, + delta_log_uri: String, + last_checkpoint_uri: String, storage: Box, schema_factory: DeltaLogSchemaFactory, } @@ -80,14 +80,14 @@ pub struct CheckPointWriter { impl CheckPointWriter { /// Creates a new CheckPointWriter. 
pub fn new(table_uri: &str, storage: Box) -> Self { - let delta_log_path = storage.join_path(table_uri, "_delta_log"); - let last_checkpoint_path = storage.join_path(delta_log_path.as_str(), "_last_checkpoint"); + let delta_log_uri = storage.join_path(table_uri, "_delta_log"); + let last_checkpoint_uri = storage.join_path(delta_log_uri.as_str(), "_last_checkpoint"); let schema_factory = DeltaLogSchemaFactory::new(); Self { table_uri: table_uri.to_string(), - delta_log_path, - last_checkpoint_path, + delta_log_uri, + last_checkpoint_uri, storage, schema_factory, } @@ -123,11 +123,11 @@ impl CheckPointWriter { let checkpoint = CheckPoint::new(version, size, None); let file_name = format!("{:020}.checkpoint.parquet", version); - let checkpoint_path = self.storage.join_path(&self.delta_log_path, &file_name); + let checkpoint_uri = self.storage.join_path(&self.delta_log_uri, &file_name); - info!("Writing checkpoint to {:?}.", checkpoint_path); + info!("Writing checkpoint to {:?}.", checkpoint_uri); self.storage - .put_obj(&checkpoint_path, &parquet_bytes) + .put_obj(&checkpoint_uri, &parquet_bytes) .await?; let last_checkpoint_content: serde_json::Value = serde_json::to_value(&checkpoint)?; @@ -135,11 +135,11 @@ impl CheckPointWriter { info!( "Writing _last_checkpoint to {:?}.", - self.last_checkpoint_path + self.last_checkpoint_uri ); self.storage .put_obj( - self.last_checkpoint_path.as_str(), + self.last_checkpoint_uri.as_str(), last_checkpoint_content.as_bytes(), ) .await?; @@ -155,6 +155,53 @@ impl CheckPointWriter { .current_metadata() .ok_or(CheckPointWriterError::MissingMetaData)?; + // let jsons: Iterator> = [ + // action::Action::protocol(action::Protocol { + // min_reader_version: state.min_reader_version(), + // min_writer_version: state.min_writer_version(), + // }), + // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), + // ]; + // + + // let things: Vec = state + // .files() + // .iter() + // .map(|f| action::Action::add(f.clone())) + // .collect(); + + // let jsons: dyn Iterator = [ + // action::Action::protocol(action::Protocol { + // min_reader_version: state.min_reader_version(), + // min_writer_version: state.min_writer_version(), + // }), + // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), + // ] + // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))); + + // jsons + // .iter() + // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))) + // .chain( + // state + // .tombstones() + // .iter() + // .map(|t| action::Action::remove(t.clone())), + // ) + // .chain( + // state + // .app_transaction_version() + // .iter() + // .map(|(app_id, version)| { + // action::Action::txn(action::Txn { + // app_id: app_id.clone(), + // version: *version, + // last_updated: None, + // }) + // }), + // ) + // .map(|action| Ok(action.into())); + let mut json_buffer: Vec = Vec::new(); let protocol = action::Action::protocol(action::Protocol { From 54512740929a1a163fb5b266f6edb788ba182933 Mon Sep 17 00:00:00 2001 From: xianwill Date: Wed, 9 Jun 2021 17:38:10 -0400 Subject: [PATCH 14/20] Change checkpoint writer action serialization to use decoder --- rust/' | 324 ++++++++++++++++++++++++++++++++++++++++ rust/src/checkpoints.rs | 140 +++++------------ rust/src/writer.rs | 4 +- 3 files changed, 364 insertions(+), 104 deletions(-) create mode 100644 rust/' diff --git a/rust/' b/rust/' new file mode 100644 index 0000000000..105f38e4fc --- /dev/null +++ b/rust/' @@ -0,0 +1,324 @@ +//! 
Implementation for writing delta checkpoints. + +use arrow::datatypes::Schema as ArrowSchema; +use arrow::error::ArrowError; +use arrow::json::reader::ReaderBuilder; +use log::*; +use parquet::arrow::ArrowWriter; +use parquet::errors::ParquetError; +use parquet::file::writer::InMemoryWriteableCursor; +use serde::Serialize; +use std::convert::TryFrom; +use std::io::Cursor; +use std::sync::Arc; + +use super::action; +use super::open_table_with_version; +use super::schema::*; +use super::storage::{StorageBackend, StorageError}; +use super::{CheckPoint, DeltaTableError, DeltaTableState}; + +/// Error returned when the CheckPointWriter is unable to write a checkpoint. +#[derive(thiserror::Error, Debug)] +pub enum CheckPointWriterError { + /// Error returned when the DeltaTableState does not contain a metadata action. + #[error("DeltaTableMetadata not present in DeltaTableState")] + MissingMetaData, + /// Error returned when creating the checkpoint schema. + #[error("DeltaLogSchemaError: {source}")] + DeltaLogSchema { + /// The source DeltaLogSchemaError + #[from] + source: DeltaLogSchemaError, + }, + /// Passthrough error returned when calling DeltaTable. + #[error("DeltaTableError: {source}")] + DeltaTable { + /// The source DeltaTableError. + #[from] + source: DeltaTableError, + }, + /// Error returned when the parquet writer fails while writing the checkpoint. + #[error("Failed to write parquet: {}", .source)] + ParquetError { + /// Parquet error details returned when writing the checkpoint failed. + #[from] + source: ParquetError, + }, + /// Error returned when converting the schema to Arrow format failed. + #[error("Failed to convert into Arrow schema: {}", .source)] + ArrowError { + /// Arrow error details returned when converting the schema in Arrow format failed + #[from] + source: ArrowError, + }, + /// Passthrough error returned when calling StorageBackend. + #[error("StorageError: {source}")] + Storage { + /// The source StorageError. + #[from] + source: StorageError, + }, + /// Passthrough error returned by serde_json. + #[error("serde_json::Error: {source}")] + JSONSerialization { + /// The source serde_json::Error. + #[from] + source: serde_json::Error, + }, +} + +/// Struct for writing checkpoints to the delta log. +pub struct CheckPointWriter { + table_uri: String, + delta_log_uri: String, + last_checkpoint_uri: String, + storage: Box, + schema_factory: DeltaLogSchemaFactory, +} + +impl CheckPointWriter { + /// Creates a new CheckPointWriter. + pub fn new(table_uri: &str, storage: Box) -> Self { + let delta_log_uri = storage.join_path(table_uri, "_delta_log"); + let last_checkpoint_uri = storage.join_path(delta_log_uri.as_str(), "_last_checkpoint"); + let schema_factory = DeltaLogSchemaFactory::new(); + + Self { + table_uri: table_uri.to_string(), + delta_log_uri, + last_checkpoint_uri, + storage, + schema_factory, + } + } + + /// Creates a new checkpoint at the specified version. + /// NOTE: This method loads a new instance of delta table to determine the state to + /// checkpoint. + pub async fn create_checkpoint_for_version( + &self, + version: DeltaDataTypeVersion, + ) -> Result<(), CheckPointWriterError> { + let table = open_table_with_version(self.table_uri.as_str(), version).await?; + + self.create_checkpoint_from_state(version, table.get_state()) + .await + } + + /// Creates a new checkpoint at the specified version from the given DeltaTableState. 
+ pub async fn create_checkpoint_from_state( + &self, + version: DeltaDataTypeVersion, + state: &DeltaTableState, + ) -> Result<(), CheckPointWriterError> { + // TODO: checkpoints _can_ be multi-part... haven't actually found a good reference for + // an appropriate split point yet though so only writing a single part currently. + + info!("Writing parquet bytes to checkpoint buffer."); + let parquet_bytes = self.parquet_bytes_from_state(state)?; + + let size = parquet_bytes.len() as i64; + + let checkpoint = CheckPoint::new(version, size, None); + + let file_name = format!("{:020}.checkpoint.parquet", version); + let checkpoint_uri = self.storage.join_path(&self.delta_log_uri, &file_name); + + info!("Writing checkpoint to {:?}.", checkpoint_uri); + self.storage + .put_obj(&checkpoint_uri, &parquet_bytes) + .await?; + + let last_checkpoint_content: serde_json::Value = serde_json::to_value(&checkpoint)?; + let last_checkpoint_content = serde_json::to_string(&last_checkpoint_content)?; + + info!( + "Writing _last_checkpoint to {:?}.", + self.last_checkpoint_uri + ); + self.storage + .put_obj( + self.last_checkpoint_uri.as_str(), + last_checkpoint_content.as_bytes(), + ) + .await?; + + Ok(()) + } + + fn parquet_bytes_from_state( + &self, + state: &DeltaTableState, + ) -> Result, CheckPointWriterError> { + let current_metadata = state + .current_metadata() + .ok_or(CheckPointWriterError::MissingMetaData)?; + + // let adds: Vec = state + let adds = state + .files() + .iter() + .map(|f| action::Action::add(f.clone())) + // .collect() + // TODO: + ; + + let removes = state + .tombstones() + .iter() + .map(|f| action::Action::remove(f.clone())); + + let jsons: Vec = /*[ + // let jsons: Iterator = [ + action::Action::protocol(action::Protocol { + min_reader_version: state.min_reader_version(), + min_writer_version: state.min_writer_version(), + }), + action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), + ]*/ + adds + // .iter() + // .chain(adds) + .chain(removes) + .map(|v| serde_json::to_value(v).unwrap()) + // TODO: + .collect() + // TODO: + ; + + // let jsons: Iterator = [ + // action::Action::protocol(action::Protocol { + // min_reader_version: state.min_reader_version(), + // min_writer_version: state.min_writer_version(), + // }), + // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), + // ] + // // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))) + // // TODO: + // ; + + // let jsons: Iterator> = [ + // action::Action::protocol(action::Protocol { + // min_reader_version: state.min_reader_version(), + // min_writer_version: state.min_writer_version(), + // }), + // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), + // ]; + + // let things: Vec = state + // .files() + // .iter() + // .map(|f| action::Action::add(f.clone())) + // .collect(); + + // let jsons: dyn Iterator = [ + // action::Action::protocol(action::Protocol { + // min_reader_version: state.min_reader_version(), + // min_writer_version: state.min_writer_version(), + // }), + // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), + // ] + // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))); + + // jsons + // .iter() + // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))) + // .chain( + // state + // .tombstones() + // .iter() + // .map(|t| action::Action::remove(t.clone())), + // ) + // .chain( + // state + // .app_transaction_version() + // .iter() + // 
.map(|(app_id, version)| { + // action::Action::txn(action::Txn { + // app_id: app_id.clone(), + // version: *version, + // last_updated: None, + // }) + // }), + // ) + // .map(|action| Ok(action.into())); + + todo!() + + // let mut json_buffer: Vec = Vec::new(); + + // let protocol = action::Action::protocol(action::Protocol { + // min_reader_version: state.min_reader_version(), + // min_writer_version: state.min_writer_version(), + // }); + // extend_json_byte_buffer(&mut json_buffer, &protocol)?; + + // let metadata = + // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?); + // extend_json_byte_buffer(&mut json_buffer, &metadata)?; + + // for add in state.files() { + // let add = action::Action::add(add.clone()); + // extend_json_byte_buffer(&mut json_buffer, &add)?; + // } + + // for remove in state.tombstones() { + // let remove = action::Action::remove(remove.clone()); + // extend_json_byte_buffer(&mut json_buffer, &remove)?; + // } + + // for (app_id, version) in state.app_transaction_version().iter() { + // let txn = action::Action::txn(action::Txn { + // app_id: app_id.clone(), + // version: *version, + // last_updated: None, + // }); + // extend_json_byte_buffer(&mut json_buffer, &txn)?; + // } + + // let checkpoint_schema = self.schema_factory.delta_log_schema_for_table( + // ¤t_metadata.schema, + // current_metadata.partition_columns.as_slice(), + // )?; + // let arrow_checkpoint_schema: ArrowSchema = + // >::try_from(&checkpoint_schema)?; + // let arrow_schema = Arc::new(arrow_checkpoint_schema); + + // let cursor = Cursor::new(json_buffer); + + // let mut json_reader = ReaderBuilder::new() + // .with_schema(arrow_schema.clone()) + // .build(cursor)?; + + // debug!("Preparing checkpoint parquet buffer."); + + // let writeable_cursor = InMemoryWriteableCursor::default(); + // let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; + + // debug!("Writing to checkpoint parquet buffer..."); + + // while let Some(batch) = json_reader.next()? { + // writer.write(&batch)?; + // } + + // let _ = writer.close()?; + + // debug!("Finished writing checkpoint parquet buffer."); + + // Ok(writeable_cursor.data()) + } +} + +// fn extend_json_byte_buffer( +// json_byte_buffer: &mut Vec, +// json_value: &T, +// ) -> Result<(), serde_json::error::Error> +// where +// T: ?Sized + Serialize, +// { +// json_byte_buffer.extend(serde_json::to_vec(json_value)?); +// json_byte_buffer.push(b'\n'); + +// Ok(()) +// } diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs index f9311d6ff3..bf93e0890d 100644 --- a/rust/src/checkpoints.rs +++ b/rust/src/checkpoints.rs @@ -2,20 +2,19 @@ use arrow::datatypes::Schema as ArrowSchema; use arrow::error::ArrowError; -use arrow::json::reader::ReaderBuilder; +use arrow::json::reader::Decoder; use log::*; use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; use parquet::file::writer::InMemoryWriteableCursor; -use serde::Serialize; use std::convert::TryFrom; -use std::io::Cursor; use std::sync::Arc; use super::action; use super::open_table_with_version; use super::schema::*; use super::storage::{StorageBackend, StorageError}; +use super::writer::InMemValueIter; use super::{CheckPoint, DeltaTableError, DeltaTableState}; /// Error returned when the CheckPointWriter is unable to write a checkpoint. 
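Decoder::next_batch consumes an iterator of Result<Value, ArrowError> rather than a byte stream, which is why the buffered action values can be fed to it directly through InMemValueIter from writer.rs. The Iterator impl itself is not shown in this diff; it presumably amounts to something like the following sketch (field names match the struct made pub(crate) in writer.rs below):

    impl<'a> Iterator for InMemValueIter<'a> {
        type Item = Result<serde_json::Value, arrow::error::ArrowError>;

        fn next(&mut self) -> Option<Self::Item> {
            // Hand each buffered JSON value to the decoder; returning None ends the
            // while-let batch loop in parquet_bytes_from_state.
            let item = self.buffer.get(self.current_index);
            self.current_index += 1;
            item.map(|v| Ok(v.to_owned()))
        }
    }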
@@ -155,83 +154,37 @@ impl CheckPointWriter { .current_metadata() .ok_or(CheckPointWriterError::MissingMetaData)?; - // let jsons: Iterator> = [ - // action::Action::protocol(action::Protocol { - // min_reader_version: state.min_reader_version(), - // min_writer_version: state.min_writer_version(), - // }), - // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), - // ]; - // + let jsons: Vec = vec![ + action::Action::protocol(action::Protocol { + min_reader_version: state.min_reader_version(), + min_writer_version: state.min_writer_version(), + }), + action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), + ] + .into_iter() + .chain(state.files().iter().map(|f| action::Action::add(f.clone()))) + .chain( + state + .tombstones() + .iter() + .map(|f| action::Action::remove(f.clone())), + ) + .chain( + state + .app_transaction_version() + .iter() + .map(|(app_id, version)| { + action::Action::txn(action::Txn { + app_id: app_id.clone(), + version: *version, + last_updated: None, + }) + }), + ) + .filter_map(|a| serde_json::to_value(a).ok()) + .collect(); - // let things: Vec = state - // .files() - // .iter() - // .map(|f| action::Action::add(f.clone())) - // .collect(); - - // let jsons: dyn Iterator = [ - // action::Action::protocol(action::Protocol { - // min_reader_version: state.min_reader_version(), - // min_writer_version: state.min_writer_version(), - // }), - // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), - // ] - // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))); - - // jsons - // .iter() - // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))) - // .chain( - // state - // .tombstones() - // .iter() - // .map(|t| action::Action::remove(t.clone())), - // ) - // .chain( - // state - // .app_transaction_version() - // .iter() - // .map(|(app_id, version)| { - // action::Action::txn(action::Txn { - // app_id: app_id.clone(), - // version: *version, - // last_updated: None, - // }) - // }), - // ) - // .map(|action| Ok(action.into())); - - let mut json_buffer: Vec = Vec::new(); - - let protocol = action::Action::protocol(action::Protocol { - min_reader_version: state.min_reader_version(), - min_writer_version: state.min_writer_version(), - }); - extend_json_byte_buffer(&mut json_buffer, &protocol)?; - - let metadata = - action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?); - extend_json_byte_buffer(&mut json_buffer, &metadata)?; - - for add in state.files() { - let add = action::Action::add(add.clone()); - extend_json_byte_buffer(&mut json_buffer, &add)?; - } - - for remove in state.tombstones() { - let remove = action::Action::remove(remove.clone()); - extend_json_byte_buffer(&mut json_buffer, &remove)?; - } - - for (app_id, version) in state.app_transaction_version().iter() { - let txn = action::Action::txn(action::Txn { - app_id: app_id.clone(), - version: *version, - last_updated: None, - }); - extend_json_byte_buffer(&mut json_buffer, &txn)?; - } + debug!("Preparing checkpoint parquet buffer."); let checkpoint_schema = self.schema_factory.delta_log_schema_for_table( ¤t_metadata.schema, @@ -241,20 +194,16 @@ impl CheckPointWriter { >::try_from(&checkpoint_schema)?; let arrow_schema = Arc::new(arrow_checkpoint_schema); - let cursor = Cursor::new(json_buffer); - - let mut json_reader = ReaderBuilder::new() - .with_schema(arrow_schema.clone()) - .build(cursor)?; - - debug!("Preparing checkpoint parquet buffer."); - let 
writeable_cursor = InMemoryWriteableCursor::default(); - let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; + let mut writer = + ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema.clone(), None)?; debug!("Writing to checkpoint parquet buffer..."); - while let Some(batch) = json_reader.next()? { + let mut value_iter = InMemValueIter::from_vec(jsons.as_slice()); + let decoder = Decoder::new(arrow_schema, jsons.len(), None); + + while let Some(batch) = decoder.next_batch(&mut value_iter)? { writer.write(&batch)?; } @@ -265,16 +214,3 @@ impl CheckPointWriter { Ok(writeable_cursor.data()) } } - -fn extend_json_byte_buffer( - json_byte_buffer: &mut Vec, - json_value: &T, -) -> Result<(), serde_json::error::Error> -where - T: ?Sized + Serialize, -{ - json_byte_buffer.extend(serde_json::to_vec(json_value)?); - json_byte_buffer.push(b'\n'); - - Ok(()) -} diff --git a/rust/src/writer.rs b/rust/src/writer.rs index d2eede8703..33a631a268 100644 --- a/rust/src/writer.rs +++ b/rust/src/writer.rs @@ -176,13 +176,13 @@ impl ParquetBuffer { } } -struct InMemValueIter<'a> { +pub(crate) struct InMemValueIter<'a> { buffer: &'a [Value], current_index: usize, } impl<'a> InMemValueIter<'a> { - fn from_vec(buffer: &'a [Value]) -> Self { + pub(crate) fn from_vec(buffer: &'a [Value]) -> Self { Self { buffer, current_index: 0, From ef166747f47f324f83ddc57bcac1e4781679e274 Mon Sep 17 00:00:00 2001 From: xianwill Date: Thu, 10 Jun 2021 10:28:06 -0400 Subject: [PATCH 15/20] Write Arrow schema for delta log directly instead of deserializing from json --- rust/' | 324 -------------- rust/src/checkpoints.rs | 31 +- rust/src/delta_arrow.rs | 136 +++++- rust/src/schema.rs | 967 ++++++++++++++++++++-------------------- 4 files changed, 626 insertions(+), 832 deletions(-) delete mode 100644 rust/' diff --git a/rust/' b/rust/' deleted file mode 100644 index 105f38e4fc..0000000000 --- a/rust/' +++ /dev/null @@ -1,324 +0,0 @@ -//! Implementation for writing delta checkpoints. - -use arrow::datatypes::Schema as ArrowSchema; -use arrow::error::ArrowError; -use arrow::json::reader::ReaderBuilder; -use log::*; -use parquet::arrow::ArrowWriter; -use parquet::errors::ParquetError; -use parquet::file::writer::InMemoryWriteableCursor; -use serde::Serialize; -use std::convert::TryFrom; -use std::io::Cursor; -use std::sync::Arc; - -use super::action; -use super::open_table_with_version; -use super::schema::*; -use super::storage::{StorageBackend, StorageError}; -use super::{CheckPoint, DeltaTableError, DeltaTableState}; - -/// Error returned when the CheckPointWriter is unable to write a checkpoint. -#[derive(thiserror::Error, Debug)] -pub enum CheckPointWriterError { - /// Error returned when the DeltaTableState does not contain a metadata action. - #[error("DeltaTableMetadata not present in DeltaTableState")] - MissingMetaData, - /// Error returned when creating the checkpoint schema. - #[error("DeltaLogSchemaError: {source}")] - DeltaLogSchema { - /// The source DeltaLogSchemaError - #[from] - source: DeltaLogSchemaError, - }, - /// Passthrough error returned when calling DeltaTable. - #[error("DeltaTableError: {source}")] - DeltaTable { - /// The source DeltaTableError. - #[from] - source: DeltaTableError, - }, - /// Error returned when the parquet writer fails while writing the checkpoint. - #[error("Failed to write parquet: {}", .source)] - ParquetError { - /// Parquet error details returned when writing the checkpoint failed. 
- #[from] - source: ParquetError, - }, - /// Error returned when converting the schema to Arrow format failed. - #[error("Failed to convert into Arrow schema: {}", .source)] - ArrowError { - /// Arrow error details returned when converting the schema in Arrow format failed - #[from] - source: ArrowError, - }, - /// Passthrough error returned when calling StorageBackend. - #[error("StorageError: {source}")] - Storage { - /// The source StorageError. - #[from] - source: StorageError, - }, - /// Passthrough error returned by serde_json. - #[error("serde_json::Error: {source}")] - JSONSerialization { - /// The source serde_json::Error. - #[from] - source: serde_json::Error, - }, -} - -/// Struct for writing checkpoints to the delta log. -pub struct CheckPointWriter { - table_uri: String, - delta_log_uri: String, - last_checkpoint_uri: String, - storage: Box, - schema_factory: DeltaLogSchemaFactory, -} - -impl CheckPointWriter { - /// Creates a new CheckPointWriter. - pub fn new(table_uri: &str, storage: Box) -> Self { - let delta_log_uri = storage.join_path(table_uri, "_delta_log"); - let last_checkpoint_uri = storage.join_path(delta_log_uri.as_str(), "_last_checkpoint"); - let schema_factory = DeltaLogSchemaFactory::new(); - - Self { - table_uri: table_uri.to_string(), - delta_log_uri, - last_checkpoint_uri, - storage, - schema_factory, - } - } - - /// Creates a new checkpoint at the specified version. - /// NOTE: This method loads a new instance of delta table to determine the state to - /// checkpoint. - pub async fn create_checkpoint_for_version( - &self, - version: DeltaDataTypeVersion, - ) -> Result<(), CheckPointWriterError> { - let table = open_table_with_version(self.table_uri.as_str(), version).await?; - - self.create_checkpoint_from_state(version, table.get_state()) - .await - } - - /// Creates a new checkpoint at the specified version from the given DeltaTableState. - pub async fn create_checkpoint_from_state( - &self, - version: DeltaDataTypeVersion, - state: &DeltaTableState, - ) -> Result<(), CheckPointWriterError> { - // TODO: checkpoints _can_ be multi-part... haven't actually found a good reference for - // an appropriate split point yet though so only writing a single part currently. 
- - info!("Writing parquet bytes to checkpoint buffer."); - let parquet_bytes = self.parquet_bytes_from_state(state)?; - - let size = parquet_bytes.len() as i64; - - let checkpoint = CheckPoint::new(version, size, None); - - let file_name = format!("{:020}.checkpoint.parquet", version); - let checkpoint_uri = self.storage.join_path(&self.delta_log_uri, &file_name); - - info!("Writing checkpoint to {:?}.", checkpoint_uri); - self.storage - .put_obj(&checkpoint_uri, &parquet_bytes) - .await?; - - let last_checkpoint_content: serde_json::Value = serde_json::to_value(&checkpoint)?; - let last_checkpoint_content = serde_json::to_string(&last_checkpoint_content)?; - - info!( - "Writing _last_checkpoint to {:?}.", - self.last_checkpoint_uri - ); - self.storage - .put_obj( - self.last_checkpoint_uri.as_str(), - last_checkpoint_content.as_bytes(), - ) - .await?; - - Ok(()) - } - - fn parquet_bytes_from_state( - &self, - state: &DeltaTableState, - ) -> Result, CheckPointWriterError> { - let current_metadata = state - .current_metadata() - .ok_or(CheckPointWriterError::MissingMetaData)?; - - // let adds: Vec = state - let adds = state - .files() - .iter() - .map(|f| action::Action::add(f.clone())) - // .collect() - // TODO: - ; - - let removes = state - .tombstones() - .iter() - .map(|f| action::Action::remove(f.clone())); - - let jsons: Vec = /*[ - // let jsons: Iterator = [ - action::Action::protocol(action::Protocol { - min_reader_version: state.min_reader_version(), - min_writer_version: state.min_writer_version(), - }), - action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), - ]*/ - adds - // .iter() - // .chain(adds) - .chain(removes) - .map(|v| serde_json::to_value(v).unwrap()) - // TODO: - .collect() - // TODO: - ; - - // let jsons: Iterator = [ - // action::Action::protocol(action::Protocol { - // min_reader_version: state.min_reader_version(), - // min_writer_version: state.min_writer_version(), - // }), - // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), - // ] - // // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))) - // // TODO: - // ; - - // let jsons: Iterator> = [ - // action::Action::protocol(action::Protocol { - // min_reader_version: state.min_reader_version(), - // min_writer_version: state.min_writer_version(), - // }), - // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), - // ]; - - // let things: Vec = state - // .files() - // .iter() - // .map(|f| action::Action::add(f.clone())) - // .collect(); - - // let jsons: dyn Iterator = [ - // action::Action::protocol(action::Protocol { - // min_reader_version: state.min_reader_version(), - // min_writer_version: state.min_writer_version(), - // }), - // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?), - // ] - // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))); - - // jsons - // .iter() - // .chain(state.files().iter().map(|f| action::Action::add(f.clone()))) - // .chain( - // state - // .tombstones() - // .iter() - // .map(|t| action::Action::remove(t.clone())), - // ) - // .chain( - // state - // .app_transaction_version() - // .iter() - // .map(|(app_id, version)| { - // action::Action::txn(action::Txn { - // app_id: app_id.clone(), - // version: *version, - // last_updated: None, - // }) - // }), - // ) - // .map(|action| Ok(action.into())); - - todo!() - - // let mut json_buffer: Vec = Vec::new(); - - // let protocol = action::Action::protocol(action::Protocol { - 
// min_reader_version: state.min_reader_version(), - // min_writer_version: state.min_writer_version(), - // }); - // extend_json_byte_buffer(&mut json_buffer, &protocol)?; - - // let metadata = - // action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?); - // extend_json_byte_buffer(&mut json_buffer, &metadata)?; - - // for add in state.files() { - // let add = action::Action::add(add.clone()); - // extend_json_byte_buffer(&mut json_buffer, &add)?; - // } - - // for remove in state.tombstones() { - // let remove = action::Action::remove(remove.clone()); - // extend_json_byte_buffer(&mut json_buffer, &remove)?; - // } - - // for (app_id, version) in state.app_transaction_version().iter() { - // let txn = action::Action::txn(action::Txn { - // app_id: app_id.clone(), - // version: *version, - // last_updated: None, - // }); - // extend_json_byte_buffer(&mut json_buffer, &txn)?; - // } - - // let checkpoint_schema = self.schema_factory.delta_log_schema_for_table( - // ¤t_metadata.schema, - // current_metadata.partition_columns.as_slice(), - // )?; - // let arrow_checkpoint_schema: ArrowSchema = - // >::try_from(&checkpoint_schema)?; - // let arrow_schema = Arc::new(arrow_checkpoint_schema); - - // let cursor = Cursor::new(json_buffer); - - // let mut json_reader = ReaderBuilder::new() - // .with_schema(arrow_schema.clone()) - // .build(cursor)?; - - // debug!("Preparing checkpoint parquet buffer."); - - // let writeable_cursor = InMemoryWriteableCursor::default(); - // let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema, None)?; - - // debug!("Writing to checkpoint parquet buffer..."); - - // while let Some(batch) = json_reader.next()? { - // writer.write(&batch)?; - // } - - // let _ = writer.close()?; - - // debug!("Finished writing checkpoint parquet buffer."); - - // Ok(writeable_cursor.data()) - } -} - -// fn extend_json_byte_buffer( -// json_byte_buffer: &mut Vec, -// json_value: &T, -// ) -> Result<(), serde_json::error::Error> -// where -// T: ?Sized + Serialize, -// { -// json_byte_buffer.extend(serde_json::to_vec(json_value)?); -// json_byte_buffer.push(b'\n'); - -// Ok(()) -// } diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs index bf93e0890d..e1aae7fccc 100644 --- a/rust/src/checkpoints.rs +++ b/rust/src/checkpoints.rs @@ -8,13 +8,12 @@ use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; use parquet::file::writer::InMemoryWriteableCursor; use std::convert::TryFrom; -use std::sync::Arc; use super::action; +use super::delta_arrow::delta_log_schema_for_table; use super::open_table_with_version; use super::schema::*; use super::storage::{StorageBackend, StorageError}; -use super::writer::InMemValueIter; use super::{CheckPoint, DeltaTableError, DeltaTableState}; /// Error returned when the CheckPointWriter is unable to write a checkpoint. @@ -23,13 +22,6 @@ pub enum CheckPointWriterError { /// Error returned when the DeltaTableState does not contain a metadata action. #[error("DeltaTableMetadata not present in DeltaTableState")] MissingMetaData, - /// Error returned when creating the checkpoint schema. - #[error("DeltaLogSchemaError: {source}")] - DeltaLogSchema { - /// The source DeltaLogSchemaError - #[from] - source: DeltaLogSchemaError, - }, /// Passthrough error returned when calling DeltaTable. 
#[error("DeltaTableError: {source}")] DeltaTable { @@ -73,7 +65,6 @@ pub struct CheckPointWriter { delta_log_uri: String, last_checkpoint_uri: String, storage: Box, - schema_factory: DeltaLogSchemaFactory, } impl CheckPointWriter { @@ -81,14 +72,12 @@ impl CheckPointWriter { pub fn new(table_uri: &str, storage: Box) -> Self { let delta_log_uri = storage.join_path(table_uri, "_delta_log"); let last_checkpoint_uri = storage.join_path(delta_log_uri.as_str(), "_last_checkpoint"); - let schema_factory = DeltaLogSchemaFactory::new(); Self { table_uri: table_uri.to_string(), delta_log_uri, last_checkpoint_uri, storage, - schema_factory, } } @@ -113,6 +102,7 @@ impl CheckPointWriter { ) -> Result<(), CheckPointWriterError> { // TODO: checkpoints _can_ be multi-part... haven't actually found a good reference for // an appropriate split point yet though so only writing a single part currently. + // Will post a research issue to follow-up on this later. info!("Writing parquet bytes to checkpoint buffer."); let parquet_bytes = self.parquet_bytes_from_state(state)?; @@ -154,7 +144,7 @@ impl CheckPointWriter { .current_metadata() .ok_or(CheckPointWriterError::MissingMetaData)?; - let jsons: Vec = vec![ + let jsons: Vec> = vec![ action::Action::protocol(action::Protocol { min_reader_version: state.min_reader_version(), min_writer_version: state.min_writer_version(), @@ -181,18 +171,15 @@ impl CheckPointWriter { }) }), ) - .filter_map(|a| serde_json::to_value(a).ok()) + .map(|a| serde_json::to_value(a).map_err(|e| ArrowError::from(e))) .collect(); debug!("Preparing checkpoint parquet buffer."); - let checkpoint_schema = self.schema_factory.delta_log_schema_for_table( - ¤t_metadata.schema, + let arrow_schema = delta_log_schema_for_table( + >::try_from(¤t_metadata.schema)?, current_metadata.partition_columns.as_slice(), - )?; - let arrow_checkpoint_schema: ArrowSchema = - >::try_from(&checkpoint_schema)?; - let arrow_schema = Arc::new(arrow_checkpoint_schema); + ); let writeable_cursor = InMemoryWriteableCursor::default(); let mut writer = @@ -200,13 +187,11 @@ impl CheckPointWriter { debug!("Writing to checkpoint parquet buffer..."); - let mut value_iter = InMemValueIter::from_vec(jsons.as_slice()); let decoder = Decoder::new(arrow_schema, jsons.len(), None); - + let mut value_iter = jsons.into_iter(); while let Some(batch) = decoder.next_batch(&mut value_iter)? { writer.write(&batch)?; } - let _ = writer.close()?; debug!("Finished writing checkpoint parquet buffer."); diff --git a/rust/src/delta_arrow.rs b/rust/src/delta_arrow.rs index d5cb045aae..e1d2dd7b31 100644 --- a/rust/src/delta_arrow.rs +++ b/rust/src/delta_arrow.rs @@ -2,7 +2,7 @@ use crate::schema; use arrow::datatypes::{ - DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, TimeUnit, + DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, SchemaRef, TimeUnit, }; use arrow::error::ArrowError; use lazy_static::lazy_static; @@ -136,3 +136,137 @@ impl TryFrom<&schema::SchemaDataType> for ArrowDataType { } } } + +pub(crate) fn delta_log_schema_for_table( + table_schema: ArrowSchema, + partition_columns: &[String], +) -> SchemaRef { + lazy_static! 
{ + static ref SCHEMA_FIELDS: Vec = vec![ + ArrowField::new( + "metaData", + ArrowDataType::Struct(vec![ + ArrowField::new("id", ArrowDataType::Utf8, true), + ArrowField::new("name", ArrowDataType::Utf8, true), + ArrowField::new("description", ArrowDataType::Utf8, true), + ArrowField::new("schemaString", ArrowDataType::Utf8, true), + ArrowField::new("createdTime", ArrowDataType::Int64, true), + ArrowField::new("partitionColumns", ArrowDataType::List(Box::new( + ArrowField::new("element", ArrowDataType::Utf8, true))), true), + ArrowField::new("format", ArrowDataType::Struct(vec![ + ArrowField::new("provider", ArrowDataType::Utf8, true), + // TODO: Add "options" after ArrowDataType::Map support + ]), true), + ]), + true + ), + ArrowField::new( + "protocol", + ArrowDataType::Struct(vec![ + ArrowField::new("minReaderVersion", ArrowDataType::Int32, true), + ArrowField::new("minWriterVersion", ArrowDataType::Int32, true), + ]), + true + ), + ArrowField::new( + "txn", + ArrowDataType::Struct(vec![ + ArrowField::new("appId", ArrowDataType::Utf8, true), + ArrowField::new("version", ArrowDataType::Int64, true), + ]), + true + ), + ArrowField::new( + "remove", + ArrowDataType::Struct(vec![ + ArrowField::new("path", ArrowDataType::Utf8, true), + ArrowField::new("deletionTimestamp", ArrowDataType::Int64, true), + ArrowField::new("dataChange", ArrowDataType::Boolean, true), + ArrowField::new("extendedFileMetadata", ArrowDataType::Boolean, true), + ArrowField::new("size", ArrowDataType::Int64, true), + // TODO: Add "partitionValues" after ArrowDataType::Map support + // TODO: Add "tags" after ArrowDataType::Map support + ]), + true + ) + ]; + static ref ADD_FIELDS: Vec = vec![ + ArrowField::new("path", ArrowDataType::Utf8, true), + ArrowField::new("size", ArrowDataType::Int64, true), + ArrowField::new("modificationTime", ArrowDataType::Int64, true), + ArrowField::new("dataChange", ArrowDataType::Boolean, true), + ArrowField::new("stats", ArrowDataType::Utf8, true), + // TODO: Add "partitionValues" after ArrowDataType::Map support + // TODO: Add "tags" after ArrowDataType::Map support + ]; + } + + let (partition_fields, non_partition_fields): (Vec, Vec) = table_schema + .fields() + .iter() + .map(|f| f.to_owned()) + .partition(|field| partition_columns.contains(&field.name())); + + let mut stats_parsed_fields: Vec = + vec![ArrowField::new("numRecords", ArrowDataType::Int64, true)]; + + if !non_partition_fields.is_empty() { + stats_parsed_fields.extend(["minValues", "maxValues", "nullCounts"].iter().map(|name| { + ArrowField::new( + name, + ArrowDataType::Struct(non_partition_fields.clone()), + true, + ) + })); + } + + let mut add_fields = ADD_FIELDS.clone(); + + add_fields.push(ArrowField::new( + "stats_parsed", + ArrowDataType::Struct(stats_parsed_fields), + true, + )); + + if !partition_fields.is_empty() { + add_fields.push(ArrowField::new( + "partitionValues_parsed", + ArrowDataType::Struct(partition_fields), + true, + )); + } + + let mut schema_fields = SCHEMA_FIELDS.clone(); + schema_fields.push(ArrowField::new( + "add", + ArrowDataType::Struct(add_fields), + true, + )); + + let arrow_schema = ArrowSchema::new(schema_fields); + + std::sync::Arc::new(arrow_schema) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn delta_log_schema_for_table_test() { + let table_schema = json!({ + "type": "struct", + "fields": [ + { "name": "pcol", "type": "integer", "nullable": true, "metadata": {} }, + { "name": "col1", "type": "integer", "nullable": true, "metadata": {} }, + ] + }); + let table_schema 
= serde_json::from_value(table_schema).unwrap(); + + let partition_columns = vec!["pcol".to_string()]; + + let _log_schema = delta_log_schema_for_table(table_schema, partition_columns.as_slice()); + + todo!("Add tests for delta_log_schema_for_table"); + } +} diff --git a/rust/src/schema.rs b/rust/src/schema.rs index b45c73f7c6..163026da80 100644 --- a/rust/src/schema.rs +++ b/rust/src/schema.rs @@ -1,7 +1,6 @@ #![allow(non_snake_case, non_camel_case_types)] use serde::{Deserialize, Serialize}; -use serde_json::json; use std::collections::HashMap; /// Type alias for a string expected to match a GUID/UUID format @@ -161,486 +160,486 @@ impl Schema { } } -/// Error representing a failure while creating the delta log schema. -#[derive(thiserror::Error, Debug)] -pub enum DeltaLogSchemaError { - /// Error returned when JSON de-serialization of schema components fails. - #[error("serde_json::Error: {source}")] - JSONSerialization { - /// The source serde_json::Error. - #[from] - source: serde_json::Error, - }, -} - -/// Factory for creating a Delta log schema for a specific table schema. -/// REF: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-schema -pub struct DeltaLogSchemaFactory { - common_fields: HashMap>, -} - -impl DeltaLogSchemaFactory { - /// Creates a new DeltaLogSchemaFactory which can be used to create Schema's representing the - /// Delta log for specific tables. - pub fn new() -> Self { - // TODO: map is not supported by arrow currently. - // See: - // * https://github.com/apache/arrow-rs/issues/395 - // * https://github.com/apache/arrow-rs/issues/396 - - let meta_data_fields = json!([ - { "name": "id", "type": "string", "nullable": true, "metadata": {} }, - { "name": "name", "type": "string", "nullable": true, "metadata": {} }, - { "name": "description", "type": "string", "nullable": true, "metadata": {} }, - { "name": "schemaString", "type": "string", "nullable": true, "metadata": {} }, - { "name": "createdTime", "type": "long", "nullable": true, "metadata": {} }, - { - "name": "partitionColumns", - "type": { - "type": "array", - "elementType": "string", - "containsNull": true, - }, - "nullable": true, - "metadata": {} }, - { - "name": "format", - "type": { - "type": "struct", - "fields": [{ - "name": "provider", - "type": "string", - "nullable": true, - "metadata": {}, - },/*{ - "name": "options", - "type": { - "type": "map", - "keyType": "string", - "valueType": "string", - "valueContainsNull": true, - }, - "nullable": true, - "metadata": {} - }*/] - }, - "nullable": true, - "metadata": {} - }, - /*{ - "name": "configuration", - "type": { - "type": "map", - "keyType": "string", - "valueType": "string", - "valueContainsNull": true, - }, - "nullable": true, - "metadata": {} - }*/]); - - let protocol_fields = json!([ - { "name": "minReaderVersion", "type": "integer", "nullable": true, "metadata": {} }, - { "name": "minWriterVersion", "type": "integer", "nullable": true, "metadata": {} } - ]); - - let txn_fields = json!([ - { "name": "appId", "type": "string", "nullable": true, "metadata": {} }, - { "name": "version", "type": "long", "nullable": true, "metadata": {} } - ]); - - let add_fields = json!([ - { "name": "path", "type": "string", "nullable": true, "metadata": {} }, - { "name": "size", "type": "long", "nullable": true, "metadata": {} }, - { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, - { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {} }, - { "name": "stats", "type": "string", "nullable": 
true, "metadata": {} }, - /*{ - "name": "partitionValues", - "type": { - "type": "map", - "keyType": "string", - "valueType": "string", - "valueContainsNull": true, - }, - "nullable": true, - "metadata": {}, - }*/ - ]); - - let remove_fields = json!([ - { "name": "path", "type": "string", "nullable": true, "metadata": {} }, - { "name": "size", "type": "long", "nullable": true, "metadata": {} }, - { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, - { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {}, }, - { "name": "stats", "type": "string", "nullable": true, "metadata": {}, - },/*{ - "name": "partitionValues", - "type": { - "type": "map", - "keyType": "string", - "valueType": "string", - "valueContainsNull": true, - }, - "nullable": true, - "metadata": {}, - - }*/]); - - let mut map = HashMap::new(); - - map.insert( - "metaData".to_string(), - serde_json::from_value(meta_data_fields).unwrap(), - ); - map.insert( - "protocol".to_string(), - serde_json::from_value(protocol_fields).unwrap(), - ); - map.insert( - "txn".to_string(), - serde_json::from_value(txn_fields).unwrap(), - ); - map.insert( - "add".to_string(), - serde_json::from_value(add_fields).unwrap(), - ); - map.insert( - "remove".to_string(), - serde_json::from_value(remove_fields).unwrap(), - ); - - Self { common_fields: map } - } - - /// Creates a Schema representing the delta log for a specific delta table. - /// Merges fields from the table schema into the delta log schema. - pub fn delta_log_schema_for_table( - &self, - table_schema: &Schema, - partition_columns: &[String], - ) -> Result { - let (partition_fields, non_partition_fields): (Vec, Vec) = - table_schema - .fields - .iter() - .map(|f| f.to_owned()) - .partition(|field| partition_columns.contains(&field.name)); - - let fields: Vec = self - .common_fields - .iter() - .map(|(name, fields)| match name.as_str() { - "add" => { - let mut fields = fields.clone(); - - if !partition_fields.is_empty() { - let partition_values_parsed = SchemaField { - name: "partitionValues_parsed".to_string(), - nullable: true, - metadata: HashMap::new(), - r#type: SchemaDataType::r#struct(SchemaTypeStruct { - r#type: "struct".to_string(), - fields: partition_fields.clone(), - }), - }; - fields.push(partition_values_parsed); - } - - if !non_partition_fields.is_empty() { - let min_values = SchemaField { - name: "minValues".to_string(), - nullable: true, - metadata: HashMap::new(), - r#type: SchemaDataType::r#struct(SchemaTypeStruct { - r#type: "struct".to_string(), - fields: non_partition_fields.clone(), - }), - }; - - let max_values = SchemaField { - name: "maxValues".to_string(), - nullable: true, - metadata: HashMap::new(), - r#type: SchemaDataType::r#struct(SchemaTypeStruct { - r#type: "struct".to_string(), - fields: non_partition_fields.clone(), - }), - }; - - let null_counts = SchemaField { - name: "nullCounts".to_string(), - nullable: true, - metadata: HashMap::new(), - r#type: SchemaDataType::r#struct(SchemaTypeStruct { - r#type: "struct".to_string(), - fields: non_partition_fields.clone(), - }), - }; - - let stats_parsed = SchemaField { - name: "stats_parsed".to_string(), - nullable: true, - metadata: HashMap::new(), - r#type: SchemaDataType::r#struct(SchemaTypeStruct { - r#type: "struct".to_string(), - fields: vec![min_values, max_values, null_counts], - }), - }; - - fields.push(stats_parsed); - } - - SchemaField { - name: name.clone(), - nullable: true, - metadata: HashMap::new(), - r#type: 
SchemaDataType::r#struct(SchemaTypeStruct { - r#type: "struct".to_string(), - fields, - }), - } - } - _ => SchemaField { - name: name.clone(), - nullable: true, - metadata: HashMap::new(), - r#type: SchemaDataType::r#struct(SchemaTypeStruct { - r#type: "struct".to_string(), - fields: fields.clone(), - }), - }, - }) - .collect(); - - Ok(Schema { - r#type: "struct".to_string(), - fields, - }) - } -} - -impl Default for DeltaLogSchemaFactory { - fn default() -> Self { - Self::new() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn delta_log_schema_factory_creates_schema() { - let factory = DeltaLogSchemaFactory::new(); - - let table_schema = json!({ - "type": "struct", - "fields": [ - { "name": "pcol", "type": "integer", "nullable": true, "metadata": {} }, - { "name": "col1", "type": "integer", "nullable": true, "metadata": {} }, - ] - }); - let table_schema = serde_json::from_value(table_schema).unwrap(); - - let partition_columns = vec!["pcol".to_string()]; - - let log_schema = factory - .delta_log_schema_for_table(&table_schema, partition_columns.as_slice()) - .unwrap(); - - assert_eq!("struct", log_schema.r#type); - assert_eq!(5, log_schema.get_fields().len()); - - for f in log_schema.get_fields().iter() { - match f.get_name() { - "txn" => { - if let SchemaDataType::r#struct(txn) = f.get_type() { - assert_eq!(2, txn.get_fields().len()); - for f in txn.get_fields().iter() { - match f.get_name() { - "appId" => { - assert_eq!( - SchemaDataType::primitive("string".to_string()), - f.get_type().to_owned() - ); - } - "version" => { - assert_eq!( - SchemaDataType::primitive("long".to_string()), - f.get_type().to_owned() - ); - } - _ => panic!("Unhandled schema field name"), - } - } - } else { - panic!("txn must be a struct"); - } - } - "protocol" => { - if let SchemaDataType::r#struct(protocol) = f.get_type() { - assert_eq!(2, protocol.get_fields().len()); - for f in protocol.get_fields().iter() { - match f.get_name() { - "minReaderVersion" | "minWriterVersion" => { - assert_eq!( - SchemaDataType::primitive("integer".to_string()), - f.get_type().to_owned() - ); - } - _ => panic!("Unhandled schema field name"), - } - } - } else { - panic!("protocol must be a struct"); - } - } - "metaData" => { - if let SchemaDataType::r#struct(metadata) = f.get_type() { - assert_eq!(7, metadata.get_fields().len()); - for f in metadata.get_fields().iter() { - match f.get_name() { - "id" | "name" | "description" | "schemaString" => { - assert_eq!( - SchemaDataType::primitive("string".to_string()), - f.get_type().to_owned() - ); - } - "createdTime" => { - assert_eq!( - SchemaDataType::primitive("long".to_string()), - f.get_type().to_owned() - ); - } - "partitionColumns" => match f.get_type() { - SchemaDataType::array(partition_columns) => { - assert_eq!("array", partition_columns.r#type); - assert_eq!( - Box::new(SchemaDataType::primitive( - "string".to_string() - )), - partition_columns.elementType - ); - } - _ => panic!("partitionColumns should be an array"), - }, - "format" => { - // TODO - } - _ => panic!("Unhandled schema field name"), - } - } - } else { - panic!("metaData must be a struct"); - } - } - "add" => { - if let SchemaDataType::r#struct(add) = f.get_type() { - assert_eq!(7, add.get_fields().len()); - for f in add.get_fields().iter() { - match f.get_name() { - "path" | "stats" => { - assert_eq!( - SchemaDataType::primitive("string".to_string()), - f.r#type - ); - } - "size" | "modificationTime" => { - assert_eq!( - SchemaDataType::primitive("long".to_string()), - f.r#type - ); - } - 
"dataChange" => { - assert_eq!( - SchemaDataType::primitive("boolean".to_string()), - f.r#type - ); - } - "stats_parsed" => match f.get_type() { - SchemaDataType::r#struct(stats_parsed) => { - let expected_fields: Vec<&SchemaField> = table_schema - .get_fields() - .iter() - .filter(|f| !partition_columns.contains(&f.name)) - .collect(); - for stat_field in stats_parsed.get_fields() { - match stat_field.get_name() { - "minValues" | "maxValues" | "nullCounts" => { - if let SchemaDataType::r#struct(f) = - stat_field.get_type() - { - for (i, e) in - f.get_fields().iter().enumerate() - { - assert_eq!(e, expected_fields[i]); - } - } else { - panic!("Unexpected type for stat field"); - } - } - _ => panic!("Unhandled schema field name"), - } - } - } - _ => panic!("'stats_parsed' must be a struct"), - }, - "partitionValues_parsed" => match f.get_type() { - SchemaDataType::r#struct(partition_values_parsed) => { - let expected_fields: Vec<&SchemaField> = table_schema - .get_fields() - .iter() - .filter(|f| partition_columns.contains(&f.name)) - .collect(); - - for (i, e) in - partition_values_parsed.get_fields().iter().enumerate() - { - assert_eq!(e, expected_fields[i], "'partitionValues_parsed' should contain SchemaFields for all partition columns"); - } - } - _ => panic!("'partition_values_parsed' must be a struct"), - }, - _ => panic!("Unhandled schema field name"), - } - } - } else { - panic!("'add' must be a struct"); - } - } - "remove" => { - if let SchemaDataType::r#struct(remove) = f.get_type() { - assert_eq!(5, remove.get_fields().len()); - for f in remove.get_fields().iter() { - match f.get_name() { - "path" | "stats" => { - assert_eq!( - SchemaDataType::primitive("string".to_string()), - f.get_type().to_owned() - ); - } - "size" | "modificationTime" => { - assert_eq!( - SchemaDataType::primitive("long".to_string()), - f.get_type().to_owned() - ); - } - "dataChange" => { - assert_eq!( - SchemaDataType::primitive("boolean".to_string()), - f.get_type().to_owned() - ); - } - _ => panic!("Unhandled schema field name"), - } - } - } else { - panic!("'remove' must be a struct"); - } - } - _ => panic!("Unhandled schema field name"), - } - } - } -} +// /// Error representing a failure while creating the delta log schema. +// #[derive(thiserror::Error, Debug)] +// pub enum DeltaLogSchemaError { +// /// Error returned when JSON de-serialization of schema components fails. +// #[error("serde_json::Error: {source}")] +// JSONSerialization { +// /// The source serde_json::Error. +// #[from] +// source: serde_json::Error, +// }, +// } + +// /// Factory for creating a Delta log schema for a specific table schema. +// /// REF: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-schema +// pub struct DeltaLogSchemaFactory { +// common_fields: HashMap>, +// } + +// impl DeltaLogSchemaFactory { +// /// Creates a new DeltaLogSchemaFactory which can be used to create Schema's representing the +// /// Delta log for specific tables. +// pub fn new() -> Self { +// // TODO: map is not supported by arrow currently. 
+// // See: +// // * https://github.com/apache/arrow-rs/issues/395 +// // * https://github.com/apache/arrow-rs/issues/396 + +// let meta_data_fields = json!([ +// { "name": "id", "type": "string", "nullable": true, "metadata": {} }, +// { "name": "name", "type": "string", "nullable": true, "metadata": {} }, +// { "name": "description", "type": "string", "nullable": true, "metadata": {} }, +// { "name": "schemaString", "type": "string", "nullable": true, "metadata": {} }, +// { "name": "createdTime", "type": "long", "nullable": true, "metadata": {} }, +// { +// "name": "partitionColumns", +// "type": { +// "type": "array", +// "elementType": "string", +// "containsNull": true, +// }, +// "nullable": true, +// "metadata": {} }, +// { +// "name": "format", +// "type": { +// "type": "struct", +// "fields": [{ +// "name": "provider", +// "type": "string", +// "nullable": true, +// "metadata": {}, +// },/*{ +// "name": "options", +// "type": { +// "type": "map", +// "keyType": "string", +// "valueType": "string", +// "valueContainsNull": true, +// }, +// "nullable": true, +// "metadata": {} +// }*/] +// }, +// "nullable": true, +// "metadata": {} +// }, +// /*{ +// "name": "configuration", +// "type": { +// "type": "map", +// "keyType": "string", +// "valueType": "string", +// "valueContainsNull": true, +// }, +// "nullable": true, +// "metadata": {} +// }*/]); + +// let protocol_fields = json!([ +// { "name": "minReaderVersion", "type": "integer", "nullable": true, "metadata": {} }, +// { "name": "minWriterVersion", "type": "integer", "nullable": true, "metadata": {} } +// ]); + +// let txn_fields = json!([ +// { "name": "appId", "type": "string", "nullable": true, "metadata": {} }, +// { "name": "version", "type": "long", "nullable": true, "metadata": {} } +// ]); + +// let add_fields = json!([ +// { "name": "path", "type": "string", "nullable": true, "metadata": {} }, +// { "name": "size", "type": "long", "nullable": true, "metadata": {} }, +// { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, +// { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {} }, +// { "name": "stats", "type": "string", "nullable": true, "metadata": {} }, +// /*{ +// "name": "partitionValues", +// "type": { +// "type": "map", +// "keyType": "string", +// "valueType": "string", +// "valueContainsNull": true, +// }, +// "nullable": true, +// "metadata": {}, +// }*/ +// ]); + +// let remove_fields = json!([ +// { "name": "path", "type": "string", "nullable": true, "metadata": {} }, +// { "name": "size", "type": "long", "nullable": true, "metadata": {} }, +// { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, +// { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {}, }, +// { "name": "stats", "type": "string", "nullable": true, "metadata": {}, +// },/*{ +// "name": "partitionValues", +// "type": { +// "type": "map", +// "keyType": "string", +// "valueType": "string", +// "valueContainsNull": true, +// }, +// "nullable": true, +// "metadata": {}, + +// }*/]); + +// let mut map = HashMap::new(); + +// map.insert( +// "metaData".to_string(), +// serde_json::from_value(meta_data_fields).unwrap(), +// ); +// map.insert( +// "protocol".to_string(), +// serde_json::from_value(protocol_fields).unwrap(), +// ); +// map.insert( +// "txn".to_string(), +// serde_json::from_value(txn_fields).unwrap(), +// ); +// map.insert( +// "add".to_string(), +// serde_json::from_value(add_fields).unwrap(), +// ); +// map.insert( +// 
"remove".to_string(), +// serde_json::from_value(remove_fields).unwrap(), +// ); + +// Self { common_fields: map } +// } + +// /// Creates a Schema representing the delta log for a specific delta table. +// /// Merges fields from the table schema into the delta log schema. +// pub fn delta_log_schema_for_table( +// &self, +// table_schema: &Schema, +// partition_columns: &[String], +// ) -> Result { +// let (partition_fields, non_partition_fields): (Vec, Vec) = +// table_schema +// .fields +// .iter() +// .map(|f| f.to_owned()) +// .partition(|field| partition_columns.contains(&field.name)); + +// let fields: Vec = self +// .common_fields +// .iter() +// .map(|(name, fields)| match name.as_str() { +// "add" => { +// let mut fields = fields.clone(); + +// if !partition_fields.is_empty() { +// let partition_values_parsed = SchemaField { +// name: "partitionValues_parsed".to_string(), +// nullable: true, +// metadata: HashMap::new(), +// r#type: SchemaDataType::r#struct(SchemaTypeStruct { +// r#type: "struct".to_string(), +// fields: partition_fields.clone(), +// }), +// }; +// fields.push(partition_values_parsed); +// } + +// if !non_partition_fields.is_empty() { +// let min_values = SchemaField { +// name: "minValues".to_string(), +// nullable: true, +// metadata: HashMap::new(), +// r#type: SchemaDataType::r#struct(SchemaTypeStruct { +// r#type: "struct".to_string(), +// fields: non_partition_fields.clone(), +// }), +// }; + +// let max_values = SchemaField { +// name: "maxValues".to_string(), +// nullable: true, +// metadata: HashMap::new(), +// r#type: SchemaDataType::r#struct(SchemaTypeStruct { +// r#type: "struct".to_string(), +// fields: non_partition_fields.clone(), +// }), +// }; + +// let null_counts = SchemaField { +// name: "nullCounts".to_string(), +// nullable: true, +// metadata: HashMap::new(), +// r#type: SchemaDataType::r#struct(SchemaTypeStruct { +// r#type: "struct".to_string(), +// fields: non_partition_fields.clone(), +// }), +// }; + +// let stats_parsed = SchemaField { +// name: "stats_parsed".to_string(), +// nullable: true, +// metadata: HashMap::new(), +// r#type: SchemaDataType::r#struct(SchemaTypeStruct { +// r#type: "struct".to_string(), +// fields: vec![min_values, max_values, null_counts], +// }), +// }; + +// fields.push(stats_parsed); +// } + +// SchemaField { +// name: name.clone(), +// nullable: true, +// metadata: HashMap::new(), +// r#type: SchemaDataType::r#struct(SchemaTypeStruct { +// r#type: "struct".to_string(), +// fields, +// }), +// } +// } +// _ => SchemaField { +// name: name.clone(), +// nullable: true, +// metadata: HashMap::new(), +// r#type: SchemaDataType::r#struct(SchemaTypeStruct { +// r#type: "struct".to_string(), +// fields: fields.clone(), +// }), +// }, +// }) +// .collect(); + +// Ok(Schema { +// r#type: "struct".to_string(), +// fields, +// }) +// } +// } + +// impl Default for DeltaLogSchemaFactory { +// fn default() -> Self { +// Self::new() +// } +// } + +// #[cfg(test)] +// mod tests { +// use super::*; + +// #[test] +// fn delta_log_schema_factory_creates_schema() { +// let factory = DeltaLogSchemaFactory::new(); + +// let table_schema = json!({ +// "type": "struct", +// "fields": [ +// { "name": "pcol", "type": "integer", "nullable": true, "metadata": {} }, +// { "name": "col1", "type": "integer", "nullable": true, "metadata": {} }, +// ] +// }); +// let table_schema = serde_json::from_value(table_schema).unwrap(); + +// let partition_columns = vec!["pcol".to_string()]; + +// let log_schema = factory +// 
.delta_log_schema_for_table(&table_schema, partition_columns.as_slice()) +// .unwrap(); + +// assert_eq!("struct", log_schema.r#type); +// assert_eq!(5, log_schema.get_fields().len()); + +// for f in log_schema.get_fields().iter() { +// match f.get_name() { +// "txn" => { +// if let SchemaDataType::r#struct(txn) = f.get_type() { +// assert_eq!(2, txn.get_fields().len()); +// for f in txn.get_fields().iter() { +// match f.get_name() { +// "appId" => { +// assert_eq!( +// SchemaDataType::primitive("string".to_string()), +// f.get_type().to_owned() +// ); +// } +// "version" => { +// assert_eq!( +// SchemaDataType::primitive("long".to_string()), +// f.get_type().to_owned() +// ); +// } +// _ => panic!("Unhandled schema field name"), +// } +// } +// } else { +// panic!("txn must be a struct"); +// } +// } +// "protocol" => { +// if let SchemaDataType::r#struct(protocol) = f.get_type() { +// assert_eq!(2, protocol.get_fields().len()); +// for f in protocol.get_fields().iter() { +// match f.get_name() { +// "minReaderVersion" | "minWriterVersion" => { +// assert_eq!( +// SchemaDataType::primitive("integer".to_string()), +// f.get_type().to_owned() +// ); +// } +// _ => panic!("Unhandled schema field name"), +// } +// } +// } else { +// panic!("protocol must be a struct"); +// } +// } +// "metaData" => { +// if let SchemaDataType::r#struct(metadata) = f.get_type() { +// assert_eq!(7, metadata.get_fields().len()); +// for f in metadata.get_fields().iter() { +// match f.get_name() { +// "id" | "name" | "description" | "schemaString" => { +// assert_eq!( +// SchemaDataType::primitive("string".to_string()), +// f.get_type().to_owned() +// ); +// } +// "createdTime" => { +// assert_eq!( +// SchemaDataType::primitive("long".to_string()), +// f.get_type().to_owned() +// ); +// } +// "partitionColumns" => match f.get_type() { +// SchemaDataType::array(partition_columns) => { +// assert_eq!("array", partition_columns.r#type); +// assert_eq!( +// Box::new(SchemaDataType::primitive( +// "string".to_string() +// )), +// partition_columns.elementType +// ); +// } +// _ => panic!("partitionColumns should be an array"), +// }, +// "format" => { +// // TODO +// } +// _ => panic!("Unhandled schema field name"), +// } +// } +// } else { +// panic!("metaData must be a struct"); +// } +// } +// "add" => { +// if let SchemaDataType::r#struct(add) = f.get_type() { +// assert_eq!(7, add.get_fields().len()); +// for f in add.get_fields().iter() { +// match f.get_name() { +// "path" | "stats" => { +// assert_eq!( +// SchemaDataType::primitive("string".to_string()), +// f.r#type +// ); +// } +// "size" | "modificationTime" => { +// assert_eq!( +// SchemaDataType::primitive("long".to_string()), +// f.r#type +// ); +// } +// "dataChange" => { +// assert_eq!( +// SchemaDataType::primitive("boolean".to_string()), +// f.r#type +// ); +// } +// "stats_parsed" => match f.get_type() { +// SchemaDataType::r#struct(stats_parsed) => { +// let expected_fields: Vec<&SchemaField> = table_schema +// .get_fields() +// .iter() +// .filter(|f| !partition_columns.contains(&f.name)) +// .collect(); +// for stat_field in stats_parsed.get_fields() { +// match stat_field.get_name() { +// "minValues" | "maxValues" | "nullCounts" => { +// if let SchemaDataType::r#struct(f) = +// stat_field.get_type() +// { +// for (i, e) in +// f.get_fields().iter().enumerate() +// { +// assert_eq!(e, expected_fields[i]); +// } +// } else { +// panic!("Unexpected type for stat field"); +// } +// } +// _ => panic!("Unhandled schema field name"), +// } +// } +// } 
+// _ => panic!("'stats_parsed' must be a struct"), +// }, +// "partitionValues_parsed" => match f.get_type() { +// SchemaDataType::r#struct(partition_values_parsed) => { +// let expected_fields: Vec<&SchemaField> = table_schema +// .get_fields() +// .iter() +// .filter(|f| partition_columns.contains(&f.name)) +// .collect(); + +// for (i, e) in +// partition_values_parsed.get_fields().iter().enumerate() +// { +// assert_eq!(e, expected_fields[i], "'partitionValues_parsed' should contain SchemaFields for all partition columns"); +// } +// } +// _ => panic!("'partition_values_parsed' must be a struct"), +// }, +// _ => panic!("Unhandled schema field name"), +// } +// } +// } else { +// panic!("'add' must be a struct"); +// } +// } +// "remove" => { +// if let SchemaDataType::r#struct(remove) = f.get_type() { +// assert_eq!(5, remove.get_fields().len()); +// for f in remove.get_fields().iter() { +// match f.get_name() { +// "path" | "stats" => { +// assert_eq!( +// SchemaDataType::primitive("string".to_string()), +// f.get_type().to_owned() +// ); +// } +// "size" | "modificationTime" => { +// assert_eq!( +// SchemaDataType::primitive("long".to_string()), +// f.get_type().to_owned() +// ); +// } +// "dataChange" => { +// assert_eq!( +// SchemaDataType::primitive("boolean".to_string()), +// f.get_type().to_owned() +// ); +// } +// _ => panic!("Unhandled schema field name"), +// } +// } +// } else { +// panic!("'remove' must be a struct"); +// } +// } +// _ => panic!("Unhandled schema field name"), +// } +// } +// } +// } From 68217dd5e9556df96b35c00af89aff0151d3114b Mon Sep 17 00:00:00 2001 From: xianwill Date: Thu, 10 Jun 2021 11:03:17 -0400 Subject: [PATCH 16/20] Fix clippy warnings --- rust/src/checkpoints.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs index e1aae7fccc..0f62f02673 100644 --- a/rust/src/checkpoints.rs +++ b/rust/src/checkpoints.rs @@ -171,7 +171,7 @@ impl CheckPointWriter { }) }), ) - .map(|a| serde_json::to_value(a).map_err(|e| ArrowError::from(e))) + .map(|a| serde_json::to_value(a).map_err(ArrowError::from)) .collect(); debug!("Preparing checkpoint parquet buffer."); From 44340e5fda2dc92fa0b1b0297bd7000afd47bad8 Mon Sep 17 00:00:00 2001 From: xianwill Date: Thu, 10 Jun 2021 17:55:46 -0400 Subject: [PATCH 17/20] Add unit test for delta log schema and issue links for checkpoint enhancements --- rust/src/checkpoints.rs | 7 +--- rust/src/delta_arrow.rs | 76 +++++++++++++++++++++++++++++++++++------ 2 files changed, 67 insertions(+), 16 deletions(-) diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs index 0f62f02673..c061277a81 100644 --- a/rust/src/checkpoints.rs +++ b/rust/src/checkpoints.rs @@ -102,7 +102,7 @@ impl CheckPointWriter { ) -> Result<(), CheckPointWriterError> { // TODO: checkpoints _can_ be multi-part... haven't actually found a good reference for // an appropriate split point yet though so only writing a single part currently. - // Will post a research issue to follow-up on this later. 
+ // See https://github.com/delta-io/delta-rs/issues/288 info!("Writing parquet bytes to checkpoint buffer."); let parquet_bytes = self.parquet_bytes_from_state(state)?; @@ -143,7 +143,6 @@ impl CheckPointWriter { let current_metadata = state .current_metadata() .ok_or(CheckPointWriterError::MissingMetaData)?; - let jsons: Vec> = vec![ action::Action::protocol(action::Protocol { min_reader_version: state.min_reader_version(), @@ -175,25 +174,21 @@ impl CheckPointWriter { .collect(); debug!("Preparing checkpoint parquet buffer."); - let arrow_schema = delta_log_schema_for_table( >::try_from(¤t_metadata.schema)?, current_metadata.partition_columns.as_slice(), ); - let writeable_cursor = InMemoryWriteableCursor::default(); let mut writer = ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema.clone(), None)?; debug!("Writing to checkpoint parquet buffer..."); - let decoder = Decoder::new(arrow_schema, jsons.len(), None); let mut value_iter = jsons.into_iter(); while let Some(batch) = decoder.next_batch(&mut value_iter)? { writer.write(&batch)?; } let _ = writer.close()?; - debug!("Finished writing checkpoint parquet buffer."); Ok(writeable_cursor.data()) diff --git a/rust/src/delta_arrow.rs b/rust/src/delta_arrow.rs index e1d2dd7b31..79af411088 100644 --- a/rust/src/delta_arrow.rs +++ b/rust/src/delta_arrow.rs @@ -251,22 +251,78 @@ pub(crate) fn delta_log_schema_for_table( #[cfg(test)] mod tests { use super::*; + use std::collections::HashMap; #[test] fn delta_log_schema_for_table_test() { - let table_schema = json!({ - "type": "struct", - "fields": [ - { "name": "pcol", "type": "integer", "nullable": true, "metadata": {} }, - { "name": "col1", "type": "integer", "nullable": true, "metadata": {} }, - ] - }); - let table_schema = serde_json::from_value(table_schema).unwrap(); + // NOTE: We should future proof the checkpoint schema in case action schema changes. 
+ // See https://github.com/delta-io/delta-rs/issues/287 + let table_schema = ArrowSchema::new(vec![ + ArrowField::new("pcol", ArrowDataType::Int32, true), + ArrowField::new("col1", ArrowDataType::Int32, true), + ]); let partition_columns = vec!["pcol".to_string()]; + let log_schema = delta_log_schema_for_table(table_schema, partition_columns.as_slice()); - let _log_schema = delta_log_schema_for_table(table_schema, partition_columns.as_slice()); + let expected_fields = vec!["metaData", "protocol", "txn", "remove", "add"]; + for f in log_schema.fields().iter() { + assert!(expected_fields.contains(&f.name().as_str())); + } + let add_fields: Vec<_> = log_schema + .fields() + .iter() + .filter(|f| f.name() == "add") + .map(|f| { + if let ArrowDataType::Struct(fields) = f.data_type() { + fields.iter().map(|f| f.clone()) + } else { + unreachable!(); + } + }) + .flatten() + .collect(); + assert_eq!(7, add_fields.len()); + + let add_field_map: HashMap<_, _> = add_fields + .iter() + .map(|f| (f.name().to_owned(), f.clone())) + .collect(); + + let partition_values_parsed = add_field_map.get("partitionValues_parsed").unwrap(); + if let ArrowDataType::Struct(fields) = partition_values_parsed.data_type() { + assert_eq!(1, fields.len()); + let field = fields.get(0).unwrap().to_owned(); + assert_eq!(ArrowField::new("pcol", ArrowDataType::Int32, true), field); + } else { + unreachable!(); + } - todo!("Add tests for delta_log_schema_for_table"); + let stats_parsed = add_field_map.get("stats_parsed").unwrap(); + if let ArrowDataType::Struct(fields) = stats_parsed.data_type() { + assert_eq!(4, fields.len()); + + let field_map: HashMap<_, _> = fields + .iter() + .map(|f| (f.name().to_owned(), f.clone())) + .collect(); + + for (k, v) in field_map.iter() { + match k.as_ref() { + "minValues" | "maxValues" | "nullCounts" => match v.data_type() { + ArrowDataType::Struct(fields) => { + assert_eq!(1, fields.len()); + let field = fields.get(0).unwrap().to_owned(); + assert_eq!(ArrowField::new("col1", ArrowDataType::Int32, true), field); + } + _ => unreachable!(), + }, + "numRecords" => {} + _ => panic!(), + } + } + } else { + unreachable!(); + } } } From 7f878a8beffcc806acf244abbded9191e2e62ea8 Mon Sep 17 00:00:00 2001 From: xianwill Date: Thu, 10 Jun 2021 18:00:39 -0400 Subject: [PATCH 18/20] Remove commented code from schema.rs --- rust/src/schema.rs | 484 --------------------------------------------- 1 file changed, 484 deletions(-) diff --git a/rust/src/schema.rs b/rust/src/schema.rs index 163026da80..098f66caed 100644 --- a/rust/src/schema.rs +++ b/rust/src/schema.rs @@ -159,487 +159,3 @@ impl Schema { &self.fields } } - -// /// Error representing a failure while creating the delta log schema. -// #[derive(thiserror::Error, Debug)] -// pub enum DeltaLogSchemaError { -// /// Error returned when JSON de-serialization of schema components fails. -// #[error("serde_json::Error: {source}")] -// JSONSerialization { -// /// The source serde_json::Error. -// #[from] -// source: serde_json::Error, -// }, -// } - -// /// Factory for creating a Delta log schema for a specific table schema. -// /// REF: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#checkpoint-schema -// pub struct DeltaLogSchemaFactory { -// common_fields: HashMap>, -// } - -// impl DeltaLogSchemaFactory { -// /// Creates a new DeltaLogSchemaFactory which can be used to create Schema's representing the -// /// Delta log for specific tables. -// pub fn new() -> Self { -// // TODO: map is not supported by arrow currently. 
-// // See: -// // * https://github.com/apache/arrow-rs/issues/395 -// // * https://github.com/apache/arrow-rs/issues/396 - -// let meta_data_fields = json!([ -// { "name": "id", "type": "string", "nullable": true, "metadata": {} }, -// { "name": "name", "type": "string", "nullable": true, "metadata": {} }, -// { "name": "description", "type": "string", "nullable": true, "metadata": {} }, -// { "name": "schemaString", "type": "string", "nullable": true, "metadata": {} }, -// { "name": "createdTime", "type": "long", "nullable": true, "metadata": {} }, -// { -// "name": "partitionColumns", -// "type": { -// "type": "array", -// "elementType": "string", -// "containsNull": true, -// }, -// "nullable": true, -// "metadata": {} }, -// { -// "name": "format", -// "type": { -// "type": "struct", -// "fields": [{ -// "name": "provider", -// "type": "string", -// "nullable": true, -// "metadata": {}, -// },/*{ -// "name": "options", -// "type": { -// "type": "map", -// "keyType": "string", -// "valueType": "string", -// "valueContainsNull": true, -// }, -// "nullable": true, -// "metadata": {} -// }*/] -// }, -// "nullable": true, -// "metadata": {} -// }, -// /*{ -// "name": "configuration", -// "type": { -// "type": "map", -// "keyType": "string", -// "valueType": "string", -// "valueContainsNull": true, -// }, -// "nullable": true, -// "metadata": {} -// }*/]); - -// let protocol_fields = json!([ -// { "name": "minReaderVersion", "type": "integer", "nullable": true, "metadata": {} }, -// { "name": "minWriterVersion", "type": "integer", "nullable": true, "metadata": {} } -// ]); - -// let txn_fields = json!([ -// { "name": "appId", "type": "string", "nullable": true, "metadata": {} }, -// { "name": "version", "type": "long", "nullable": true, "metadata": {} } -// ]); - -// let add_fields = json!([ -// { "name": "path", "type": "string", "nullable": true, "metadata": {} }, -// { "name": "size", "type": "long", "nullable": true, "metadata": {} }, -// { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, -// { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {} }, -// { "name": "stats", "type": "string", "nullable": true, "metadata": {} }, -// /*{ -// "name": "partitionValues", -// "type": { -// "type": "map", -// "keyType": "string", -// "valueType": "string", -// "valueContainsNull": true, -// }, -// "nullable": true, -// "metadata": {}, -// }*/ -// ]); - -// let remove_fields = json!([ -// { "name": "path", "type": "string", "nullable": true, "metadata": {} }, -// { "name": "size", "type": "long", "nullable": true, "metadata": {} }, -// { "name": "modificationTime", "type": "long", "nullable": true, "metadata": {} }, -// { "name": "dataChange", "type": "boolean", "nullable": true, "metadata": {}, }, -// { "name": "stats", "type": "string", "nullable": true, "metadata": {}, -// },/*{ -// "name": "partitionValues", -// "type": { -// "type": "map", -// "keyType": "string", -// "valueType": "string", -// "valueContainsNull": true, -// }, -// "nullable": true, -// "metadata": {}, - -// }*/]); - -// let mut map = HashMap::new(); - -// map.insert( -// "metaData".to_string(), -// serde_json::from_value(meta_data_fields).unwrap(), -// ); -// map.insert( -// "protocol".to_string(), -// serde_json::from_value(protocol_fields).unwrap(), -// ); -// map.insert( -// "txn".to_string(), -// serde_json::from_value(txn_fields).unwrap(), -// ); -// map.insert( -// "add".to_string(), -// serde_json::from_value(add_fields).unwrap(), -// ); -// map.insert( -// 
"remove".to_string(), -// serde_json::from_value(remove_fields).unwrap(), -// ); - -// Self { common_fields: map } -// } - -// /// Creates a Schema representing the delta log for a specific delta table. -// /// Merges fields from the table schema into the delta log schema. -// pub fn delta_log_schema_for_table( -// &self, -// table_schema: &Schema, -// partition_columns: &[String], -// ) -> Result { -// let (partition_fields, non_partition_fields): (Vec, Vec) = -// table_schema -// .fields -// .iter() -// .map(|f| f.to_owned()) -// .partition(|field| partition_columns.contains(&field.name)); - -// let fields: Vec = self -// .common_fields -// .iter() -// .map(|(name, fields)| match name.as_str() { -// "add" => { -// let mut fields = fields.clone(); - -// if !partition_fields.is_empty() { -// let partition_values_parsed = SchemaField { -// name: "partitionValues_parsed".to_string(), -// nullable: true, -// metadata: HashMap::new(), -// r#type: SchemaDataType::r#struct(SchemaTypeStruct { -// r#type: "struct".to_string(), -// fields: partition_fields.clone(), -// }), -// }; -// fields.push(partition_values_parsed); -// } - -// if !non_partition_fields.is_empty() { -// let min_values = SchemaField { -// name: "minValues".to_string(), -// nullable: true, -// metadata: HashMap::new(), -// r#type: SchemaDataType::r#struct(SchemaTypeStruct { -// r#type: "struct".to_string(), -// fields: non_partition_fields.clone(), -// }), -// }; - -// let max_values = SchemaField { -// name: "maxValues".to_string(), -// nullable: true, -// metadata: HashMap::new(), -// r#type: SchemaDataType::r#struct(SchemaTypeStruct { -// r#type: "struct".to_string(), -// fields: non_partition_fields.clone(), -// }), -// }; - -// let null_counts = SchemaField { -// name: "nullCounts".to_string(), -// nullable: true, -// metadata: HashMap::new(), -// r#type: SchemaDataType::r#struct(SchemaTypeStruct { -// r#type: "struct".to_string(), -// fields: non_partition_fields.clone(), -// }), -// }; - -// let stats_parsed = SchemaField { -// name: "stats_parsed".to_string(), -// nullable: true, -// metadata: HashMap::new(), -// r#type: SchemaDataType::r#struct(SchemaTypeStruct { -// r#type: "struct".to_string(), -// fields: vec![min_values, max_values, null_counts], -// }), -// }; - -// fields.push(stats_parsed); -// } - -// SchemaField { -// name: name.clone(), -// nullable: true, -// metadata: HashMap::new(), -// r#type: SchemaDataType::r#struct(SchemaTypeStruct { -// r#type: "struct".to_string(), -// fields, -// }), -// } -// } -// _ => SchemaField { -// name: name.clone(), -// nullable: true, -// metadata: HashMap::new(), -// r#type: SchemaDataType::r#struct(SchemaTypeStruct { -// r#type: "struct".to_string(), -// fields: fields.clone(), -// }), -// }, -// }) -// .collect(); - -// Ok(Schema { -// r#type: "struct".to_string(), -// fields, -// }) -// } -// } - -// impl Default for DeltaLogSchemaFactory { -// fn default() -> Self { -// Self::new() -// } -// } - -// #[cfg(test)] -// mod tests { -// use super::*; - -// #[test] -// fn delta_log_schema_factory_creates_schema() { -// let factory = DeltaLogSchemaFactory::new(); - -// let table_schema = json!({ -// "type": "struct", -// "fields": [ -// { "name": "pcol", "type": "integer", "nullable": true, "metadata": {} }, -// { "name": "col1", "type": "integer", "nullable": true, "metadata": {} }, -// ] -// }); -// let table_schema = serde_json::from_value(table_schema).unwrap(); - -// let partition_columns = vec!["pcol".to_string()]; - -// let log_schema = factory -// 
.delta_log_schema_for_table(&table_schema, partition_columns.as_slice())
-//         .unwrap();
-
-//     assert_eq!("struct", log_schema.r#type);
-//     assert_eq!(5, log_schema.get_fields().len());
-
-//     for f in log_schema.get_fields().iter() {
-//         match f.get_name() {
-//             "txn" => {
-//                 if let SchemaDataType::r#struct(txn) = f.get_type() {
-//                     assert_eq!(2, txn.get_fields().len());
-//                     for f in txn.get_fields().iter() {
-//                         match f.get_name() {
-//                             "appId" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("string".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             "version" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("long".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             _ => panic!("Unhandled schema field name"),
-//                         }
-//                     }
-//                 } else {
-//                     panic!("txn must be a struct");
-//                 }
-//             }
-//             "protocol" => {
-//                 if let SchemaDataType::r#struct(protocol) = f.get_type() {
-//                     assert_eq!(2, protocol.get_fields().len());
-//                     for f in protocol.get_fields().iter() {
-//                         match f.get_name() {
-//                             "minReaderVersion" | "minWriterVersion" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("integer".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             _ => panic!("Unhandled schema field name"),
-//                         }
-//                     }
-//                 } else {
-//                     panic!("protocol must be a struct");
-//                 }
-//             }
-//             "metaData" => {
-//                 if let SchemaDataType::r#struct(metadata) = f.get_type() {
-//                     assert_eq!(7, metadata.get_fields().len());
-//                     for f in metadata.get_fields().iter() {
-//                         match f.get_name() {
-//                             "id" | "name" | "description" | "schemaString" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("string".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             "createdTime" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("long".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             "partitionColumns" => match f.get_type() {
-//                                 SchemaDataType::array(partition_columns) => {
-//                                     assert_eq!("array", partition_columns.r#type);
-//                                     assert_eq!(
-//                                         Box::new(SchemaDataType::primitive(
-//                                             "string".to_string()
-//                                         )),
-//                                         partition_columns.elementType
-//                                     );
-//                                 }
-//                                 _ => panic!("partitionColumns should be an array"),
-//                             },
-//                             "format" => {
-//                                 // TODO
-//                             }
-//                             _ => panic!("Unhandled schema field name"),
-//                         }
-//                     }
-//                 } else {
-//                     panic!("metaData must be a struct");
-//                 }
-//             }
-//             "add" => {
-//                 if let SchemaDataType::r#struct(add) = f.get_type() {
-//                     assert_eq!(7, add.get_fields().len());
-//                     for f in add.get_fields().iter() {
-//                         match f.get_name() {
-//                             "path" | "stats" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("string".to_string()),
-//                                     f.r#type
-//                                 );
-//                             }
-//                             "size" | "modificationTime" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("long".to_string()),
-//                                     f.r#type
-//                                 );
-//                             }
-//                             "dataChange" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("boolean".to_string()),
-//                                     f.r#type
-//                                 );
-//                             }
-//                             "stats_parsed" => match f.get_type() {
-//                                 SchemaDataType::r#struct(stats_parsed) => {
-//                                     let expected_fields: Vec<&SchemaField> = table_schema
-//                                         .get_fields()
-//                                         .iter()
-//                                         .filter(|f| !partition_columns.contains(&f.name))
-//                                         .collect();
-//                                     for stat_field in stats_parsed.get_fields() {
-//                                         match stat_field.get_name() {
-//                                             "minValues" | "maxValues" | "nullCounts" => {
-//                                                 if let SchemaDataType::r#struct(f) =
-//                                                     stat_field.get_type()
-//                                                 {
-//                                                     for (i, e) in
-//                                                         f.get_fields().iter().enumerate()
-//                                                     {
-//                                                         assert_eq!(e, expected_fields[i]);
-//                                                     }
-//                                                 } else {
-//                                                     panic!("Unexpected type for stat field");
-//                                                 }
-//                                             }
-//                                             _ => panic!("Unhandled schema field name"),
-//                                         }
-//                                     }
-//                                 }
-//                                 _ => panic!("'stats_parsed' must be a struct"),
-//                             },
-//                             "partitionValues_parsed" => match f.get_type() {
-//                                 SchemaDataType::r#struct(partition_values_parsed) => {
-//                                     let expected_fields: Vec<&SchemaField> = table_schema
-//                                         .get_fields()
-//                                         .iter()
-//                                         .filter(|f| partition_columns.contains(&f.name))
-//                                         .collect();
-
-//                                     for (i, e) in
-//                                         partition_values_parsed.get_fields().iter().enumerate()
-//                                     {
-//                                         assert_eq!(e, expected_fields[i], "'partitionValues_parsed' should contain SchemaFields for all partition columns");
-//                                     }
-//                                 }
-//                                 _ => panic!("'partition_values_parsed' must be a struct"),
-//                             },
-//                             _ => panic!("Unhandled schema field name"),
-//                         }
-//                     }
-//                 } else {
-//                     panic!("'add' must be a struct");
-//                 }
-//             }
-//             "remove" => {
-//                 if let SchemaDataType::r#struct(remove) = f.get_type() {
-//                     assert_eq!(5, remove.get_fields().len());
-//                     for f in remove.get_fields().iter() {
-//                         match f.get_name() {
-//                             "path" | "stats" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("string".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             "size" | "modificationTime" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("long".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             "dataChange" => {
-//                                 assert_eq!(
-//                                     SchemaDataType::primitive("boolean".to_string()),
-//                                     f.get_type().to_owned()
-//                                 );
-//                             }
-//                             _ => panic!("Unhandled schema field name"),
-//                         }
-//                     }
-//                 } else {
-//                     panic!("'remove' must be a struct");
-//                 }
-//             }
-//             _ => panic!("Unhandled schema field name"),
-//         }
-//     }
-// }
-// }

From 0b2261911d0f47d949e4461d64c09e3723fd142a Mon Sep 17 00:00:00 2001
From: xianwill
Date: Thu, 10 Jun 2021 20:57:38 -0400
Subject: [PATCH 19/20] Remove unnecessary collect when building checkpoint

---
 rust/src/checkpoints.rs | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs
index c061277a81..d272fa6536 100644
--- a/rust/src/checkpoints.rs
+++ b/rust/src/checkpoints.rs
@@ -143,14 +143,14 @@ impl CheckPointWriter {
         let current_metadata = state
             .current_metadata()
             .ok_or(CheckPointWriterError::MissingMetaData)?;
-        let jsons: Vec<Result<Value, ArrowError>> = vec![
-            action::Action::protocol(action::Protocol {
-                min_reader_version: state.min_reader_version(),
-                min_writer_version: state.min_writer_version(),
-            }),
-            action::Action::metaData(action::MetaData::try_from(current_metadata.clone())?),
-        ]
-        .into_iter()
+
+        let jsons = std::iter::once(action::Action::protocol(action::Protocol {
+            min_reader_version: state.min_reader_version(),
+            min_writer_version: state.min_writer_version(),
+        }))
+        .chain(std::iter::once(action::Action::metaData(
+            action::MetaData::try_from(current_metadata.clone())?,
+        )))
         .chain(state.files().iter().map(|f| action::Action::add(f.clone())))
         .chain(
             state
@@ -170,8 +170,7 @@
                     })
                 }),
         )
-        .map(|a| serde_json::to_value(a).map_err(ArrowError::from))
-        .collect();
+        .map(|a| serde_json::to_value(a).map_err(ArrowError::from));

         debug!("Preparing checkpoint parquet buffer.");
         let arrow_schema = delta_log_schema_for_table(
@@ -183,7 +182,11 @@
             ArrowWriter::try_new(writeable_cursor.clone(), arrow_schema.clone(), None)?;

         debug!("Writing to checkpoint parquet buffer...");
-        let decoder = Decoder::new(arrow_schema, jsons.len(), None);
+        let batch_size = state.app_transaction_version().len()
+            + state.tombstones().len()
+            + state.files().len()
+            + 2;
+        let decoder = Decoder::new(arrow_schema, batch_size, None);
         let mut value_iter = jsons.into_iter();
         while let Some(batch) = decoder.next_batch(&mut value_iter)? {
             writer.write(&batch)?;

From e6a0fc465c74ce718f0f10abd4ab26dda2d1158b Mon Sep 17 00:00:00 2001
From: xianwill
Date: Thu, 10 Jun 2021 21:05:31 -0400
Subject: [PATCH 20/20] Remove useless into_iter

---
 rust/src/checkpoints.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/rust/src/checkpoints.rs b/rust/src/checkpoints.rs
index d272fa6536..5977903c36 100644
--- a/rust/src/checkpoints.rs
+++ b/rust/src/checkpoints.rs
@@ -144,7 +144,7 @@ impl CheckPointWriter {
             .current_metadata()
             .ok_or(CheckPointWriterError::MissingMetaData)?;

-        let jsons = std::iter::once(action::Action::protocol(action::Protocol {
+        let mut jsons = std::iter::once(action::Action::protocol(action::Protocol {
             min_reader_version: state.min_reader_version(),
             min_writer_version: state.min_writer_version(),
         }))
@@ -187,8 +187,7 @@
             + state.files().len()
             + 2;
         let decoder = Decoder::new(arrow_schema, batch_size, None);
-        let mut value_iter = jsons.into_iter();
-        while let Some(batch) = decoder.next_batch(&mut value_iter)? {
+        while let Some(batch) = decoder.next_batch(&mut jsons)? {
             writer.write(&batch)?;
         }
         let _ = writer.close()?;
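
For readers skimming the series: the net effect of patches 19 and 20 is that the checkpoint writer now streams its actions into arrow's JSON Decoder as a lazy iterator instead of collecting them into a Vec first. Below is a minimal, self-contained sketch of that pattern, not the delta-rs code itself; the schema, field names, and values are invented for illustration, and only the Decoder::new / next_batch usage mirrors what checkpoints.rs does above.

// Sketch only: feed a lazily built iterator of serde_json::Value items to
// arrow's JSON Decoder and pull out RecordBatches, with no intermediate Vec.
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::json::reader::Decoder;
use serde_json::{json, Value};

fn main() -> Result<(), ArrowError> {
    // Hypothetical two-column schema standing in for the delta log schema.
    let schema = Arc::new(Schema::new(vec![
        Field::new("path", DataType::Utf8, true),
        Field::new("size", DataType::Int64, true),
    ]));

    // A lazy iterator of JSON values, analogous to chaining the protocol,
    // metaData, add, remove, and txn actions in the checkpoint writer.
    let mut jsons = (0..10i64).map(|i| -> Result<Value, ArrowError> {
        Ok(json!({ "path": format!("part-{}.parquet", i), "size": i }))
    });

    // 4 rows per RecordBatch here; the patches above instead compute
    // batch_size from the table state rather than from a collected Vec.
    let decoder = Decoder::new(schema, 4, None);
    while let Some(batch) = decoder.next_batch(&mut jsons)? {
        println!("decoded batch with {} rows", batch.num_rows());
    }
    Ok(())
}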