From dcc3a7b4ea7ef0d675c6448fb205b9c0a54e03cc Mon Sep 17 00:00:00 2001 From: Robert Pack <42610831+roeap@users.noreply.github.com> Date: Mon, 6 Nov 2023 07:46:08 +0100 Subject: [PATCH] feat: adopt kernel schemas and improve protocol support (#1756) # Description ~~this PR depends on #1741.~~ Migrating the implementation of actions and schema over from kernel. The schema is much more complete in terms of the more recent delta features and more rigorously leverages the rust type system. # Related Issue(s) # Documentation --- crates/deltalake-core/Cargo.toml | 4 + .../src/delta_datafusion/expr.rs | 58 +- .../src/delta_datafusion/mod.rs | 37 +- crates/deltalake-core/src/errors.rs | 6 + .../src/kernel/actions/arrow.rs | 1049 +++++++++++++++++ .../src/kernel/actions/checkpoint.rs | 589 +++++++++ .../deltalake-core/src/kernel/actions/mod.rs | 64 + .../src/kernel/actions/schemas.rs | 255 ++++ .../src/kernel/actions/serde_path.rs | 89 ++ .../src/kernel/actions/types.rs | 900 ++++++++++++++ crates/deltalake-core/src/kernel/error.rs | 78 ++ crates/deltalake-core/src/kernel/mod.rs | 9 + crates/deltalake-core/src/kernel/schema.rs | 788 +++++++++++++ crates/deltalake-core/src/lib.rs | 17 +- .../deltalake-core/src/operations/create.rs | 57 +- .../deltalake-core/src/operations/delete.rs | 22 +- .../src/operations/filesystem_check.rs | 7 +- crates/deltalake-core/src/operations/merge.rs | 36 +- .../deltalake-core/src/operations/optimize.rs | 30 +- .../deltalake-core/src/operations/restore.rs | 11 +- .../transaction/conflict_checker.rs | 31 +- .../src/operations/transaction/mod.rs | 27 +- .../src/operations/transaction/state.rs | 10 +- .../src/operations/transaction/test_utils.rs | 72 +- .../deltalake-core/src/operations/update.rs | 41 +- .../deltalake-core/src/operations/vacuum.rs | 15 +- crates/deltalake-core/src/operations/write.rs | 45 +- .../deltalake-core/src/operations/writer.rs | 2 +- .../src/protocol/checkpoints.rs | 127 +- crates/deltalake-core/src/protocol/mod.rs | 631 +--------- .../src/protocol/parquet2_read/boolean.rs | 2 +- .../src/protocol/parquet2_read/map.rs | 2 +- .../src/protocol/parquet2_read/mod.rs | 75 +- .../src/protocol/parquet2_read/primitive.rs | 2 +- .../src/protocol/parquet2_read/string.rs | 2 +- .../src/protocol/parquet_read/mod.rs | 69 +- .../src/schema/arrow_convert.rs | 353 +++--- crates/deltalake-core/src/schema/mod.rs | 377 ------ .../deltalake-core/src/schema/partitions.rs | 26 +- crates/deltalake-core/src/storage/utils.rs | 12 +- crates/deltalake-core/src/table/config.rs | 4 +- crates/deltalake-core/src/table/mod.rs | 61 +- crates/deltalake-core/src/table/state.rs | 59 +- .../deltalake-core/src/table/state_arrow.rs | 33 +- crates/deltalake-core/src/writer/json.rs | 22 +- crates/deltalake-core/src/writer/mod.rs | 5 +- .../deltalake-core/src/writer/record_batch.rs | 17 +- crates/deltalake-core/src/writer/stats.rs | 9 +- .../deltalake-core/src/writer/test_utils.rs | 59 +- .../deltalake-core/tests/checkpoint_writer.rs | 12 +- .../deltalake-core/tests/command_optimize.rs | 38 +- .../deltalake-core/tests/command_restore.rs | 20 +- crates/deltalake-core/tests/command_vacuum.rs | 8 +- .../tests/commit_info_format.rs | 5 +- crates/deltalake-core/tests/common/mod.rs | 28 +- .../_delta_log/00000000000000000000.json | 4 + .../_delta_log/00000000000000000001.json | 3 + ...r_61d16c75-6994-46b7-a15b-8b538852e50e.bin | Bin 0 -> 45 bytes ...4e51-827b-c3d5516560ca-c000.snappy.parquet | Bin 0 -> 635 bytes .../_delta_log/00000000000000000000.json | 4 + 
...48e8-82b4-0229cc194867-c000.snappy.parquet | Bin 0 -> 548 bytes .../_delta_log/00000000000000000000.json | 3 + .../_delta_log/00000000000000000001.json | 2 + .../00000000000000000002.checkpoint.parquet | Bin 0 -> 12712 bytes .../_delta_log/00000000000000000002.json | 3 + .../_delta_log/00000000000000000003.json | 3 + crates/deltalake-core/tests/fs_common/mod.rs | 24 +- .../tests/integration_checkpoint.rs | 7 +- .../tests/integration_concurrent_writes.rs | 16 +- .../tests/integration_datafusion.rs | 83 +- .../deltalake-core/tests/integration_read.rs | 35 +- .../tests/read_delta_partitions_test.rs | 18 +- .../tests/serde/checkpoint_schema.json | 267 +++++ crates/deltalake-core/tests/serde/schema.json | 68 ++ crates/deltalake/examples/basic_operations.rs | 28 +- .../deltalake/examples/recordbatch-writer.rs | 28 +- python/src/error.rs | 1 + python/src/lib.rs | 39 +- python/src/schema.rs | 304 +++-- python/tests/test_schema.py | 5 +- 80 files changed, 5343 insertions(+), 2009 deletions(-) create mode 100644 crates/deltalake-core/src/kernel/actions/arrow.rs create mode 100644 crates/deltalake-core/src/kernel/actions/checkpoint.rs create mode 100644 crates/deltalake-core/src/kernel/actions/mod.rs create mode 100644 crates/deltalake-core/src/kernel/actions/schemas.rs create mode 100644 crates/deltalake-core/src/kernel/actions/serde_path.rs create mode 100644 crates/deltalake-core/src/kernel/actions/types.rs create mode 100644 crates/deltalake-core/src/kernel/error.rs create mode 100644 crates/deltalake-core/src/kernel/mod.rs create mode 100644 crates/deltalake-core/src/kernel/schema.rs create mode 100644 crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json create mode 100644 crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json create mode 100644 crates/deltalake-core/tests/data/table-with-dv-small/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin create mode 100644 crates/deltalake-core/tests/data/table-with-dv-small/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet create mode 100644 crates/deltalake-core/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json create mode 100644 crates/deltalake-core/tests/data/table-without-dv-small/part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet create mode 100644 crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000000.json create mode 100644 crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000001.json create mode 100644 crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.checkpoint.parquet create mode 100644 crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json create mode 100644 crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json create mode 100644 crates/deltalake-core/tests/serde/checkpoint_schema.json create mode 100644 crates/deltalake-core/tests/serde/schema.json diff --git a/crates/deltalake-core/Cargo.toml b/crates/deltalake-core/Cargo.toml index e645b6bfd0..ce1c7490ad 100644 --- a/crates/deltalake-core/Cargo.toml +++ b/crates/deltalake-core/Cargo.toml @@ -68,6 +68,8 @@ tokio = { workspace = true, features = [ # other deps (these should be organized and pulled into workspace.dependencies as necessary) cfg-if = "1" errno = "0.3" +either = "1.8" +fix-hidden-lifetime-bug = "0.2" hyper 
= { version = "0.14", optional = true } itertools = "0.11" lazy_static = "1" @@ -80,8 +82,10 @@ once_cell = "1.16.0" parking_lot = "0.12" parquet2 = { version = "0.17", optional = true } percent-encoding = "2" +roaring = "0.10.1" tracing = { version = "0.1", optional = true } rand = "0.8" +z85 = "3.0.5" # hdfs datafusion-objectstore-hdfs = { version = "0.1.3", default-features = false, features = [ diff --git a/crates/deltalake-core/src/delta_datafusion/expr.rs b/crates/deltalake-core/src/delta_datafusion/expr.rs index 815b01831f..e451484183 100644 --- a/crates/deltalake-core/src/delta_datafusion/expr.rs +++ b/crates/deltalake-core/src/delta_datafusion/expr.rs @@ -338,14 +338,13 @@ impl<'a> fmt::Display for ScalarValueFormat<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; - - use arrow_schema::DataType; + use arrow_schema::DataType as ArrowDataType; use datafusion::prelude::SessionContext; use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::{col, decode, lit, substring, Cast, Expr, ExprSchemable}; - use crate::{DeltaOps, DeltaTable, Schema, SchemaDataType, SchemaField}; + use crate::kernel::{DataType, PrimitiveType, StructField, StructType}; + use crate::{DeltaOps, DeltaTable}; use super::fmt_expr_to_sql; @@ -366,66 +365,57 @@ mod test { } async fn setup_table() -> DeltaTable { - let schema = Schema::new(vec![ - SchemaField::new( + let schema = StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value2".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "active".to_string(), - SchemaDataType::primitive("boolean".to_string()), + DataType::Primitive(PrimitiveType::Boolean), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "money".to_string(), - SchemaDataType::primitive("decimal(12,2)".to_string()), + DataType::Primitive(PrimitiveType::Decimal(12, 2)), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "_date".to_string(), - SchemaDataType::primitive("date".to_string()), + DataType::Primitive(PrimitiveType::Date), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "_timestamp".to_string(), - SchemaDataType::primitive("timestamp".to_string()), + DataType::Primitive(PrimitiveType::Timestamp), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "_binary".to_string(), - SchemaDataType::primitive("binary".to_string()), + DataType::Primitive(PrimitiveType::Binary), true, - HashMap::new(), ), ]); let table = DeltaOps::new_in_memory() .create() - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -441,7 +431,7 @@ mod test { simple!( Expr::Cast(Cast { expr: Box::new(lit(1_i64)), - data_type: DataType::Int32 + data_type: ArrowDataType::Int32 }), "arrow_cast(1, 'Int32')".to_string() ), diff --git a/crates/deltalake-core/src/delta_datafusion/mod.rs 
b/crates/deltalake-core/src/delta_datafusion/mod.rs index 7fbe362afc..19d7a510ef 100644 --- a/crates/deltalake-core/src/delta_datafusion/mod.rs +++ b/crates/deltalake-core/src/delta_datafusion/mod.rs @@ -70,11 +70,12 @@ use serde::{Deserialize, Serialize}; use url::Url; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{self, Add}; +use crate::kernel::{Add, DataType as DeltaDataType, Invariant, PrimitiveType}; +use crate::protocol::{self}; use crate::storage::ObjectStoreRef; use crate::table::builder::ensure_table_uri; use crate::table::state::DeltaTableState; -use crate::{open_table, open_table_with_storage_options, DeltaTable, Invariant, SchemaDataType}; +use crate::{open_table, open_table_with_storage_options, DeltaTable}; const PATH_COLUMN: &str = "__delta_rs_path"; @@ -121,7 +122,7 @@ impl DeltaTableState { min_value: None, distinct_count: None }; - self.schema().unwrap().get_fields().len() + self.schema().unwrap().fields().len() ]), is_exact: true, }, @@ -139,13 +140,13 @@ impl DeltaTableState { column_statistics: acc.column_statistics.map(|col_stats| { self.schema() .unwrap() - .get_fields() + .fields() .iter() .zip(col_stats) .map(|(field, stats)| { let null_count = new_stats .null_count - .get(field.get_name()) + .get(field.name()) .and_then(|x| { let null_count_acc = stats.null_count?; let null_count = x.as_value()? as usize; @@ -155,7 +156,7 @@ impl DeltaTableState { let max_value = new_stats .max_values - .get(field.get_name()) + .get(field.name()) .and_then(|x| { let old_stats = stats.clone(); let max_value = to_scalar_value(x.as_value()?); @@ -179,7 +180,7 @@ impl DeltaTableState { let min_value = new_stats .min_values - .get(field.get_name()) + .get(field.name()) .and_then(|x| { let old_stats = stats.clone(); let min_value = to_scalar_value(x.as_value()?); @@ -222,7 +223,7 @@ impl DeltaTableState { num_rows: stats.num_rows, total_byte_size: stats.total_byte_size, column_statistics: stats.column_statistics.map(|col_stats| { - let fields = self.schema().unwrap().get_fields(); + let fields = self.schema().unwrap().fields(); col_stats .iter() .zip(fields) @@ -230,7 +231,7 @@ impl DeltaTableState { let dt = self .arrow_schema() .unwrap() - .field_with_name(field.get_name()) + .field_with_name(field.name()) .unwrap() .data_type() .clone(); @@ -258,16 +259,14 @@ fn get_prune_stats(table: &DeltaTable, column: &Column, get_max: bool) -> Option let field = table .get_schema() .ok() - .map(|s| s.get_field_with_name(&column.name).ok())??; + .map(|s| s.field_with_name(&column.name).ok())??; // See issue 1214. 
Binary type does not support natural order which is required for Datafusion to prune - if let SchemaDataType::primitive(t) = &field.get_type() { - if t == "binary" { - return None; - } + if let DeltaDataType::Primitive(PrimitiveType::Binary) = &field.data_type() { + return None; } - let data_type = field.get_type().try_into().ok()?; + let data_type = field.data_type().try_into().ok()?; let partition_columns = &table.get_metadata().ok()?.partition_columns; let values = table.get_state().files().iter().map(|add| { @@ -921,7 +920,7 @@ pub(crate) fn get_null_of_arrow_type(t: &ArrowDataType) -> DeltaResult PartitionedFile { @@ -1790,7 +1789,7 @@ mod tests { let mut partition_values = std::collections::HashMap::new(); partition_values.insert("month".to_string(), Some("1".to_string())); partition_values.insert("year".to_string(), Some("2015".to_string())); - let action = protocol::Add { + let action = Add { path: "year=2015/month=1/part-00000-4dcb50d3-d017-450c-9df7-a7257dbd3c5d-c000.snappy.parquet".to_string(), size: 10644, partition_values, @@ -1801,6 +1800,8 @@ mod tests { deletion_vector: None, stats_parsed: None, tags: None, + base_row_id: None, + default_row_commit_version: None, }; let schema = ArrowSchema::new(vec![ Field::new("year", ArrowDataType::Int64, true), @@ -1953,7 +1954,7 @@ mod tests { let table = crate::DeltaOps::new_in_memory() .create() - .with_columns(get_delta_schema().get_fields().clone()) + .with_columns(get_delta_schema().fields().clone()) .with_partition_columns(["modified", "id"]) .await .unwrap(); diff --git a/crates/deltalake-core/src/errors.rs b/crates/deltalake-core/src/errors.rs index 24989b2814..bd088e9a4f 100644 --- a/crates/deltalake-core/src/errors.rs +++ b/crates/deltalake-core/src/errors.rs @@ -205,6 +205,12 @@ pub enum DeltaTableError { /// Source error source: Box, }, + + #[error("Kernel: {source}")] + Kernel { + #[from] + source: crate::kernel::Error, + }, } impl From for DeltaTableError { diff --git a/crates/deltalake-core/src/kernel/actions/arrow.rs b/crates/deltalake-core/src/kernel/actions/arrow.rs new file mode 100644 index 0000000000..d292362604 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/arrow.rs @@ -0,0 +1,1049 @@ +use std::sync::Arc; + +use arrow_schema::{ + ArrowError, DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, + Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, +}; +use lazy_static::lazy_static; + +use super::super::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; + +impl TryFrom<&StructType> for ArrowSchema { + type Error = ArrowError; + + fn try_from(s: &StructType) -> Result { + let fields = s + .fields() + .iter() + .map(>::try_from) + .collect::, ArrowError>>()?; + + Ok(ArrowSchema::new(fields)) + } +} + +impl TryFrom<&StructField> for ArrowField { + type Error = ArrowError; + + fn try_from(f: &StructField) -> Result { + let metadata = f + .metadata() + .iter() + .map(|(key, val)| Ok((key.clone(), serde_json::to_string(val)?))) + .collect::>() + .map_err(|err| ArrowError::JsonError(err.to_string()))?; + + let field = ArrowField::new( + f.name(), + ArrowDataType::try_from(f.data_type())?, + f.is_nullable(), + ) + .with_metadata(metadata); + + Ok(field) + } +} + +impl TryFrom<&ArrayType> for ArrowField { + type Error = ArrowError; + + fn try_from(a: &ArrayType) -> Result { + Ok(ArrowField::new( + "item", + ArrowDataType::try_from(a.element_type())?, + a.contains_null(), + )) + } +} + +impl TryFrom<&MapType> for ArrowField { + type Error = ArrowError; 
+ + fn try_from(a: &MapType) -> Result { + Ok(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::try_from(a.key_type())?, false), + ArrowField::new( + "value", + ArrowDataType::try_from(a.value_type())?, + a.value_contains_null(), + ), + ] + .into(), + ), + false, // always non-null + )) + } +} + +impl TryFrom<&DataType> for ArrowDataType { + type Error = ArrowError; + + fn try_from(t: &DataType) -> Result { + match t { + DataType::Primitive(p) => { + match p { + PrimitiveType::String => Ok(ArrowDataType::Utf8), + PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type + PrimitiveType::Integer => Ok(ArrowDataType::Int32), + PrimitiveType::Short => Ok(ArrowDataType::Int16), + PrimitiveType::Byte => Ok(ArrowDataType::Int8), + PrimitiveType::Float => Ok(ArrowDataType::Float32), + PrimitiveType::Double => Ok(ArrowDataType::Float64), + PrimitiveType::Boolean => Ok(ArrowDataType::Boolean), + PrimitiveType::Binary => Ok(ArrowDataType::Binary), + PrimitiveType::Decimal(precision, scale) => { + let precision = u8::try_from(*precision).map_err(|_| { + ArrowError::SchemaError(format!( + "Invalid precision for decimal: {}", + precision + )) + })?; + let scale = i8::try_from(*scale).map_err(|_| { + ArrowError::SchemaError(format!("Invalid scale for decimal: {}", scale)) + })?; + + if precision <= 38 { + Ok(ArrowDataType::Decimal128(precision, scale)) + } else if precision <= 76 { + Ok(ArrowDataType::Decimal256(precision, scale)) + } else { + Err(ArrowError::SchemaError(format!( + "Precision too large to be represented in Arrow: {}", + precision + ))) + } + } + PrimitiveType::Date => { + // A calendar date, represented as a year-month-day triple without a + // timezone. Stored as 4 bytes integer representing days since 1970-01-01 + Ok(ArrowDataType::Date32) + } + PrimitiveType::Timestamp => { + // Issue: https://github.com/delta-io/delta/issues/643 + Ok(ArrowDataType::Timestamp(TimeUnit::Microsecond, None)) + } + } + } + DataType::Struct(s) => Ok(ArrowDataType::Struct( + s.fields() + .iter() + .map(>::try_from) + .collect::, ArrowError>>()? 
+ .into(), + )), + DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(>::try_from(a)?))), + DataType::Map(m) => Ok(ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new( + "keys", + >::try_from(m.key_type())?, + false, + ), + ArrowField::new( + "values", + >::try_from(m.value_type())?, + m.value_contains_null(), + ), + ] + .into(), + ), + false, + )), + false, + )), + } + } +} + +impl TryFrom<&ArrowSchema> for StructType { + type Error = ArrowError; + + fn try_from(arrow_schema: &ArrowSchema) -> Result { + let new_fields: Result, _> = arrow_schema + .fields() + .iter() + .map(|field| field.as_ref().try_into()) + .collect(); + Ok(StructType::new(new_fields?)) + } +} + +impl TryFrom for StructType { + type Error = ArrowError; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + arrow_schema.as_ref().try_into() + } +} + +impl TryFrom<&ArrowField> for StructField { + type Error = ArrowError; + + fn try_from(arrow_field: &ArrowField) -> Result { + Ok(StructField::new( + arrow_field.name().clone(), + arrow_field.data_type().try_into()?, + arrow_field.is_nullable(), + ) + .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) + } +} + +impl TryFrom<&ArrowDataType> for DataType { + type Error = ArrowError; + + fn try_from(arrow_datatype: &ArrowDataType) -> Result { + match arrow_datatype { + ArrowDataType::Utf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::LargeUtf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::Int64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::Int32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::Int16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::Int8 => Ok(DataType::Primitive(PrimitiveType::Byte)), + ArrowDataType::UInt64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::UInt32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::UInt16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::UInt8 => Ok(DataType::Primitive(PrimitiveType::Boolean)), + ArrowDataType::Float32 => Ok(DataType::Primitive(PrimitiveType::Float)), + ArrowDataType::Float64 => Ok(DataType::Primitive(PrimitiveType::Double)), + ArrowDataType::Boolean => Ok(DataType::Primitive(PrimitiveType::Boolean)), + ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::Decimal128(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, + ))), + ArrowDataType::Decimal256(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, + ))), + ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)), + ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)), + ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { + Ok(DataType::Primitive(PrimitiveType::Timestamp)) + } + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) + if tz.eq_ignore_ascii_case("utc") => + { + Ok(DataType::Primitive(PrimitiveType::Timestamp)) + } + ArrowDataType::Struct(fields) => { + let converted_fields: Result, _> = fields + .iter() + .map(|field| field.as_ref().try_into()) + .collect(); + Ok(DataType::Struct(Box::new(StructType::new( + converted_fields?, + )))) + } + ArrowDataType::List(field) => 
Ok(DataType::Array(Box::new(ArrayType::new( + (*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( + (*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( + ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), + ))), + ArrowDataType::Map(field, _) => { + if let ArrowDataType::Struct(struct_fields) = field.data_type() { + let key_type = struct_fields[0].data_type().try_into()?; + let value_type = struct_fields[1].data_type().try_into()?; + let value_type_nullable = struct_fields[1].is_nullable(); + Ok(DataType::Map(Box::new(MapType::new( + key_type, + value_type, + value_type_nullable, + )))) + } else { + panic!("DataType::Map should contain a struct field child"); + } + } + s => Err(ArrowError::SchemaError(format!( + "Invalid data type for Delta Lake: {s}" + ))), + } + } +} + +macro_rules! arrow_map { + ($fieldname: ident, null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Utf8, false), + ArrowField::new("value", ArrowDataType::Utf8, true), + ] + .into(), + ), + false, + )), + false, + ), + true, + ) + }; + ($fieldname: ident, not_null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Utf8, false), + ArrowField::new("value", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + )), + false, + ), + false, + ) + }; +} + +macro_rules! arrow_field { + ($fieldname:ident, $type_qual:ident, null) => { + ArrowField::new(stringify!($fieldname), ArrowDataType::$type_qual, true) + }; + ($fieldname:ident, $type_qual:ident, not_null) => { + ArrowField::new(stringify!($fieldname), ArrowDataType::$type_qual, false) + }; +} + +macro_rules! arrow_list { + ($fieldname:ident, $element_name:ident, $type_qual:ident, null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::List(Arc::new(ArrowField::new( + stringify!($element_name), + ArrowDataType::$type_qual, + true, + ))), + true, + ) + }; + ($fieldname:ident, $element_name:ident, $type_qual:ident, not_null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::List(Arc::new(ArrowField::new( + stringify!($element_name), + ArrowDataType::$type_qual, + true, + ))), + false, + ) + }; +} + +macro_rules! arrow_struct { + ($fieldname:ident, [$($inner:tt)+], null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Struct( + arrow_defs! [$($inner)+].into() + ), + true + ) + }; + ($fieldname:ident, [$($inner:tt)+], not_null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Struct( + arrow_defs! [$($inner)+].into() + ), + false + ) + } +} + +macro_rules! arrow_def { + ($fieldname:ident $(null)?) => { + arrow_map!($fieldname, null) + }; + ($fieldname:ident not_null) => { + arrow_map!($fieldname, not_null) + }; + ($fieldname:ident[$inner_name:ident]{$type_qual:ident} $(null)?) => { + arrow_list!($fieldname, $inner_name, $type_qual, null) + }; + ($fieldname:ident[$inner_name:ident]{$type_qual:ident} not_null) => { + arrow_list!($fieldname, $inner_name, $type_qual, not_null) + }; + ($fieldname:ident:$type_qual:ident $(null)?) 
=> { + arrow_field!($fieldname, $type_qual, null) + }; + ($fieldname:ident:$type_qual:ident not_null) => { + arrow_field!($fieldname, $type_qual, not_null) + }; + ($fieldname:ident[$($inner:tt)+] $(null)?) => { + arrow_struct!($fieldname, [$($inner)+], null) + }; + ($fieldname:ident[$($inner:tt)+] not_null) => { + arrow_struct!($fieldname, [$($inner)+], not_null) + } +} + +/// A helper macro to create more readable Arrow field definitions, delimited by commas +/// +/// The argument patterns are as follows: +/// +/// fieldname (null|not_null)? -- An arrow field of type map with name "fieldname" consisting of Utf8 key-value pairs, and an +/// optional nullability qualifier (null if not specified). +/// +/// fieldname:type (null|not_null)? -- An Arrow field consisting of an atomic type. For example, +/// id:Utf8 gets mapped to ArrowField::new("id", ArrowDataType::Utf8, true). +/// where customerCount:Int64 not_null gets mapped to gets mapped to +/// ArrowField::new("customerCount", ArrowDataType::Utf8, true) +/// +/// fieldname[list_element]{list_element_type} (null|not_null)? -- An Arrow list, with the name of the elements wrapped in square brackets +/// and the type of the list elements wrapped in curly brackets. For example, +/// customers[name]{Utf8} is an nullable arrow field of type arrow list consisting +/// of elements called "name" with type Utf8. +/// +/// fieldname[element1, element2, element3, ....] (null|not_null)? -- An arrow struct with name "fieldname" consisting of elements adhering to any of the patterns +/// documented, including additional structs arbitrarily nested up to the recursion +/// limit for Rust macros. +macro_rules! arrow_defs { + () => { + vec![] as Vec + }; + ($($fieldname:ident$(:$type_qual:ident)?$([$($inner:tt)+])?$({$list_type_qual:ident})? $($nullable:ident)?),+) => { + vec![ + $(arrow_def!($fieldname$(:$type_qual)?$([$($inner)+])?$({$list_type_qual})? $($nullable)?)),+ + ] + } +} + +/// Returns an arrow schema representing the delta log for use in checkpoints +/// +/// # Arguments +/// +/// * `table_schema` - The arrow schema representing the table backed by the delta log +/// * `partition_columns` - The list of partition columns of the table. +/// * `use_extended_remove_schema` - Whether to include extended file metadata in remove action schema. +/// Required for compatibility with different versions of Databricks runtime. +pub(crate) fn delta_log_schema_for_table( + table_schema: ArrowSchema, + partition_columns: &[String], + use_extended_remove_schema: bool, +) -> ArrowSchemaRef { + lazy_static! 
{ + static ref SCHEMA_FIELDS: Vec = arrow_defs![ + metaData[ + id:Utf8, + name:Utf8, + description:Utf8, + schemaString:Utf8, + createdTime:Int64, + partitionColumns[element]{Utf8}, + configuration, + format[provider:Utf8, options] + ], + protocol[ + minReaderVersion:Int32, + minWriterVersion:Int32 + ], + txn[ + appId:Utf8, + version:Int64 + ] + ]; + static ref ADD_FIELDS: Vec = arrow_defs![ + path:Utf8, + size:Int64, + modificationTime:Int64, + dataChange:Boolean, + stats:Utf8, + partitionValues, + tags, + deletionVector[ + storageType:Utf8 not_null, + pathOrInlineDv:Utf8 not_null, + offset:Int32 null, + sizeInBytes:Int32 not_null, + cardinality:Int64 not_null + ] + ]; + static ref REMOVE_FIELDS: Vec = arrow_defs![ + path: Utf8, + deletionTimestamp: Int64, + dataChange: Boolean, + extendedFileMetadata: Boolean + ]; + static ref REMOVE_EXTENDED_FILE_METADATA_FIELDS: Vec = + arrow_defs![size: Int64, partitionValues, tags]; + }; + + // create add fields according to the specific data table schema + let (partition_fields, non_partition_fields): (Vec, Vec) = + table_schema + .fields() + .iter() + .map(|field| field.to_owned()) + .partition(|field| partition_columns.contains(field.name())); + + let mut stats_parsed_fields: Vec = + vec![ArrowField::new("numRecords", ArrowDataType::Int64, true)]; + if !non_partition_fields.is_empty() { + let mut max_min_vec = Vec::new(); + non_partition_fields + .iter() + .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); + + stats_parsed_fields.extend(["minValues", "maxValues"].into_iter().map(|name| { + ArrowField::new( + name, + ArrowDataType::Struct(max_min_vec.clone().into()), + true, + ) + })); + + let mut null_count_vec = Vec::new(); + non_partition_fields + .iter() + .for_each(|f| null_count_schema_for_fields(&mut null_count_vec, f)); + let null_count_struct = ArrowField::new( + "nullCount", + ArrowDataType::Struct(null_count_vec.into()), + true, + ); + + stats_parsed_fields.push(null_count_struct); + } + let mut add_fields = ADD_FIELDS.clone(); + add_fields.push(ArrowField::new( + "stats_parsed", + ArrowDataType::Struct(stats_parsed_fields.into()), + true, + )); + if !partition_fields.is_empty() { + add_fields.push(ArrowField::new( + "partitionValues_parsed", + ArrowDataType::Struct(partition_fields.into()), + true, + )); + } + + // create remove fields with or without extendedFileMetadata + let mut remove_fields = REMOVE_FIELDS.clone(); + if use_extended_remove_schema { + remove_fields.extend(REMOVE_EXTENDED_FILE_METADATA_FIELDS.clone()); + } + + // include add and remove fields in checkpoint schema + let mut schema_fields = SCHEMA_FIELDS.clone(); + schema_fields.push(ArrowField::new( + "add", + ArrowDataType::Struct(add_fields.into()), + true, + )); + schema_fields.push(ArrowField::new( + "remove", + ArrowDataType::Struct(remove_fields.into()), + true, + )); + + let arrow_schema = ArrowSchema::new(schema_fields); + + std::sync::Arc::new(arrow_schema) +} + +fn max_min_schema_for_fields(dest: &mut Vec, f: &ArrowField) { + match f.data_type() { + ArrowDataType::Struct(struct_fields) => { + let mut child_dest = Vec::new(); + + for f in struct_fields { + max_min_schema_for_fields(&mut child_dest, f); + } + + dest.push(ArrowField::new( + f.name(), + ArrowDataType::Struct(child_dest.into()), + true, + )); + } + // don't compute min or max for list, map or binary types + ArrowDataType::List(_) | ArrowDataType::Map(_, _) | ArrowDataType::Binary => { /* noop */ } + _ => { + let f = f.clone(); + dest.push(f); + } + } +} + +fn 
null_count_schema_for_fields(dest: &mut Vec, f: &ArrowField) { + match f.data_type() { + ArrowDataType::Struct(struct_fields) => { + let mut child_dest = Vec::new(); + + for f in struct_fields { + null_count_schema_for_fields(&mut child_dest, f); + } + + dest.push(ArrowField::new( + f.name(), + ArrowDataType::Struct(child_dest.into()), + true, + )); + } + _ => { + let f = ArrowField::new(f.name(), ArrowDataType::Int64, true); + dest.push(f); + } + } +} + +#[cfg(test)] +mod tests { + use arrow::array::ArrayData; + use arrow_array::Array; + use arrow_array::{make_array, ArrayRef, MapArray, StringArray, StructArray}; + use arrow_buffer::{Buffer, ToByteSlice}; + use arrow_schema::Field; + + use super::*; + use std::collections::HashMap; + use std::sync::Arc; + + #[test] + fn delta_log_schema_for_table_test() { + // NOTE: We should future proof the checkpoint schema in case action schema changes. + // See https://github.com/delta-io/delta-rs/issues/287 + + let table_schema = ArrowSchema::new(vec![ + ArrowField::new("pcol", ArrowDataType::Int32, true), + ArrowField::new("col1", ArrowDataType::Int32, true), + ]); + let partition_columns = vec!["pcol".to_string()]; + let log_schema = + delta_log_schema_for_table(table_schema.clone(), partition_columns.as_slice(), false); + + // verify top-level schema contains all expected fields and they are named correctly. + let expected_fields = ["metaData", "protocol", "txn", "remove", "add"]; + for f in log_schema.fields().iter() { + assert!(expected_fields.contains(&f.name().as_str())); + } + assert_eq!(5, log_schema.fields().len()); + + // verify add fields match as expected. a lot of transformation goes into these. + let add_fields: Vec<_> = log_schema + .fields() + .iter() + .filter(|f| f.name() == "add") + .flat_map(|f| { + if let ArrowDataType::Struct(fields) = f.data_type() { + fields.iter().cloned() + } else { + unreachable!(); + } + }) + .collect(); + let field_names: Vec<&String> = add_fields.iter().map(|v| v.name()).collect(); + assert_eq!( + vec![ + "path", + "size", + "modificationTime", + "dataChange", + "stats", + "partitionValues", + "tags", + "deletionVector", + "stats_parsed", + "partitionValues_parsed" + ], + field_names + ); + let add_field_map: HashMap<_, _> = add_fields + .iter() + .map(|f| (f.name().to_owned(), f.clone())) + .collect(); + let partition_values_parsed = add_field_map.get("partitionValues_parsed").unwrap(); + if let ArrowDataType::Struct(fields) = partition_values_parsed.data_type() { + assert_eq!(1, fields.len()); + let field = fields.get(0).unwrap().to_owned(); + assert_eq!( + Arc::new(ArrowField::new("pcol", ArrowDataType::Int32, true)), + field + ); + } else { + unreachable!(); + } + let stats_parsed = add_field_map.get("stats_parsed").unwrap(); + if let ArrowDataType::Struct(fields) = stats_parsed.data_type() { + assert_eq!(4, fields.len()); + + let field_map: HashMap<_, _> = fields + .iter() + .map(|f| (f.name().to_owned(), f.clone())) + .collect(); + + for (k, v) in field_map.iter() { + match k.as_ref() { + "minValues" | "maxValues" | "nullCount" => match v.data_type() { + ArrowDataType::Struct(fields) => { + assert_eq!(1, fields.len()); + let field = fields.get(0).unwrap().to_owned(); + let data_type = if k == "nullCount" { + ArrowDataType::Int64 + } else { + ArrowDataType::Int32 + }; + assert_eq!(Arc::new(ArrowField::new("col1", data_type, true)), field); + } + _ => unreachable!(), + }, + "numRecords" => {} + _ => panic!(), + } + } + } else { + unreachable!(); + } + + // verify extended remove schema fields **ARE 
NOT** included when `use_extended_remove_schema` is false. + let num_remove_fields = log_schema + .fields() + .iter() + .filter(|f| f.name() == "remove") + .flat_map(|f| { + if let ArrowDataType::Struct(fields) = f.data_type() { + fields.iter().cloned() + } else { + unreachable!(); + } + }) + .count(); + assert_eq!(4, num_remove_fields); + + // verify extended remove schema fields **ARE** included when `use_extended_remove_schema` is true. + let log_schema = + delta_log_schema_for_table(table_schema, partition_columns.as_slice(), true); + let remove_fields: Vec<_> = log_schema + .fields() + .iter() + .filter(|f| f.name() == "remove") + .flat_map(|f| { + if let ArrowDataType::Struct(fields) = f.data_type() { + fields.iter().cloned() + } else { + unreachable!(); + } + }) + .collect(); + assert_eq!(7, remove_fields.len()); + let expected_fields = [ + "path", + "deletionTimestamp", + "dataChange", + "extendedFileMetadata", + "partitionValues", + "size", + "tags", + ]; + for f in remove_fields.iter() { + assert!(expected_fields.contains(&f.name().as_str())); + } + } + + #[test] + fn test_arrow_from_delta_decimal_type() { + let precision = 20; + let scale = 2; + let decimal_field = DataType::Primitive(PrimitiveType::Decimal(precision, scale)); + assert_eq!( + >::try_from(&decimal_field).unwrap(), + ArrowDataType::Decimal128(precision as u8, scale as i8) + ); + } + + #[test] + fn test_arrow_from_delta_timestamp_type() { + let timestamp_field = DataType::Primitive(PrimitiveType::Timestamp); + assert_eq!( + >::try_from(×tamp_field).unwrap(), + ArrowDataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_delta_from_arrow_timestamp_type() { + let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); + assert_eq!( + >::try_from(×tamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) + ); + } + + #[test] + fn test_delta_from_arrow_timestamp_type_with_tz() { + let timestamp_field = + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())); + assert_eq!( + >::try_from(×tamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) + ); + } + + #[test] + fn test_delta_from_arrow_map_type() { + let arrow_map = ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int8, false), + ArrowField::new("value", ArrowDataType::Binary, true), + ] + .into(), + ), + false, + )), + false, + ); + let converted_map: DataType = (&arrow_map).try_into().unwrap(); + + assert_eq!( + converted_map, + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::Byte), + DataType::Primitive(PrimitiveType::Binary), + true, + ))) + ); + } + + #[test] + fn test_record_batch_from_map_type() { + let keys = vec!["0", "1", "5", "6", "7"]; + let values: Vec<&[u8]> = vec![ + b"test_val_1", + b"test_val_2", + b"long_test_val_3", + b"4", + b"test_val_5", + ]; + let entry_offsets = vec![0u32, 1, 1, 4, 5, 5]; + let num_rows = keys.len(); + + // Copied the function `new_from_string` with the patched code from https://github.com/apache/arrow-rs/pull/4808 + // This should be reverted back [`MapArray::new_from_strings`] once arrow is upgraded in this project. 
+ fn new_from_strings<'a>( + keys: impl Iterator, + values: &dyn Array, + entry_offsets: &[u32], + ) -> Result { + let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); + let keys_data = StringArray::from_iter_values(keys); + + let keys_field = Arc::new(Field::new("keys", ArrowDataType::Utf8, false)); + let values_field = Arc::new(Field::new( + "values", + values.data_type().clone(), + values.null_count() > 0, + )); + + let entry_struct = StructArray::from(vec![ + (keys_field, Arc::new(keys_data) as ArrayRef), + (values_field, make_array(values.to_data())), + ]); + + let map_data_type = ArrowDataType::Map( + Arc::new(Field::new( + "entries", + entry_struct.data_type().clone(), + false, + )), + false, + ); + + let map_data = ArrayData::builder(map_data_type) + .len(entry_offsets.len() - 1) + .add_buffer(entry_offsets_buffer) + .add_child_data(entry_struct.into_data()) + .build()?; + + Ok(MapArray::from(map_data)) + } + + let map_array = new_from_strings( + keys.into_iter(), + &arrow::array::BinaryArray::from(values), + entry_offsets.as_slice(), + ) + .expect("Could not create a map array"); + + let schema = + >::try_from(&StructType::new(vec![ + StructField::new( + "example".to_string(), + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::String), + DataType::Primitive(PrimitiveType::Binary), + false, + ))), + false, + ), + ])) + .expect("Could not get schema"); + + let record_batch = + arrow::record_batch::RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) + .expect("Failed to create RecordBatch"); + + assert_eq!(record_batch.num_columns(), 1); + assert_eq!(record_batch.num_rows(), num_rows); + } + + #[test] + fn test_max_min_schema_for_fields() { + let mut max_min_vec: Vec = Vec::new(); + let fields = [ + ArrowField::new("simple", ArrowDataType::Int32, true), + ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ArrowField::new("simple", ArrowDataType::Int32, true)].into(), + ), + true, + ), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "simple", + ArrowDataType::Int32, + true, + ))), + true, + ), + ArrowField::new( + "map", + ArrowDataType::Map( + Arc::new(ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int32, true), + ArrowField::new("value", ArrowDataType::Int32, true), + ] + .into(), + ), + true, + )), + true, + ), + true, + ), + ArrowField::new("binary", ArrowDataType::Binary, true), + ]; + + let expected = vec![fields[0].clone(), fields[1].clone()]; + + fields + .iter() + .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); + + assert_eq!(max_min_vec, expected); + } + + #[test] + fn test_null_count_schema_for_fields() { + let mut null_count_vec: Vec = Vec::new(); + let fields = [ + ArrowField::new("int32", ArrowDataType::Int32, true), + ArrowField::new("int64", ArrowDataType::Int64, true), + ArrowField::new("Utf8", ArrowDataType::Utf8, true), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "simple", + ArrowDataType::Int32, + true, + ))), + true, + ), + ArrowField::new( + "map", + ArrowDataType::Map( + Arc::new(ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int32, true), + ArrowField::new("value", ArrowDataType::Int32, true), + ] + .into(), + ), + true, + )), + true, + ), + true, + ), + ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ArrowField::new("int32", ArrowDataType::Int32, true)].into(), + ), + true, + ), + ]; + let 
expected = vec![ + ArrowField::new(fields[0].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[1].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[2].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[3].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[4].name(), ArrowDataType::Int64, true), + ArrowField::new( + fields[5].name(), + ArrowDataType::Struct( + vec![ArrowField::new("int32", ArrowDataType::Int64, true)].into(), + ), + true, + ), + ]; + fields + .iter() + .for_each(|f| null_count_schema_for_fields(&mut null_count_vec, f)); + assert_eq!(null_count_vec, expected); + } + + /* + * This test validates the trait implementation of + * TryFrom<&Arc> for schema::SchemaField which is required with Arrow 37 since + * iterators on Fields will give an &Arc + */ + #[test] + fn tryfrom_arrowfieldref_with_structs() { + let field = Arc::new(ArrowField::new( + "test_struct", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int32, true), + ArrowField::new("value", ArrowDataType::Int32, true), + ] + .into(), + ), + true, + )); + let _converted: StructField = field.as_ref().try_into().unwrap(); + } +} diff --git a/crates/deltalake-core/src/kernel/actions/checkpoint.rs b/crates/deltalake-core/src/kernel/actions/checkpoint.rs new file mode 100644 index 0000000000..59960f66b8 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/checkpoint.rs @@ -0,0 +1,589 @@ +use std::collections::HashMap; +use std::str::FromStr; + +use arrow_array::{ + BooleanArray, Int32Array, Int64Array, ListArray, MapArray, RecordBatch, StringArray, + StructArray, +}; +use either::Either; +use fix_hidden_lifetime_bug::fix_hidden_lifetime_bug; +use itertools::izip; +use serde::{Deserialize, Serialize}; + +use super::{error::Error, DeltaResult}; + +#[fix_hidden_lifetime_bug] +#[allow(dead_code)] +pub(crate) fn parse_actions<'a>( + batch: &RecordBatch, + types: impl IntoIterator, +) -> DeltaResult> { + Ok(types + .into_iter() + .filter_map(|action| parse_action(batch, action).ok()) + .flatten()) +} + +#[fix_hidden_lifetime_bug] +pub(crate) fn parse_action( + batch: &RecordBatch, + action_type: &ActionType, +) -> DeltaResult> { + let column_name = match action_type { + ActionType::Metadata => "metaData", + ActionType::Protocol => "protocol", + ActionType::Add => "add", + ActionType::Remove => "remove", + _ => unimplemented!(), + }; + + let arr = batch + .column_by_name(column_name) + .ok_or(Error::MissingColumn(column_name.into()))? 
+ .as_any() + .downcast_ref::() + .ok_or(Error::UnexpectedColumnType( + "Cannot downcast to StructArray".into(), + ))?; + + match action_type { + ActionType::Metadata => parse_action_metadata(arr), + ActionType::Protocol => parse_action_protocol(arr), + ActionType::Add => parse_actions_add(arr), + ActionType::Remove => parse_actions_remove(arr), + _ => todo!(), + } +} + +fn parse_action_metadata(arr: &StructArray) -> DeltaResult>> { + let ids = cast_struct_column::(arr, "id")?; + let schema_strings = cast_struct_column::(arr, "schemaString")?; + let metadata = ids + .into_iter() + .zip(schema_strings) + .filter_map(|(maybe_id, maybe_schema_string)| { + if let (Some(id), Some(schema_string)) = (maybe_id, maybe_schema_string) { + Some(Metadata::new( + id, + Format { + provider: "parquet".into(), + options: Default::default(), + }, + schema_string, + Vec::::new(), + None, + )) + } else { + None + } + }) + .next(); + + if metadata.is_none() { + return Ok(Box::new(std::iter::empty())); + } + let mut metadata = metadata.unwrap(); + + metadata.partition_columns = cast_struct_column::(arr, "partitionColumns") + .ok() + .map(|arr| { + arr.iter() + .filter_map(|it| { + if let Some(features) = it { + let vals = features + .as_any() + .downcast_ref::()? + .iter() + .filter_map(|v| v.map(|inner| inner.to_owned())) + .collect::>(); + Some(vals) + } else { + None + } + }) + .flatten() + .collect::>() + }) + .unwrap_or_default(); + + metadata.name = cast_struct_column::(arr, "name") + .ok() + .and_then(|arr| { + arr.iter() + .flat_map(|maybe| maybe.map(|v| v.to_string())) + .next() + }); + metadata.description = cast_struct_column::(arr, "description") + .ok() + .and_then(|arr| { + arr.iter() + .flat_map(|maybe| maybe.map(|v| v.to_string())) + .next() + }); + metadata.created_time = cast_struct_column::(arr, "createdTime") + .ok() + .and_then(|arr| arr.iter().flatten().next()); + + if let Ok(config) = cast_struct_column::(arr, "configuration") { + let keys = config + .keys() + .as_any() + .downcast_ref::() + .ok_or(Error::MissingData("expected key column in map".into()))?; + let values = config + .values() + .as_any() + .downcast_ref::() + .ok_or(Error::MissingData("expected value column in map".into()))?; + metadata.configuration = keys + .into_iter() + .zip(values) + .filter_map(|(k, v)| k.map(|key| (key.to_string(), v.map(|vv| vv.to_string())))) + .collect::>(); + }; + + Ok(Box::new(std::iter::once(Action::Metadata(metadata)))) +} + +fn parse_action_protocol(arr: &StructArray) -> DeltaResult>> { + let min_reader = cast_struct_column::(arr, "minReaderVersion")?; + let min_writer = cast_struct_column::(arr, "minWriterVersion")?; + let protocol = min_reader + .into_iter() + .zip(min_writer) + .filter_map(|(r, w)| { + if let (Some(min_reader_version), Some(min_wrriter_version)) = (r, w) { + Some(Protocol::new(min_reader_version, min_wrriter_version)) + } else { + None + } + }) + .next(); + + if protocol.is_none() { + return Ok(Box::new(std::iter::empty())); + } + let mut protocol = protocol.unwrap(); + + protocol.reader_features = cast_struct_column::(arr, "readerFeatures") + .ok() + .map(|arr| { + arr.iter() + .filter_map(|it| { + if let Some(features) = it { + let vals = features + .as_any() + .downcast_ref::()? 
+ .iter() + .filter_map(|v| v.map(|inner| inner.to_owned())) + .collect::>(); + Some(vals) + } else { + None + } + }) + .flatten() + .collect::>() + }); + + protocol.writer_features = cast_struct_column::(arr, "writerFeatures") + .ok() + .map(|arr| { + arr.iter() + .filter_map(|it| { + if let Some(features) = it { + let vals = features + .as_any() + .downcast_ref::()? + .iter() + .filter_map(|v| v.map(|inner| inner.to_string())) + .collect::>(); + Some(vals) + } else { + None + } + }) + .flatten() + .collect::>() + }); + + Ok(Box::new(std::iter::once(Action::Protocol(protocol)))) +} + +fn parse_actions_add(arr: &StructArray) -> DeltaResult + '_>> { + let paths = cast_struct_column::(arr, "path")?; + let sizes = cast_struct_column::(arr, "size")?; + let modification_times = cast_struct_column::(arr, "modificationTime")?; + let data_changes = cast_struct_column::(arr, "dataChange")?; + let partition_values = cast_struct_column::(arr, "partitionValues")? + .iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())); + + let tags = if let Ok(stats) = cast_struct_column::(arr, "tags") { + Either::Left( + stats + .iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())), + ) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let stats = if let Ok(stats) = cast_struct_column::(arr, "stats") { + Either::Left(stats.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let base_row_ids = if let Ok(row_ids) = cast_struct_column::(arr, "baseRowId") { + Either::Left(row_ids.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let commit_versions = + if let Ok(versions) = cast_struct_column::(arr, "defaultRowCommitVersion") { + Either::Left(versions.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let deletion_vectors = if let Ok(dvs) = cast_struct_column::(arr, "deletionVector") + { + Either::Left(parse_dv(dvs)?) 
+ } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let zipped = izip!( + paths, + sizes, + modification_times, + data_changes, + partition_values, + stats, + tags, + base_row_ids, + commit_versions, + deletion_vectors, + ); + let zipped = zipped.map( + |( + maybe_paths, + maybe_size, + maybe_modification_time, + maybe_data_change, + partition_values, + stat, + tags, + base_row_id, + default_row_commit_version, + deletion_vector, + )| { + if let (Some(path), Some(size), Some(modification_time), Some(data_change)) = ( + maybe_paths, + maybe_size, + maybe_modification_time, + maybe_data_change, + ) { + Some(Add { + path: path.into(), + size, + modification_time, + data_change, + partition_values: partition_values.unwrap_or_default(), + stats: stat.map(|v| v.to_string()), + tags, + base_row_id, + default_row_commit_version, + deletion_vector, + stats_parsed: None, + partition_values_parsed: None, + }) + } else { + None + } + }, + ); + + Ok(Box::new(zipped.flatten().map(Action::Add))) +} + +fn parse_actions_remove(arr: &StructArray) -> DeltaResult + '_>> { + let paths = cast_struct_column::(arr, "path")?; + let data_changes = cast_struct_column::(arr, "dataChange")?; + + let deletion_timestamps = + if let Ok(ts) = cast_struct_column::(arr, "deletionTimestamp") { + Either::Left(ts.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let extended_file_metadata = + if let Ok(metas) = cast_struct_column::(arr, "extendedFileMetadata") { + Either::Left(metas.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let partition_values = + if let Ok(values) = cast_struct_column::(arr, "partitionValues") { + Either::Left( + values + .iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())), + ) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let sizes = if let Ok(size) = cast_struct_column::(arr, "size") { + Either::Left(size.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let tags = if let Ok(tags) = cast_struct_column::(arr, "tags") { + Either::Left( + tags.iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())), + ) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let deletion_vectors = if let Ok(dvs) = cast_struct_column::(arr, "deletionVector") + { + Either::Left(parse_dv(dvs)?) 
+ } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let base_row_ids = if let Ok(row_ids) = cast_struct_column::(arr, "baseRowId") { + Either::Left(row_ids.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let commit_versions = + if let Ok(row_ids) = cast_struct_column::(arr, "defaultRowCommitVersion") { + Either::Left(row_ids.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let zipped = izip!( + paths, + data_changes, + deletion_timestamps, + extended_file_metadata, + partition_values, + sizes, + tags, + deletion_vectors, + base_row_ids, + commit_versions, + ); + + let zipped = zipped.map( + |( + maybe_paths, + maybe_data_change, + deletion_timestamp, + extended_file_metadata, + partition_values, + size, + tags, + deletion_vector, + base_row_id, + default_row_commit_version, + )| { + if let (Some(path), Some(data_change)) = (maybe_paths, maybe_data_change) { + Some(Remove { + path: path.into(), + data_change, + deletion_timestamp, + extended_file_metadata, + partition_values, + size, + tags, + deletion_vector, + base_row_id, + default_row_commit_version, + }) + } else { + None + } + }, + ); + + Ok(Box::new(zipped.flatten().map(Action::Remove))) +} + +fn parse_dv( + arr: &StructArray, +) -> DeltaResult> + '_> { + let storage_types = cast_struct_column::(arr, "storageType")?; + let paths_or_inlines = cast_struct_column::(arr, "pathOrInlineDv")?; + let sizes_in_bytes = cast_struct_column::(arr, "sizeInBytes")?; + let cardinalities = cast_struct_column::(arr, "cardinality")?; + + let offsets = if let Ok(offsets) = cast_struct_column::(arr, "offset") { + Either::Left(offsets.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(cardinalities.len())) + }; + + let zipped = izip!( + storage_types, + paths_or_inlines, + sizes_in_bytes, + cardinalities, + offsets, + ); + + Ok(zipped.map( + |(maybe_type, maybe_path_or_inline_dv, maybe_size_in_bytes, maybe_cardinality, offset)| { + if let ( + Some(storage_type), + Some(path_or_inline_dv), + Some(size_in_bytes), + Some(cardinality), + ) = ( + maybe_type, + maybe_path_or_inline_dv, + maybe_size_in_bytes, + maybe_cardinality, + ) { + Some(DeletionVectorDescriptor { + storage_type: StorageType::from_str(storage_type).unwrap(), + path_or_inline_dv: path_or_inline_dv.into(), + size_in_bytes, + cardinality, + offset, + }) + } else { + None + } + }, + )) +} + +fn cast_struct_column(arr: &StructArray, name: impl AsRef) -> DeltaResult<&T> { + arr.column_by_name(name.as_ref()) + .ok_or(Error::MissingColumn(name.as_ref().into()))? 
+ .as_any() + .downcast_ref::() + .ok_or(Error::UnexpectedColumnType( + "Cannot downcast to expected type".into(), + )) +} + +fn struct_array_to_map(arr: &StructArray) -> DeltaResult>> { + let keys = cast_struct_column::(arr, "key")?; + let values = cast_struct_column::(arr, "value")?; + Ok(keys + .into_iter() + .zip(values) + .filter_map(|(k, v)| k.map(|key| (key.to_string(), v.map(|vv| vv.to_string())))) + .collect()) +} + +#[cfg(all(test, feature = "default-client"))] +mod tests { + use std::sync::Arc; + + use object_store::local::LocalFileSystem; + + use super::*; + use crate::actions::Protocol; + use crate::client::json::DefaultJsonHandler; + use crate::executor::tokio::TokioBackgroundExecutor; + use crate::JsonHandler; + + fn action_batch() -> RecordBatch { + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, + ] + .into(); + let output_schema = Arc::new(get_log_schema()); + handler.parse_json(json_strings, output_schema).unwrap() + } + + #[test] + fn test_parse_protocol() { + let batch = action_batch(); + let action = parse_action(&batch, &ActionType::Protocol) + .unwrap() + .collect::>(); + let expected = Action::Protocol(Protocol { + min_reader_version: 3, + min_writer_version: 7, + reader_features: Some(vec!["deletionVectors".into()]), + writer_features: Some(vec!["deletionVectors".into()]), + }); + assert_eq!(action[0], expected) + } + + #[test] + fn test_parse_metadata() { + let batch = action_batch(); + let action = parse_action(&batch, &ActionType::Metadata) + .unwrap() + .collect::>(); + let configuration = HashMap::from_iter([ + ( + "delta.enableDeletionVectors".to_string(), + Some("true".to_string()), + ), + ( + "delta.columnMapping.mode".to_string(), + Some("none".to_string()), + ), + ]); + let expected = Action::Metadata(Metadata { + id: "testId".into(), + name: None, + description: None, + format: Format { + provider: "parquet".into(), + options: Default::default(), + }, + schema_string: r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#.to_string(), + partition_columns: Vec::new(), + created_time: 
Some(1677811175819), + configuration, + }); + assert_eq!(action[0], expected) + } + + #[test] + fn test_parse_add_partitioned() { + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + + let json_strings: StringArray = vec![ + r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"add":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"c1=5/c2=b/part-00007-4e73fa3b-2c88-424a-8051-f8b54328ffdb.c000.snappy.parquet","partitionValues":{"c1":"5","c2":"b"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":6},\"maxValues\":{\"c3\":6},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#, + ] + .into(); + let output_schema = Arc::new(get_log_schema()); + let batch = handler.parse_json(json_strings, output_schema).unwrap(); + + let actions = parse_action(&batch, &ActionType::Add) + .unwrap() + .collect::>(); + println!("{:?}", actions) + } +} diff --git a/crates/deltalake-core/src/kernel/actions/mod.rs b/crates/deltalake-core/src/kernel/actions/mod.rs new file mode 100644 index 0000000000..865c9d3cd9 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/mod.rs @@ -0,0 +1,64 @@ +//! Actions are the fundamental unit of work in Delta Lake. Each action performs a single atomic +//! operation on the state of a Delta table. Actions are stored in the `_delta_log` directory of a +//! Delta table in JSON format. The log is a time series of actions that represent all the changes +//! made to a table. 
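+//!
+//! As a rough, illustrative sketch (the crate-level import path is an assumption
+//! here), a single [`Action`] round-trips through the one-key JSON objects that
+//! make up each log entry:
+//!
+//! ```ignore
+//! use deltalake_core::kernel::{Action, Protocol};
+//!
+//! // Every log line is a JSON object with exactly one action key, e.g. "protocol".
+//! let action = Action::Protocol(Protocol::new(1, 2));
+//! let line = serde_json::to_string(&action).unwrap();
+//! assert!(line.starts_with(r#"{"protocol":"#));
+//!
+//! // Reading the line back yields the same action.
+//! let parsed: Action = serde_json::from_str(&line).unwrap();
+//! assert_eq!(parsed, action);
+//! ```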
+ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +#[cfg(all(feature = "arrow", feature = "parquet"))] +pub(crate) mod arrow; +// pub(crate) mod schemas; +mod serde_path; +pub(crate) mod types; + +pub use types::*; + +#[derive(Debug)] +/// The type of action that was performed on the table +pub enum ActionType { + /// modify the data in a table by adding individual logical files + Add, + /// add a file containing only the data that was changed as part of the transaction + Cdc, + /// additional provenance information about what higher-level operation was being performed + CommitInfo, + /// contains a configuration (string-string map) for a named metadata domain + DomainMetadata, + /// changes the current metadata of the table + Metadata, + /// increase the version of the Delta protocol that is required to read or write a given table + Protocol, + /// modify the data in a table by removing individual logical files + Remove, + /// The Row ID high-water mark tracks the largest ID that has been assigned to a row in the table. + RowIdHighWaterMark, + /// Transactional information + Txn, +} + +#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +#[allow(missing_docs)] +pub enum Action { + #[serde(rename = "metaData")] + Metadata(Metadata), + Protocol(Protocol), + Add(Add), + Remove(Remove), + Cdc(AddCDCFile), + Txn(Txn), + CommitInfo(CommitInfo), + DomainMetadata(DomainMetadata), +} + +impl Action { + /// Create a commit info from a map + pub fn commit_info(info: HashMap) -> Self { + Self::CommitInfo(CommitInfo { + info, + ..Default::default() + }) + } +} diff --git a/crates/deltalake-core/src/kernel/actions/schemas.rs b/crates/deltalake-core/src/kernel/actions/schemas.rs new file mode 100644 index 0000000000..0cc870318f --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/schemas.rs @@ -0,0 +1,255 @@ +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Fields, Schema}; + +use super::ActionType; + +impl ActionType { + /// Returns the root field for the action type + pub fn field(&self) -> Field { + match self { + Self::Add => get_root("add", self.fields()), + Self::Cdc => get_root("cdc", self.fields()), + Self::CommitInfo => get_root("commitInfo", self.fields()), + Self::DomainMetadata => get_root("domainMetadata", self.fields()), + Self::Metadata => get_root("metaData", self.fields()), + Self::Protocol => get_root("protocol", self.fields()), + Self::Remove => get_root("remove", self.fields()), + Self::RowIdHighWaterMark => get_root("rowIdHighWaterMark", self.fields()), + Self::Txn => get_root("txn", self.fields()), + } + } + + /// Returns the child fields for the action type + pub fn fields(&self) -> Vec { + match self { + Self::Add => add_fields(), + Self::Cdc => cdc_fields(), + Self::CommitInfo => commit_info_fields(), + Self::DomainMetadata => domain_metadata_fields(), + Self::Metadata => metadata_fields(), + Self::Protocol => protocol_fields(), + Self::Remove => remove_fields(), + Self::RowIdHighWaterMark => watermark_fields(), + Self::Txn => txn_fields(), + } + } +} + +/// Returns the schema for the delta log +pub fn get_log_schema() -> Schema { + Schema { + fields: Fields::from_iter([ + ActionType::Add.field(), + ActionType::Cdc.field(), + ActionType::CommitInfo.field(), + ActionType::DomainMetadata.field(), + ActionType::Metadata.field(), + ActionType::Protocol.field(), + ActionType::Remove.field(), + ActionType::RowIdHighWaterMark.field(), + ActionType::Txn.field(), + ]), + metadata: 
Default::default(), + } +} + +fn get_root(name: &str, fields: Vec) -> Field { + Field::new(name, DataType::Struct(Fields::from_iter(fields)), true) +} + +fn add_fields() -> Vec { + Vec::from_iter([ + Field::new("path", DataType::Utf8, false), + Field::new("size", DataType::Int64, false), + Field::new("modificationTime", DataType::Int64, false), + Field::new("dataChange", DataType::Boolean, false), + Field::new("stats", DataType::Utf8, true), + Field::new( + "partitionValues", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "tags", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "deletionVector", + DataType::Struct(Fields::from(vec![ + Field::new("storageType", DataType::Utf8, false), + Field::new("pathOrInlineDv", DataType::Utf8, false), + Field::new("offset", DataType::Int32, true), + Field::new("sizeInBytes", DataType::Int32, false), + Field::new("cardinality", DataType::Int64, false), + ])), + true, + ), + Field::new("baseRowId", DataType::Int64, true), + Field::new("defaultRowCommitVersion", DataType::Int64, true), + ]) +} + +fn cdc_fields() -> Vec { + Vec::from_iter([ + Field::new("path", DataType::Utf8, true), + Field::new( + "partitionValues", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new("size", DataType::Int64, true), + Field::new("dataChange", DataType::Boolean, true), + Field::new( + "tags", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + ]) +} + +fn remove_fields() -> Vec { + Vec::from_iter([ + Field::new("path", DataType::Utf8, true), + Field::new("deletionTimestamp", DataType::Int64, true), + Field::new("dataChange", DataType::Boolean, true), + Field::new("extendedFileMetadata", DataType::Boolean, true), + Field::new("size", DataType::Int64, true), + Field::new( + "partitionValues", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "tags", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + ]) +} + +fn metadata_fields() -> Vec { + Vec::from_iter([ + Field::new("id", DataType::Utf8, false), + Field::new("name", DataType::Utf8, true), + Field::new("description", DataType::Utf8, true), + Field::new( + "format", + DataType::Struct(Fields::from_iter([ + Field::new("provider", DataType::Utf8, true), + Field::new( + "options", + DataType::Map( + Arc::new(Field::new( + "key_value", + DataType::Struct(Fields::from_iter([ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ])), + false, + )), + false, + ), + false, + ), + ])), + false, + ), + Field::new("schemaString", DataType::Utf8, false), + Field::new("createdTime", DataType::Int64, true), + Field::new( + "partitionColumns", + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + false, + ), + Field::new( + "configuration", + DataType::Map( + Arc::new(Field::new( + "key_value", + DataType::Struct(Fields::from_iter([ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ])), + false, + )), + false, + ), + true, + ), + ]) +} + +fn protocol_fields() -> Vec { + Vec::from_iter([ + Field::new("minReaderVersion", DataType::Int32, false), + Field::new("minWriterVersion", DataType::Int32, false), + Field::new( + "readerFeatures", + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + true, + ), + Field::new( + "writerFeatures", + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + true, + ), + ]) +} + +fn txn_fields() -> Vec { + Vec::from_iter([ + 
Field::new("appId", DataType::Utf8, true), + Field::new("version", DataType::Int64, true), + Field::new("lastUpdated", DataType::Int64, true), + ]) +} + +fn watermark_fields() -> Vec { + Vec::from_iter([Field::new("highWaterMark", DataType::Int64, true)]) +} + +fn commit_info_fields() -> Vec { + Vec::from_iter([ + Field::new("timestamp", DataType::Int64, true), + Field::new("operation", DataType::Utf8, true), + Field::new("isolationLevel", DataType::Utf8, true), + Field::new("isBlindAppend", DataType::Boolean, true), + Field::new("txnId", DataType::Utf8, true), + Field::new("readVersion", DataType::Int32, true), + Field::new( + "operationParameters", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "operationMetrics", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + ]) +} + +fn domain_metadata_fields() -> Vec { + Vec::from_iter([ + Field::new("domain", DataType::Utf8, true), + Field::new( + "configuration", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new("removed", DataType::Boolean, true), + ]) +} + +fn get_map_field() -> Field { + Field::new( + "key_value", + DataType::Struct(Fields::from_iter([ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ])), + false, + ) +} diff --git a/crates/deltalake-core/src/kernel/actions/serde_path.rs b/crates/deltalake-core/src/kernel/actions/serde_path.rs new file mode 100644 index 0000000000..9868523e81 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/serde_path.rs @@ -0,0 +1,89 @@ +use std::str::Utf8Error; + +use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS}; +use serde::{self, Deserialize, Deserializer, Serialize, Serializer}; + +pub fn deserialize<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + decode_path(&s).map_err(serde::de::Error::custom) +} + +pub fn serialize(value: &str, serializer: S) -> Result +where + S: Serializer, +{ + let encoded = encode_path(value); + String::serialize(&encoded, serializer) +} + +pub const _DELIMITER: &str = "/"; +/// The path delimiter as a single byte +pub const _DELIMITER_BYTE: u8 = _DELIMITER.as_bytes()[0]; + +/// Characters we want to encode. 
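+/// For illustration only (these helpers are module-private, so the sketch below is
+/// not compiled): a `#` or `%` in a path is percent-encoded on serialization and
+/// restored on deserialization, while common path characters such as `=`, `-`, `.`
+/// and `/` pass through unchanged.
+///
+/// ```ignore
+/// assert_eq!(encode_path("c1=4/part-00000#old.parquet"), "c1=4/part-00000%23old.parquet");
+/// assert_eq!(decode_path("c1=4/part-00000%23old.parquet").unwrap(), "c1=4/part-00000#old.parquet");
+/// ```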
+const INVALID: &AsciiSet = &CONTROLS + // The delimiter we are reserving for internal hierarchy + // .add(DELIMITER_BYTE) + // Characters AWS recommends avoiding for object keys + // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html + .add(b'\\') + .add(b'{') + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') + .add(b'>') + .add(b'[') + // .add(b'~') + .add(b'<') + .add(b'#') + .add(b'|') + // Characters Google Cloud Storage recommends avoiding for object names + // https://cloud.google.com/storage/docs/naming-objects + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +fn encode_path(path: &str) -> String { + percent_encode(path.as_bytes(), INVALID).to_string() +} + +fn decode_path(path: &str) -> Result { + Ok(percent_decode_str(path).decode_utf8()?.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encode_path() { + let cases = [ + ( + "string=$%25&%2F()%3D%5E%22%5B%5D%23%2A%3F.%3A/part-00023-4b06bc90-0678-4a63-94a2-f09af1adb945.c000.snappy.parquet", + "string=$%2525&%252F()%253D%255E%2522%255B%255D%2523%252A%253F.%253A/part-00023-4b06bc90-0678-4a63-94a2-f09af1adb945.c000.snappy.parquet", + ), + ( + "string=$%25&%2F()%3D%5E%22<>~%5B%5D%7B}`%23|%2A%3F%2F%5Cr%5Cn.%3A/part-00023-e0a68495-8098-40a6-be5f-b502b111b789.c000.snappy.parquet", + "string=$%2525&%252F()%253D%255E%2522%3C%3E~%255B%255D%257B%7D%60%2523%7C%252A%253F%252F%255Cr%255Cn.%253A/part-00023-e0a68495-8098-40a6-be5f-b502b111b789.c000.snappy.parquet" + ), + ( + "string=$%25&%2F()%3D%5E%22<>~%5B%5D%7B}`%23|%2A%3F%2F%5Cr%5Cn.%3A_-/part-00023-346b6795-dafa-4948-bda5-ecdf4baa4445.c000.snappy.parquet", + "string=$%2525&%252F()%253D%255E%2522%3C%3E~%255B%255D%257B%7D%60%2523%7C%252A%253F%252F%255Cr%255Cn.%253A_-/part-00023-346b6795-dafa-4948-bda5-ecdf4baa4445.c000.snappy.parquet" + ) + ]; + + for (raw, expected) in cases { + let encoded = encode_path(raw); + assert_eq!(encoded, expected); + let decoded = decode_path(expected).unwrap(); + assert_eq!(decoded, raw); + } + } +} diff --git a/crates/deltalake-core/src/kernel/actions/types.rs b/crates/deltalake-core/src/kernel/actions/types.rs new file mode 100644 index 0000000000..166dbc98ef --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/types.rs @@ -0,0 +1,900 @@ +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +// use std::io::{Cursor, Read}; +// use std::sync::Arc; + +// use roaring::RoaringTreemap; +use log::warn; +use serde::{Deserialize, Serialize}; +use url::Url; + +use super::super::schema::StructType; +use super::super::{error::Error, DeltaResult}; +use super::serde_path; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +/// Defines a file format used in table +pub struct Format { + /// Name of the encoding for files in this table + pub provider: String, + /// A map containing configuration options for the format + pub options: HashMap>, +} + +impl Format { + /// Allows creation of a new action::Format + pub fn new(provider: String, options: Option>>) -> Self { + let options = options.unwrap_or_default(); + Self { provider, options } + } + + /// Return the Format provider + pub fn get_provider(self) -> String { + self.provider + } +} + +impl Default for Format { + fn default() -> Self { + Self { + provider: String::from("parquet"), + options: HashMap::new(), + } + } +} + +/// Return a default empty schema to be used for edge-cases when a schema is missing +fn default_schema() -> String { + warn!("A `metaData` action was missing a `schemaString` and has 
been given an empty schema"); + r#"{"type":"struct", "fields": []}"#.into() +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// Defines a metadata action +pub struct Metadata { + /// Unique identifier for this table + pub id: String, + /// User-provided identifier for this table + pub name: Option, + /// User-provided description for this table + pub description: Option, + /// Specification of the encoding for the files stored in the table + pub format: Format, + /// Schema of the table + #[serde(default = "default_schema")] + pub schema_string: String, + /// Column names by which the data should be partitioned + pub partition_columns: Vec, + /// The time when this metadata action is created, in milliseconds since the Unix epoch + pub created_time: Option, + /// Configuration options for the metadata action + pub configuration: HashMap>, +} + +impl Metadata { + /// Create a new metadata action + pub fn new( + id: impl Into, + format: Format, + schema_string: impl Into, + partition_columns: impl IntoIterator>, + configuration: Option>>, + ) -> Self { + Self { + id: id.into(), + format, + schema_string: schema_string.into(), + partition_columns: partition_columns.into_iter().map(|c| c.into()).collect(), + configuration: configuration.unwrap_or_default(), + name: None, + description: None, + created_time: None, + } + } + + /// set the table name in the metadata action + pub fn with_name(mut self, name: impl Into) -> Self { + self.name = Some(name.into()); + self + } + + /// set the table description in the metadata action + pub fn with_description(mut self, description: impl Into) -> Self { + self.description = Some(description.into()); + self + } + + /// set the table creation time in the metadata action + pub fn with_created_time(mut self, created_time: i64) -> Self { + self.created_time = Some(created_time); + self + } + + /// get the table schema + pub fn schema(&self) -> DeltaResult { + Ok(serde_json::from_str(&self.schema_string)?) 
+ } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// Defines a protocol action +pub struct Protocol { + /// The minimum version of the Delta read protocol that a client must implement + /// in order to correctly read this table + pub min_reader_version: i32, + /// The minimum version of the Delta write protocol that a client must implement + /// in order to correctly write this table + pub min_writer_version: i32, + /// A collection of features that a client must implement in order to correctly + /// read this table (exist only when minReaderVersion is set to 3) + pub reader_features: Option>, + /// A collection of features that a client must implement in order to correctly + /// write this table (exist only when minWriterVersion is set to 7) + pub writer_features: Option>, +} + +impl Protocol { + /// Create a new protocol action + pub fn new(min_reader_version: i32, min_wrriter_version: i32) -> Self { + Self { + min_reader_version, + min_writer_version: min_wrriter_version, + reader_features: None, + writer_features: None, + } + } + + /// set the reader features in the protocol action + pub fn with_reader_features( + mut self, + reader_features: impl IntoIterator>, + ) -> Self { + self.reader_features = Some(reader_features.into_iter().map(|c| c.into()).collect()); + self + } + + /// set the writer features in the protocol action + pub fn with_writer_features( + mut self, + writer_features: impl IntoIterator>, + ) -> Self { + self.writer_features = Some(writer_features.into_iter().map(|c| c.into()).collect()); + self + } +} + +/// Features table readers can support as well as let users know +/// what is supported +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] +#[serde(rename_all = "camelCase")] +pub enum ReaderFeatures { + /// Mapping of one column to another + ColumnMapping, + /// Deletion vectors for merge, update, delete + DeleteionVecotrs, + /// timestamps without timezone support + #[serde(alias = "timestampNtz")] + TimestampWithoutTimezone, + /// version 2 of checkpointing + V2Checkpoint, + /// If we do not match any other reader features + #[serde(untagged)] + Other(String), +} + +#[allow(clippy::from_over_into)] +impl Into for ReaderFeatures { + fn into(self) -> usize { + match self { + ReaderFeatures::Other(_) => 0, + ReaderFeatures::ColumnMapping => 2, + ReaderFeatures::DeleteionVecotrs + | ReaderFeatures::TimestampWithoutTimezone + | ReaderFeatures::V2Checkpoint => 3, + } + } +} + +#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] +impl From<&parquet::record::Field> for ReaderFeatures { + fn from(value: &parquet::record::Field) -> Self { + match value { + parquet::record::Field::Str(feature) => match feature.as_str() { + "columnMapping" => ReaderFeatures::ColumnMapping, + "deletionVectors" => ReaderFeatures::DeleteionVecotrs, + "timestampNtz" => ReaderFeatures::TimestampWithoutTimezone, + "v2Checkpoint" => ReaderFeatures::V2Checkpoint, + f => ReaderFeatures::Other(f.to_string()), + }, + f => ReaderFeatures::Other(f.to_string()), + } + } +} + +impl From for ReaderFeatures { + fn from(value: String) -> Self { + match value.as_str() { + "columnMapping" => ReaderFeatures::ColumnMapping, + "deletionVectors" => ReaderFeatures::DeleteionVecotrs, + "timestampNtz" => ReaderFeatures::TimestampWithoutTimezone, + "v2Checkpoint" => ReaderFeatures::V2Checkpoint, + f => ReaderFeatures::Other(f.to_string()), + } + } +} + +/// Features table writers can support as well as let users know +/// 
what is supported +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] +#[serde(rename_all = "camelCase")] +pub enum WriterFeatures { + /// Append Only Tables + AppendOnly, + /// Table invariants + Invariants, + /// Check constraints on columns + CheckConstraints, + /// CDF on a table + ChangeDataFeed, + /// Columns with generated values + GeneratedColumns, + /// Mapping of one column to another + ColumnMapping, + /// ID Columns + IdentityColumns, + /// Deletion vectors for merge, update, delete + DeleteionVecotrs, + /// Row tracking on tables + RowTracking, + /// timestamps without timezone support + #[serde(alias = "timestampNtz")] + TimestampWithoutTimezone, + /// domain specific metadata + DomainMetadata, + /// version 2 of checkpointing + V2Checkpoint, + /// Iceberg compatability support + IcebergCompatV1, + /// If we do not match any other reader features + #[serde(untagged)] + Other(String), +} + +#[allow(clippy::from_over_into)] +impl Into for WriterFeatures { + fn into(self) -> usize { + match self { + WriterFeatures::Other(_) => 0, + WriterFeatures::AppendOnly | WriterFeatures::Invariants => 2, + WriterFeatures::CheckConstraints => 3, + WriterFeatures::ChangeDataFeed | WriterFeatures::GeneratedColumns => 4, + WriterFeatures::ColumnMapping => 5, + WriterFeatures::IdentityColumns + | WriterFeatures::DeleteionVecotrs + | WriterFeatures::RowTracking + | WriterFeatures::TimestampWithoutTimezone + | WriterFeatures::DomainMetadata + | WriterFeatures::V2Checkpoint + | WriterFeatures::IcebergCompatV1 => 7, + } + } +} + +impl From for WriterFeatures { + fn from(value: String) -> Self { + match value.as_str() { + "appendOnly" => WriterFeatures::AppendOnly, + "invariants" => WriterFeatures::Invariants, + "checkConstraints" => WriterFeatures::CheckConstraints, + "changeDataFeed" => WriterFeatures::ChangeDataFeed, + "generatedColumns" => WriterFeatures::GeneratedColumns, + "columnMapping" => WriterFeatures::ColumnMapping, + "identityColumns" => WriterFeatures::IdentityColumns, + "deletionVectors" => WriterFeatures::DeleteionVecotrs, + "rowTracking" => WriterFeatures::RowTracking, + "timestampNtz" => WriterFeatures::TimestampWithoutTimezone, + "domainMetadata" => WriterFeatures::DomainMetadata, + "v2Checkpoint" => WriterFeatures::V2Checkpoint, + "icebergCompatV1" => WriterFeatures::IcebergCompatV1, + f => WriterFeatures::Other(f.to_string()), + } + } +} + +#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] +impl From<&parquet::record::Field> for WriterFeatures { + fn from(value: &parquet::record::Field) -> Self { + match value { + parquet::record::Field::Str(feature) => match feature.as_str() { + "appendOnly" => WriterFeatures::AppendOnly, + "invariants" => WriterFeatures::Invariants, + "checkConstraints" => WriterFeatures::CheckConstraints, + "changeDataFeed" => WriterFeatures::ChangeDataFeed, + "generatedColumns" => WriterFeatures::GeneratedColumns, + "columnMapping" => WriterFeatures::ColumnMapping, + "identityColumns" => WriterFeatures::IdentityColumns, + "deletionVectors" => WriterFeatures::DeleteionVecotrs, + "rowTracking" => WriterFeatures::RowTracking, + "timestampNtz" => WriterFeatures::TimestampWithoutTimezone, + "domainMetadata" => WriterFeatures::DomainMetadata, + "v2Checkpoint" => WriterFeatures::V2Checkpoint, + "icebergCompatV1" => WriterFeatures::IcebergCompatV1, + f => WriterFeatures::Other(f.to_string()), + }, + f => WriterFeatures::Other(f.to_string()), + } + } +} + +///Storage type of deletion vector +#[derive(Serialize, Deserialize, Clone, Debug, 
PartialEq, Eq)] +pub enum StorageType { + /// Stored at relative path derived from a UUID. + #[serde(rename = "u")] + UuidRelativePath, + /// Stored as inline string. + #[serde(rename = "i")] + Inline, + /// Stored at an absolute path. + #[serde(rename = "p")] + AbsolutePath, +} + +impl Default for StorageType { + fn default() -> Self { + Self::UuidRelativePath // seems to be used by Databricks and therefore most common + } +} + +impl FromStr for StorageType { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s { + "u" => Ok(Self::UuidRelativePath), + "i" => Ok(Self::Inline), + "p" => Ok(Self::AbsolutePath), + _ => Err(Error::DeletionVector(format!( + "Unknown storage format: '{s}'." + ))), + } + } +} + +impl AsRef for StorageType { + fn as_ref(&self) -> &str { + match self { + Self::UuidRelativePath => "u", + Self::Inline => "i", + Self::AbsolutePath => "p", + } + } +} + +impl ToString for StorageType { + fn to_string(&self) -> String { + self.as_ref().to_string() + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +/// Defines a deletion vector +pub struct DeletionVectorDescriptor { + /// A single character to indicate how to access the DV. Legal options are: ['u', 'i', 'p']. + pub storage_type: StorageType, + + /// Three format options are currently proposed: + /// - If `storageType = 'u'` then ``: + /// The deletion vector is stored in a file with a path relative to the data + /// directory of this Delta table, and the file name can be reconstructed from + /// the UUID. See Derived Fields for how to reconstruct the file name. The random + /// prefix is recovered as the extra characters before the (20 characters fixed length) uuid. + /// - If `storageType = 'i'` then ``: The deletion vector + /// is stored inline in the log. The format used is the `RoaringBitmapArray` + /// format also used when the DV is stored on disk and described in [Deletion Vector Format]. + /// - If `storageType = 'p'` then ``: The DV is stored in a file with an + /// absolute path given by this path, which has the same format as the `path` field + /// in the `add`/`remove` actions. + /// + /// [Deletion Vector Format]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Deletion-Vector-Format + pub path_or_inline_dv: String, + + /// Start of the data for this DV in number of bytes from the beginning of the file it is stored in. + /// Always None (absent in JSON) when `storageType = 'i'`. + pub offset: Option, + + /// Size of the serialized DV in bytes (raw data size, i.e. before base85 encoding, if inline). + pub size_in_bytes: i32, + + /// Number of rows the given DV logically removes from the file. 
+ pub cardinality: i64, +} + +impl DeletionVectorDescriptor { + /// get a unique idenitfier for the deletion vector + pub fn unique_id(&self) -> String { + if let Some(offset) = self.offset { + format!( + "{}{}@{offset}", + self.storage_type.as_ref(), + self.path_or_inline_dv + ) + } else { + format!("{}{}", self.storage_type.as_ref(), self.path_or_inline_dv) + } + } + + /// get the absolute path of the deletion vector + pub fn absolute_path(&self, parent: &Url) -> DeltaResult> { + match &self.storage_type { + StorageType::UuidRelativePath => { + let prefix_len = self.path_or_inline_dv.len() as i32 - 20; + if prefix_len < 0 { + return Err(Error::DeletionVector("Invalid length".to_string())); + } + let decoded = z85::decode(&self.path_or_inline_dv[(prefix_len as usize)..]) + .map_err(|_| Error::DeletionVector("Failed to decode DV uuid".to_string()))?; + let uuid = uuid::Uuid::from_slice(&decoded) + .map_err(|err| Error::DeletionVector(err.to_string()))?; + let mut dv_suffix = format!("deletion_vector_{uuid}.bin"); + if prefix_len > 0 { + dv_suffix = format!( + "{}/{}", + &self.path_or_inline_dv[..(prefix_len as usize)], + dv_suffix + ); + } + let dv_path = parent + .join(&dv_suffix) + .map_err(|_| Error::DeletionVector(format!("invalid path: {}", dv_suffix)))?; + Ok(Some(dv_path)) + } + StorageType::AbsolutePath => { + Ok(Some(Url::parse(&self.path_or_inline_dv).map_err(|_| { + Error::DeletionVector(format!("invalid path: {}", self.path_or_inline_dv)) + })?)) + } + StorageType::Inline => Ok(None), + } + } + + // TODO read only required byte ranges + // pub fn read( + // &self, + // fs_client: Arc, + // parent: Url, + // ) -> DeltaResult { + // match self.absolute_path(&parent)? { + // None => { + // let bytes = z85::decode(&self.path_or_inline_dv) + // .map_err(|_| Error::DeletionVector("Failed to decode DV".to_string()))?; + // RoaringTreemap::deserialize_from(&bytes[12..]) + // .map_err(|err| Error::DeletionVector(err.to_string())) + // } + // Some(path) => { + // let offset = self.offset; + // let size_in_bytes = self.size_in_bytes; + // + // let dv_data = fs_client + // .read_files(vec![(path, None)])? + // .next() + // .ok_or(Error::MissingData("No deletion Vector data".to_string()))??; + // + // let mut cursor = Cursor::new(dv_data); + // if let Some(offset) = offset { + // // TODO should we read the datasize from the DV file? + // // offset plus datasize bytes + // cursor.set_position((offset + 4) as u64); + // } + // + // let mut buf = vec![0; 4]; + // cursor + // .read(&mut buf) + // .map_err(|err| Error::DeletionVector(err.to_string()))?; + // let magic = + // i32::from_le_bytes(buf.try_into().map_err(|_| { + // Error::DeletionVector("filed to read magic bytes".to_string()) + // })?); + // println!("magic --> : {}", magic); + // // assert!(magic == 1681511377); + // + // let mut buf = vec![0; size_in_bytes as usize]; + // cursor + // .read(&mut buf) + // .map_err(|err| Error::DeletionVector(err.to_string()))?; + // + // RoaringTreemap::deserialize_from(Cursor::new(buf)) + // .map_err(|err| Error::DeletionVector(err.to_string())) + // } + // } + // } +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +#[serde(rename_all = "camelCase")] +/// Defines an add action +pub struct Add { + /// A relative path to a data file from the root of the table or an absolute path to a file + /// that should be added to the table. The path is a URI as specified by + /// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path. 
+ /// + /// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt + #[serde(with = "serde_path")] + pub path: String, + + /// A map from partition column to value for this logical file. + pub partition_values: HashMap>, + + /// The size of this data file in bytes + pub size: i64, + + /// The time this logical file was created, as milliseconds since the epoch. + pub modification_time: i64, + + /// When `false` the logical file must already be present in the table or the records + /// in the added file must be contained in one or more remove actions in the same version. + pub data_change: bool, + + /// Contains [statistics] (e.g., count, min/max values for columns) about the data in this logical file. + /// + /// [statistics]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Per-file-Statistics + pub stats: Option, + + /// Map containing metadata about this logical file. + pub tags: Option>>, + + /// Information about deletion vector (DV) associated with this add action + pub deletion_vector: Option, + + /// Default generated Row ID of the first row in the file. The default generated Row IDs + /// of the other rows in the file can be reconstructed by adding the physical index of the + /// row within the file to the base Row ID + pub base_row_id: Option, + + /// First commit version in which an add action with the same path was committed to the table. + pub default_row_commit_version: Option, + + // TODO remove migration filds added to not do too many business logic changes in one PR + /// Partition values stored in raw parquet struct format. In this struct, the column names + /// correspond to the partition columns and the values are stored in their corresponding data + /// type. This is a required field when the table is partitioned and the table property + /// delta.checkpoint.writeStatsAsStruct is set to true. If the table is not partitioned, this + /// column can be omitted. + /// + /// This field is only available in add action records read from checkpoints + #[cfg(feature = "parquet")] + #[serde(skip_serializing, skip_deserializing)] + pub partition_values_parsed: Option, + /// Partition values parsed for parquet2 + #[cfg(feature = "parquet2")] + #[serde(skip_serializing, skip_deserializing)] + pub partition_values_parsed: Option, + + /// Contains statistics (e.g., count, min/max values for columns) about the data in this file in + /// raw parquet format. This field needs to be written when statistics are available and the + /// table property: delta.checkpoint.writeStatsAsStruct is set to true. + /// + /// This field is only available in add action records read from checkpoints + #[cfg(feature = "parquet")] + #[serde(skip_serializing, skip_deserializing)] + pub stats_parsed: Option, + /// Stats parsed for parquet2 + #[cfg(feature = "parquet2")] + #[serde(skip_serializing, skip_deserializing)] + pub stats_parsed: Option, +} + +impl Add { + /// get the unique id of the deletion vector, if any + pub fn dv_unique_id(&self) -> Option { + self.deletion_vector.clone().map(|dv| dv.unique_id()) + } + + /// set the base row id of the add action + pub fn with_base_row_id(mut self, base_row_id: i64) -> Self { + self.base_row_id = Some(base_row_id); + self + } +} + +/// Represents a tombstone (deleted file) in the Delta log. +#[derive(Serialize, Deserialize, Debug, Clone, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub struct Remove { + /// A relative path to a data file from the root of the table or an absolute path to a file + /// that should be added to the table. 
The path is a URI as specified by + /// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path. + /// + /// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt + pub path: String, + + /// When `false` the logical file must already be present in the table or the records + /// in the added file must be contained in one or more remove actions in the same version. + pub data_change: bool, + + /// The time this logical file was created, as milliseconds since the epoch. + pub deletion_timestamp: Option, + + /// When true the fields `partition_values`, `size`, and `tags` are present + pub extended_file_metadata: Option, + + /// A map from partition column to value for this logical file. + #[serde(skip_serializing_if = "Option::is_none")] + pub partition_values: Option>>, + + /// The size of this data file in bytes + #[serde(skip_serializing_if = "Option::is_none")] + pub size: Option, + + /// Map containing metadata about this logical file. + #[serde(skip_serializing_if = "Option::is_none")] + pub tags: Option>>, + + /// Information about deletion vector (DV) associated with this add action + #[serde(skip_serializing_if = "Option::is_none")] + pub deletion_vector: Option, + + /// Default generated Row ID of the first row in the file. The default generated Row IDs + /// of the other rows in the file can be reconstructed by adding the physical index of the + /// row within the file to the base Row ID + pub base_row_id: Option, + + /// First commit version in which an add action with the same path was committed to the table. + pub default_row_commit_version: Option, +} + +impl Remove { + /// get the unique id of the deletion vector, if any + pub fn dv_unique_id(&self) -> Option { + self.deletion_vector.clone().map(|dv| dv.unique_id()) + } +} + +/// Delta AddCDCFile action that describes a parquet CDC data file. +#[derive(Serialize, Deserialize, Clone, Debug, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct AddCDCFile { + /// A relative path, from the root of the table, or an + /// absolute path to a CDC file + #[serde(with = "serde_path")] + pub path: String, + /// The size of this file in bytes + pub size: i64, + /// A map from partition column to value for this file + pub partition_values: HashMap>, + /// Should always be set to false because they do not change the underlying data of the table + pub data_change: bool, + /// Map containing metadata about this file + pub tags: Option>>, +} + +/// Action used by streaming systems to track progress using application-specific versions to +/// enable idempotency. +#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct Txn { + /// A unique identifier for the application performing the transaction. + pub app_id: String, + /// An application-specific numeric identifier for this transaction. + pub version: i64, + /// The time when this transaction action was created in milliseconds since the Unix epoch. + pub last_updated: Option, +} + +/// The commitInfo is a fairly flexible action within the delta specification, where arbitrary data can be stored. +/// However the reference implementation as well as delta-rs store useful information that may for instance +/// allow us to be more permissive in commit conflict resolution. 
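+///
+/// As a minimal, illustrative sketch (the keys shown are arbitrary examples), extra
+/// commit metadata can be attached via the `Action::commit_info` helper defined in
+/// this module; any additional key/value pairs land in the flattened `info` map:
+///
+/// ```ignore
+/// use std::collections::HashMap;
+/// use serde_json::json;
+///
+/// let mut info = HashMap::new();
+/// info.insert("myAppVersion".to_string(), json!("0.1.0"));
+/// info.insert("myJobId".to_string(), json!(42));
+/// let action = Action::commit_info(info);
+/// ```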
+#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct CommitInfo { + /// Timestamp in millis when the commit was created + #[serde(skip_serializing_if = "Option::is_none")] + pub timestamp: Option, + /// Id of the user invoking the commit + #[serde(skip_serializing_if = "Option::is_none")] + pub user_id: Option, + /// Name of the user invoking the commit + #[serde(skip_serializing_if = "Option::is_none")] + pub user_name: Option, + /// The operation performed during the + #[serde(skip_serializing_if = "Option::is_none")] + pub operation: Option, + /// Parameters used for table operation + #[serde(skip_serializing_if = "Option::is_none")] + pub operation_parameters: Option>, + /// Version of the table when the operation was started + #[serde(skip_serializing_if = "Option::is_none")] + pub read_version: Option, + /// The isolation level of the commit + #[serde(skip_serializing_if = "Option::is_none")] + pub isolation_level: Option, + /// TODO + #[serde(skip_serializing_if = "Option::is_none")] + pub is_blind_append: Option, + /// Delta engine which created the commit. + #[serde(skip_serializing_if = "Option::is_none")] + pub engine_info: Option, + /// Additional provenance information for the commit + #[serde(flatten, default)] + pub info: HashMap, +} + +/// The domain metadata action contains a configuration (string) for a named metadata domain +#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct DomainMetadata { + /// Identifier for this domain (system or user-provided) + pub domain: String, + /// String containing configuration for the metadata domain + pub configuration: String, + /// When `true` the action serves as a tombstone + pub removed: bool, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +/// The isolation level applied during transaction +pub enum IsolationLevel { + /// The strongest isolation level. It ensures that committed write operations + /// and all reads are Serializable. Operations are allowed as long as there + /// exists a serial sequence of executing them one-at-a-time that generates + /// the same outcome as that seen in the table. For the write operations, + /// the serial sequence is exactly the same as that seen in the table’s history. + Serializable, + + /// A weaker isolation level than Serializable. It ensures only that the write + /// operations (that is, not reads) are serializable. However, this is still stronger + /// than Snapshot isolation. WriteSerializable is the default isolation level because + /// it provides great balance of data consistency and availability for most common operations. + WriteSerializable, + + /// SnapshotIsolation is a guarantee that all reads made in a transaction will see a consistent + /// snapshot of the database (in practice it reads the last committed values that existed at the + /// time it started), and the transaction itself will successfully commit only if no updates + /// it has made conflict with any concurrent updates made since that snapshot. 
+ SnapshotIsolation, +} + +// Spark assumes Serializable as default isolation level +// https://github.com/delta-io/delta/blob/abb171c8401200e7772b27e3be6ea8682528ac72/core/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala#L1023 +impl Default for IsolationLevel { + fn default() -> Self { + Self::Serializable + } +} + +impl AsRef for IsolationLevel { + fn as_ref(&self) -> &str { + match self { + Self::Serializable => "Serializable", + Self::WriteSerializable => "WriteSerializable", + Self::SnapshotIsolation => "SnapshotIsolation", + } + } +} + +impl FromStr for IsolationLevel { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "serializable" => Ok(Self::Serializable), + "writeserializable" | "write_serializable" => Ok(Self::WriteSerializable), + "snapshotisolation" | "snapshot_isolation" => Ok(Self::SnapshotIsolation), + _ => Err(Error::Generic("Invalid string for IsolationLevel".into())), + } + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + // use std::sync::Arc; + + // use object_store::local::LocalFileSystem; + + use crate::kernel::PrimitiveType; + + use super::*; + // use crate::client::filesystem::ObjectStoreFileSystemClient; + // use crate::executor::tokio::TokioBackgroundExecutor; + + fn dv_relateive() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "u".parse().unwrap(), + path_or_inline_dv: "ab^-aqEH.-t@S}K{vb[*k^".to_string(), + offset: Some(4), + size_in_bytes: 40, + cardinality: 6, + } + } + + fn dv_absolute() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "p".parse().unwrap(), + path_or_inline_dv: + "s3://mytable/deletion_vector_d2c639aa-8816-431a-aaf6-d3fe2512ff61.bin".to_string(), + offset: Some(4), + size_in_bytes: 40, + cardinality: 6, + } + } + + fn dv_inline() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "i".parse().unwrap(), + path_or_inline_dv: "wi5b=000010000siXQKl0rr91000f55c8Xg0@@D72lkbi5=-{L".to_string(), + offset: None, + size_in_bytes: 40, + cardinality: 6, + } + } + + fn dv_example() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "u".parse().unwrap(), + path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), + offset: Some(1), + size_in_bytes: 36, + cardinality: 2, + } + } + + #[test] + fn test_deletion_vector_absolute_path() { + let parent = Url::parse("s3://mytable/").unwrap(); + + let relative = dv_relateive(); + let expected = + Url::parse("s3://mytable/ab/deletion_vector_d2c639aa-8816-431a-aaf6-d3fe2512ff61.bin") + .unwrap(); + assert_eq!(expected, relative.absolute_path(&parent).unwrap().unwrap()); + + let absolute = dv_absolute(); + let expected = + Url::parse("s3://mytable/deletion_vector_d2c639aa-8816-431a-aaf6-d3fe2512ff61.bin") + .unwrap(); + assert_eq!(expected, absolute.absolute_path(&parent).unwrap().unwrap()); + + let inline = dv_inline(); + assert_eq!(None, inline.absolute_path(&parent).unwrap()); + + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let parent = url::Url::from_directory_path(path).unwrap(); + let dv_url = parent + .join("deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin") + .unwrap(); + let example = dv_example(); + assert_eq!(dv_url, example.absolute_path(&parent).unwrap().unwrap()); + } + + #[test] + fn test_primitive() { + let types: PrimitiveType = serde_json::from_str("\"string\"").unwrap(); + println!("{:?}", types); + } + + // #[test] + // fn test_deletion_vector_read() { + 
// let store = Arc::new(LocalFileSystem::new()); + // let path = + // std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + // let parent = url::Url::from_directory_path(path).unwrap(); + // let root = object_store::path::Path::from(parent.path()); + // let fs_client = Arc::new(ObjectStoreFileSystemClient::new( + // store, + // root, + // Arc::new(TokioBackgroundExecutor::new()), + // )); + // + // let example = dv_example(); + // let tree_map = example.read(fs_client, parent).unwrap(); + // + // let expected: Vec = vec![0, 9]; + // let found = tree_map.iter().collect::>(); + // assert_eq!(found, expected) + // } +} diff --git a/crates/deltalake-core/src/kernel/error.rs b/crates/deltalake-core/src/kernel/error.rs new file mode 100644 index 0000000000..8ec799ca96 --- /dev/null +++ b/crates/deltalake-core/src/kernel/error.rs @@ -0,0 +1,78 @@ +//! Error types for Delta Lake operations. + +/// A specialized [`Result`] type for Delta Lake operations. +pub type DeltaResult = std::result::Result; + +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[cfg(feature = "arrow")] + #[error("Arrow error: {0}")] + Arrow(#[from] arrow_schema::ArrowError), + + #[error("Generic delta kernel error: {0}")] + Generic(String), + + #[error("Generic error: {source}")] + GenericError { + /// Source error + source: Box, + }, + + #[cfg(feature = "parquet")] + #[error("Arrow error: {0}")] + Parquet(#[from] parquet::errors::ParquetError), + + #[cfg(feature = "object_store")] + #[error("Error interacting with object store: {0}")] + ObjectStore(object_store::Error), + + #[error("File not found: {0}")] + FileNotFound(String), + + #[error("{0}")] + MissingColumn(String), + + #[error("Expected column type: {0}")] + UnexpectedColumnType(String), + + #[error("Expected is missing: {0}")] + MissingData(String), + + #[error("No table version found.")] + MissingVersion, + + #[error("Deletion Vector error: {0}")] + DeletionVector(String), + + #[error("Schema error: {0}")] + Schema(String), + + #[error("Invalid url: {0}")] + InvalidUrl(#[from] url::ParseError), + + #[error("Invalid url: {0}")] + MalformedJson(#[from] serde_json::Error), + + #[error("No table metadata found in delta log.")] + MissingMetadata, + + /// Error returned when the log contains invalid stats JSON. + #[error("Invalid JSON in invariant expression, line=`{line}`, err=`{json_err}`")] + InvalidInvariantJson { + /// JSON error details returned when parsing the invariant expression JSON. + json_err: serde_json::error::Error, + /// Invariant expression. + line: String, + }, +} + +#[cfg(feature = "object_store")] +impl From for Error { + fn from(value: object_store::Error) -> Self { + match value { + object_store::Error::NotFound { path, .. } => Self::FileNotFound(path), + err => Self::ObjectStore(err), + } + } +} diff --git a/crates/deltalake-core/src/kernel/mod.rs b/crates/deltalake-core/src/kernel/mod.rs new file mode 100644 index 0000000000..7785c273f9 --- /dev/null +++ b/crates/deltalake-core/src/kernel/mod.rs @@ -0,0 +1,9 @@ +//! Kernel module + +pub mod actions; +pub mod error; +pub mod schema; + +pub use actions::*; +pub use error::*; +pub use schema::*; diff --git a/crates/deltalake-core/src/kernel/schema.rs b/crates/deltalake-core/src/kernel/schema.rs new file mode 100644 index 0000000000..12391ca6e8 --- /dev/null +++ b/crates/deltalake-core/src/kernel/schema.rs @@ -0,0 +1,788 @@ +//! 
Delta table schema + +use std::fmt::Formatter; +use std::sync::Arc; +use std::{collections::HashMap, fmt::Display}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use super::error::Error; + +/// Type alias for a top level schema +pub type Schema = StructType; +/// Schema reference type +pub type SchemaRef = Arc; + +/// A value that can be stored in the metadata of a Delta table schema entity. +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(untagged)] +pub enum MetadataValue { + /// A number value + Number(i32), + /// A string value + String(String), +} + +impl From for MetadataValue { + fn from(value: String) -> Self { + Self::String(value) + } +} + +impl From<&String> for MetadataValue { + fn from(value: &String) -> Self { + Self::String(value.clone()) + } +} + +impl From for MetadataValue { + fn from(value: i32) -> Self { + Self::Number(value) + } +} + +impl From for MetadataValue { + fn from(value: Value) -> Self { + Self::String(value.to_string()) + } +} + +#[derive(Debug)] +#[allow(missing_docs)] +pub enum ColumnMetadataKey { + ColumnMappingId, + ColumnMappingPhysicalName, + GenerationExpression, + IdentityStart, + IdentityStep, + IdentityHighWaterMark, + IdentityAllowExplicitInsert, + Invariants, +} + +impl AsRef for ColumnMetadataKey { + fn as_ref(&self) -> &str { + match self { + Self::ColumnMappingId => "delta.columnMapping.id", + Self::ColumnMappingPhysicalName => "delta.columnMapping.physicalName", + Self::GenerationExpression => "delta.generationExpression", + Self::IdentityAllowExplicitInsert => "delta.identity.allowExplicitInsert", + Self::IdentityHighWaterMark => "delta.identity.highWaterMark", + Self::IdentityStart => "delta.identity.start", + Self::IdentityStep => "delta.identity.step", + Self::Invariants => "delta.invariants", + } + } +} + +/// An invariant for a column that is enforced on all writes to a Delta table. +#[derive(Eq, PartialEq, Debug, Default, Clone)] +pub struct Invariant { + /// The full path to the field. + pub field_name: String, + /// The SQL string that must always evaluate to true. + pub invariant_sql: String, +} + +impl Invariant { + /// Create a new invariant + pub fn new(field_name: &str, invariant_sql: &str) -> Self { + Self { + field_name: field_name.to_string(), + invariant_sql: invariant_sql.to_string(), + } + } +} + +/// Represents a struct field defined in the Delta table schema. 
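+///
+/// A small construction sketch (the column name and metadata value are arbitrary
+/// examples):
+///
+/// ```ignore
+/// // A nullable string column carrying a column-mapping id in its metadata.
+/// let field = StructField::new("customer_id", DataType::string(), true)
+///     .with_metadata([("delta.columnMapping.id", 5)]);
+/// assert_eq!(field.name(), "customer_id");
+/// assert!(field.is_nullable());
+/// ```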
+// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +pub struct StructField { + /// Name of this (possibly nested) column + pub name: String, + /// The data type of this field + #[serde(rename = "type")] + pub data_type: DataType, + /// Denotes whether this Field can be null + pub nullable: bool, + /// A JSON map containing information about this column + pub metadata: HashMap, +} + +impl StructField { + /// Creates a new field + pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { + Self { + name: name.into(), + data_type, + nullable, + metadata: HashMap::default(), + } + } + + /// Creates a new field with metadata + pub fn with_metadata( + mut self, + metadata: impl IntoIterator, impl Into)>, + ) -> Self { + self.metadata = metadata + .into_iter() + .map(|(k, v)| (k.into(), v.into())) + .collect(); + self + } + + /// Get the value of a specific metadata key + pub fn get_config_value(&self, key: &ColumnMetadataKey) -> Option<&MetadataValue> { + self.metadata.get(key.as_ref()) + } + + #[inline] + /// Returns the name of the column + pub fn name(&self) -> &String { + &self.name + } + + #[inline] + /// Returns whether the column is nullable + pub fn is_nullable(&self) -> bool { + self.nullable + } + + #[inline] + /// Returns the data type of the column + pub const fn data_type(&self) -> &DataType { + &self.data_type + } + + #[inline] + /// Returns the metadata of the column + pub const fn metadata(&self) -> &HashMap { + &self.metadata + } +} + +/// A struct is used to represent both the top-level schema of the table +/// as well as struct columns that contain nested columns. +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +pub struct StructType { + #[serde(rename = "type")] + /// The type of this struct + pub type_name: String, + /// The type of element stored in this array + pub fields: Vec, +} + +impl StructType { + /// Creates a new struct type + pub fn new(fields: Vec) -> Self { + Self { + type_name: "struct".into(), + fields, + } + } + + /// Returns an immutable reference of the fields in the struct + pub fn fields(&self) -> &Vec { + &self.fields + } + + /// Find the index of the column with the given name. + pub fn index_of(&self, name: &str) -> Result { + let (idx, _) = self + .fields() + .iter() + .enumerate() + .find(|(_, b)| b.name() == name) + .ok_or_else(|| { + let valid_fields: Vec<_> = self.fields.iter().map(|f| f.name()).collect(); + Error::Schema(format!( + "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" + )) + })?; + Ok(idx) + } + + /// Returns a reference of a specific [`StructField`] instance selected by name. 
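+ ///
+ /// Sketch (assumes a schema that actually contains an "id" column):
+ ///
+ /// ```ignore
+ /// let field = schema.field_with_name("id")?;
+ /// assert_eq!(field.name(), "id");
+ /// ```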
+ pub fn field_with_name(&self, name: &str) -> Result<&StructField, Error> { + Ok(&self.fields[self.index_of(name)?]) + } + + /// Get all invariants in the schemas + pub fn get_invariants(&self) -> Result, Error> { + let mut remaining_fields: Vec<(String, StructField)> = self + .fields() + .iter() + .map(|field| (field.name.clone(), field.clone())) + .collect(); + let mut invariants: Vec = Vec::new(); + + let add_segment = |prefix: &str, segment: &str| -> String { + if prefix.is_empty() { + segment.to_owned() + } else { + format!("{prefix}.{segment}") + } + }; + + while let Some((field_path, field)) = remaining_fields.pop() { + match field.data_type() { + DataType::Struct(inner) => { + remaining_fields.extend( + inner + .fields() + .iter() + .map(|field| { + let new_prefix = add_segment(&field_path, &field.name); + (new_prefix, field.clone()) + }) + .collect::>(), + ); + } + DataType::Array(inner) => { + let element_field_name = add_segment(&field_path, "element"); + remaining_fields.push(( + element_field_name, + StructField::new("".to_string(), inner.element_type.clone(), false), + )); + } + DataType::Map(inner) => { + let key_field_name = add_segment(&field_path, "key"); + remaining_fields.push(( + key_field_name, + StructField::new("".to_string(), inner.key_type.clone(), false), + )); + let value_field_name = add_segment(&field_path, "value"); + remaining_fields.push(( + value_field_name, + StructField::new("".to_string(), inner.value_type.clone(), false), + )); + } + _ => {} + } + // JSON format: {"expression": {"expression": ""} } + if let Some(MetadataValue::String(invariant_json)) = + field.metadata.get(ColumnMetadataKey::Invariants.as_ref()) + { + let json: Value = serde_json::from_str(invariant_json).map_err(|e| { + Error::InvalidInvariantJson { + json_err: e, + line: invariant_json.to_string(), + } + })?; + if let Value::Object(json) = json { + if let Some(Value::Object(expr1)) = json.get("expression") { + if let Some(Value::String(sql)) = expr1.get("expression") { + invariants.push(Invariant::new(&field_path, sql)); + } + } + } + } + } + Ok(invariants) + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(rename_all = "camelCase")] +/// An array stores a variable length collection of items of some type. 
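+///
+/// Illustrative serde sketch, mirroring the protocol's JSON schema serialization
+/// format (see the tests at the bottom of this file for more variants):
+///
+/// ```ignore
+/// // array<integer> that allows null elements
+/// let json = r#"{"type":"array","elementType":"integer","containsNull":true}"#;
+/// let arr: ArrayType = serde_json::from_str(json).unwrap();
+/// assert!(arr.contains_null());
+/// assert_eq!(arr.element_type(), &DataType::integer());
+/// ```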
+pub struct ArrayType { + #[serde(rename = "type")] + /// The type of this struct + pub type_name: String, + /// The type of element stored in this array + pub element_type: DataType, + /// Denoting whether this array can contain one or more null values + pub contains_null: bool, +} + +impl ArrayType { + /// Creates a new array type + pub fn new(element_type: DataType, contains_null: bool) -> Self { + Self { + type_name: "array".into(), + element_type, + contains_null, + } + } + + #[inline] + /// Returns the element type of the array + pub const fn element_type(&self) -> &DataType { + &self.element_type + } + + #[inline] + /// Returns whether the array can contain null values + pub const fn contains_null(&self) -> bool { + self.contains_null + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(rename_all = "camelCase")] +/// A map stores an arbitrary length collection of key-value pairs +pub struct MapType { + #[serde(rename = "type")] + /// The type of this struct + pub type_name: String, + /// The type of element used for the key of this map + pub key_type: DataType, + /// The type of element used for the value of this map + pub value_type: DataType, + /// Denoting whether this array can contain one or more null values + #[serde(default = "default_true")] + pub value_contains_null: bool, +} + +impl MapType { + /// Creates a new map type + pub fn new(key_type: DataType, value_type: DataType, value_contains_null: bool) -> Self { + Self { + type_name: "map".into(), + key_type, + value_type, + value_contains_null, + } + } + + #[inline] + /// Returns the key type of the map + pub const fn key_type(&self) -> &DataType { + &self.key_type + } + + #[inline] + /// Returns the value type of the map + pub const fn value_type(&self) -> &DataType { + &self.value_type + } + + #[inline] + /// Returns whether the map can contain null values + pub const fn value_contains_null(&self) -> bool { + self.value_contains_null + } +} + +fn default_true() -> bool { + true +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(rename_all = "camelCase")] +/// Primitive types supported by Delta +pub enum PrimitiveType { + /// UTF-8 encoded string of characters + String, + /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807 + Long, + /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647 + Integer, + /// i16: 2-byte signed integer numbers. Range: -32768 to 32767 + Short, + /// i8: 1-byte signed integer number. Range: -128 to 127 + Byte, + /// f32: 4-byte single-precision floating-point numbers + Float, + /// f64: 8-byte double-precision floating-point numbers + Double, + /// bool: boolean values + Boolean, + /// Binary: uninterpreted binary data + Binary, + /// Date: Calendar date (year, month, day) + Date, + /// Microsecond precision timestamp, adjusted to UTC. 
+    Timestamp,
+    // TODO: timestamp without timezone
+    #[serde(
+        serialize_with = "serialize_decimal",
+        deserialize_with = "deserialize_decimal",
+        untagged
+    )]
+    /// Decimal: arbitrary precision decimal numbers
+    Decimal(i32, i32),
+}
+
+fn serialize_decimal<S: serde::Serializer>(
+    precision: &i32,
+    scale: &i32,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    serializer.serialize_str(&format!("decimal({},{})", precision, scale))
+}
+
+fn deserialize_decimal<'de, D>(deserializer: D) -> Result<(i32, i32), D::Error>
+where
+    D: serde::Deserializer<'de>,
+{
+    let str_value = String::deserialize(deserializer)?;
+    if !str_value.starts_with("decimal(") || !str_value.ends_with(')') {
+        return Err(serde::de::Error::custom(format!(
+            "Invalid decimal: {}",
+            str_value
+        )));
+    }
+
+    let mut parts = str_value[8..str_value.len() - 1].split(',');
+    let precision = parts
+        .next()
+        .and_then(|part| part.trim().parse::<i32>().ok())
+        .ok_or_else(|| {
+            serde::de::Error::custom(format!("Invalid precision in decimal: {}", str_value))
+        })?;
+    let scale = parts
+        .next()
+        .and_then(|part| part.trim().parse::<i32>().ok())
+        .ok_or_else(|| {
+            serde::de::Error::custom(format!("Invalid scale in decimal: {}", str_value))
+        })?;
+
+    Ok((precision, scale))
+}
+
+impl Display for PrimitiveType {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        match self {
+            PrimitiveType::String => write!(f, "string"),
+            PrimitiveType::Long => write!(f, "long"),
+            PrimitiveType::Integer => write!(f, "integer"),
+            PrimitiveType::Short => write!(f, "short"),
+            PrimitiveType::Byte => write!(f, "byte"),
+            PrimitiveType::Float => write!(f, "float"),
+            PrimitiveType::Double => write!(f, "double"),
+            PrimitiveType::Boolean => write!(f, "boolean"),
+            PrimitiveType::Binary => write!(f, "binary"),
+            PrimitiveType::Date => write!(f, "date"),
+            PrimitiveType::Timestamp => write!(f, "timestamp"),
+            PrimitiveType::Decimal(precision, scale) => {
+                write!(f, "decimal({},{})", precision, scale)
+            }
+        }
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)]
+#[serde(untagged, rename_all = "camelCase")]
+/// The data type of a column
+pub enum DataType {
+    /// UTF-8 encoded string of characters
+    Primitive(PrimitiveType),
+    /// An array stores a variable length collection of items of some type.
+    Array(Box<ArrayType>),
+    /// A struct is used to represent both the top-level schema of the table as well
+    /// as struct columns that contain nested columns.
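(Aside; the remaining `DataType` variants continue below.) To make the serde shape of these kernel types concrete, here is a small sketch, not part of the patch, that composes a nested schema with the constructors defined above and prints the Delta-style JSON. It assumes the `deltalake_core::kernel` path and that `StructType` derives `Serialize` like the types shown here.

```rust
use deltalake_core::kernel::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType};

fn example_schema() -> StructType {
    StructType::new(vec![
        StructField::new("id".to_string(), DataType::Primitive(PrimitiveType::String), false),
        StructField::new(
            "tags".to_string(),
            // serializes as {"type":"array","elementType":"string","containsNull":true}
            DataType::Array(Box::new(ArrayType::new(
                DataType::Primitive(PrimitiveType::String),
                true,
            ))),
            true,
        ),
        StructField::new(
            "attributes".to_string(),
            // serializes as a camelCase "map" object with keyType/valueType fields
            DataType::Map(Box::new(MapType::new(
                DataType::Primitive(PrimitiveType::String),
                DataType::Primitive(PrimitiveType::Long),
                false,
            ))),
            true,
        ),
    ])
}

fn main() {
    println!("{}", serde_json::to_string_pretty(&example_schema()).unwrap());
}
```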
+ Struct(Box), + /// A map stores an arbitrary length collection of key-value pairs + /// with a single keyType and a single valueType + Map(Box), +} + +impl DataType { + /// create a new string type + pub fn string() -> Self { + DataType::Primitive(PrimitiveType::String) + } + + /// create a new long type + pub fn long() -> Self { + DataType::Primitive(PrimitiveType::Long) + } + + /// create a new integer type + pub fn integer() -> Self { + DataType::Primitive(PrimitiveType::Integer) + } + + /// create a new short type + pub fn short() -> Self { + DataType::Primitive(PrimitiveType::Short) + } + + /// create a new byte type + pub fn byte() -> Self { + DataType::Primitive(PrimitiveType::Byte) + } + + /// create a new float type + pub fn float() -> Self { + DataType::Primitive(PrimitiveType::Float) + } + + /// create a new double type + pub fn double() -> Self { + DataType::Primitive(PrimitiveType::Double) + } + + /// create a new boolean type + pub fn boolean() -> Self { + DataType::Primitive(PrimitiveType::Boolean) + } + + /// create a new binary type + pub fn binary() -> Self { + DataType::Primitive(PrimitiveType::Binary) + } + + /// create a new date type + pub fn date() -> Self { + DataType::Primitive(PrimitiveType::Date) + } + + /// create a new timestamp type + pub fn timestamp() -> Self { + DataType::Primitive(PrimitiveType::Timestamp) + } + + /// create a new decimal type + pub fn decimal(precision: usize, scale: usize) -> Self { + DataType::Primitive(PrimitiveType::Decimal(precision as i32, scale as i32)) + } +} + +impl Display for DataType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + DataType::Primitive(p) => write!(f, "{}", p), + DataType::Array(a) => write!(f, "array<{}>", a.element_type), + DataType::Struct(s) => { + write!(f, "struct<")?; + for (i, field) in s.fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}: {}", field.name, field.data_type)?; + } + write!(f, ">") + } + DataType::Map(m) => write!(f, "map<{}, {}>", m.key_type, m.value_type), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json; + use serde_json::json; + + #[test] + fn test_serde_data_types() { + let data = r#" + { + "name": "a", + "type": "integer", + "nullable": false, + "metadata": {} + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(matches!( + field.data_type, + DataType::Primitive(PrimitiveType::Integer) + )); + + let data = r#" + { + "name": "c", + "type": { + "type": "array", + "elementType": "integer", + "containsNull": false + }, + "nullable": true, + "metadata": {} + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(matches!(field.data_type, DataType::Array(_))); + + let data = r#" + { + "name": "e", + "type": { + "type": "array", + "elementType": { + "type": "struct", + "fields": [ + { + "name": "d", + "type": "integer", + "nullable": false, + "metadata": {} + } + ] + }, + "containsNull": true + }, + "nullable": true, + "metadata": {} + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(matches!(field.data_type, DataType::Array(_))); + match field.data_type { + DataType::Array(array) => assert!(matches!(array.element_type, DataType::Struct(_))), + _ => unreachable!(), + } + + let data = r#" + { + "name": "f", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + "#; + let field: StructField = 
serde_json::from_str(data).unwrap(); + assert!(matches!(field.data_type, DataType::Map(_))); + } + + #[test] + fn test_roundtrip_decimal() { + let data = r#" + { + "name": "a", + "type": "decimal(10, 2)", + "nullable": false, + "metadata": {} + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(matches!( + field.data_type, + DataType::Primitive(PrimitiveType::Decimal(10, 2)) + )); + + let json_str = serde_json::to_string(&field).unwrap(); + assert_eq!( + json_str, + r#"{"name":"a","type":"decimal(10,2)","nullable":false,"metadata":{}}"# + ); + } + + #[test] + fn test_field_metadata() { + let data = r#" + { + "name": "e", + "type": { + "type": "array", + "elementType": { + "type": "struct", + "fields": [ + { + "name": "d", + "type": "integer", + "nullable": false, + "metadata": { + "delta.columnMapping.id": 5, + "delta.columnMapping.physicalName": "col-a7f4159c-53be-4cb0-b81a-f7e5240cfc49" + } + } + ] + }, + "containsNull": true + }, + "nullable": true, + "metadata": { + "delta.columnMapping.id": 4, + "delta.columnMapping.physicalName": "col-5f422f40-de70-45b2-88ab-1d5c90e94db1" + } + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + + let col_id = field + .get_config_value(&ColumnMetadataKey::ColumnMappingId) + .unwrap(); + assert!(matches!(col_id, MetadataValue::Number(num) if *num == 4)); + let physical_name = field + .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName) + .unwrap(); + assert!( + matches!(physical_name, MetadataValue::String(name) if *name == "col-5f422f40-de70-45b2-88ab-1d5c90e94db1") + ); + } + + #[test] + fn test_read_schemas() { + let file = std::fs::File::open("./tests/serde/schema.json").unwrap(); + let schema: Result = serde_json::from_reader(file); + assert!(schema.is_ok()); + + let file = std::fs::File::open("./tests/serde/checkpoint_schema.json").unwrap(); + let schema: Result = serde_json::from_reader(file); + assert!(schema.is_ok()) + } + + #[test] + fn test_get_invariants() { + let schema: StructType = serde_json::from_value(json!({ + "type": "struct", + "fields": [{"name": "x", "type": "string", "nullable": true, "metadata": {}}] + })) + .unwrap(); + let invariants = schema.get_invariants().unwrap(); + assert_eq!(invariants.len(), 0); + + let schema: StructType = serde_json::from_value(json!({ + "type": "struct", + "fields": [ + {"name": "x", "type": "integer", "nullable": true, "metadata": { + "delta.invariants": "{\"expression\": { \"expression\": \"x > 2\"} }" + }}, + {"name": "y", "type": "integer", "nullable": true, "metadata": { + "delta.invariants": "{\"expression\": { \"expression\": \"y < 4\"} }" + }} + ] + })) + .unwrap(); + let invariants = schema.get_invariants().unwrap(); + assert_eq!(invariants.len(), 2); + assert!(invariants.contains(&Invariant::new("x", "x > 2"))); + assert!(invariants.contains(&Invariant::new("y", "y < 4"))); + + let schema: StructType = serde_json::from_value(json!({ + "type": "struct", + "fields": [{ + "name": "a_map", + "type": { + "type": "map", + "keyType": "string", + "valueType": { + "type": "array", + "elementType": { + "type": "struct", + "fields": [{ + "name": "d", + "type": "integer", + "metadata": { + "delta.invariants": "{\"expression\": { \"expression\": \"a_map.value.element.d < 4\"} }" + }, + "nullable": false + }] + }, + "containsNull": false + }, + "valueContainsNull": false + }, + "nullable": false, + "metadata": {} + }] + })).unwrap(); + let invariants = schema.get_invariants().unwrap(); + assert_eq!(invariants.len(), 1); + assert_eq!( + 
invariants[0], + Invariant::new("a_map.value.element.d", "a_map.value.element.d < 4") + ); + } +} diff --git a/crates/deltalake-core/src/lib.rs b/crates/deltalake-core/src/lib.rs index fa7f65963f..d683b906dd 100644 --- a/crates/deltalake-core/src/lib.rs +++ b/crates/deltalake-core/src/lib.rs @@ -84,6 +84,7 @@ compile_error!( pub mod data_catalog; pub mod errors; +pub mod kernel; pub mod operations; pub mod protocol; pub mod schema; @@ -200,12 +201,17 @@ mod tests { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 4); - assert!(tombstones.contains(&crate::protocol::Remove { + assert!(tombstones.contains(&crate::kernel::Remove { path: "part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1564524298213), data_change: false, extended_file_metadata: Some(false), - ..Default::default() + deletion_vector: None, + partition_values: None, + tags: None, + base_row_id: None, + default_row_commit_version: None, + size: None, })); } @@ -302,14 +308,17 @@ mod tests { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 1); - assert!(tombstones.contains(&crate::protocol::Remove { + assert!(tombstones.contains(&crate::kernel::Remove { path: "part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1615043776198), data_change: true, extended_file_metadata: Some(true), partition_values: Some(HashMap::new()), size: Some(445), - ..Default::default() + base_row_id: None, + default_row_commit_version: None, + deletion_vector: None, + tags: None, })); } diff --git a/crates/deltalake-core/src/operations/create.rs b/crates/deltalake-core/src/operations/create.rs index 8a78f2266b..1dc9fdf8b2 100644 --- a/crates/deltalake-core/src/operations/create.rs +++ b/crates/deltalake-core/src/operations/create.rs @@ -10,8 +10,8 @@ use serde_json::{Map, Value}; use super::transaction::commit; use super::{MAX_SUPPORTED_READER_VERSION, MAX_SUPPORTED_WRITER_VERSION}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, DeltaOperation, MetaData, Protocol, SaveMode}; -use crate::schema::{SchemaDataType, SchemaField, SchemaTypeStruct}; +use crate::kernel::{Action, DataType, Metadata, Protocol, StructField, StructType}; +use crate::protocol::{DeltaOperation, SaveMode}; use crate::storage::DeltaObjectStore; use crate::table::builder::ensure_table_uri; use crate::table::config::DeltaConfigKey; @@ -51,7 +51,7 @@ pub struct CreateBuilder { location: Option, mode: SaveMode, comment: Option, - columns: Vec, + columns: Vec, partition_columns: Option>, storage_options: Option>, actions: Vec, @@ -114,23 +114,22 @@ impl CreateBuilder { pub fn with_column( mut self, name: impl Into, - data_type: SchemaDataType, + data_type: DataType, nullable: bool, metadata: Option>, ) -> Self { - self.columns.push(SchemaField::new( - name.into(), - data_type, - nullable, - metadata.unwrap_or_default(), - )); + let mut field = StructField::new(name.into(), data_type, nullable); + if let Some(meta) = metadata { + field = field.with_metadata(meta); + }; + self.columns.push(field); self } /// Specify columns to append to schema pub fn with_columns( mut self, - columns: impl IntoIterator>, + columns: impl IntoIterator>, ) -> Self { self.columns.extend(columns.into_iter().map(|c| c.into())); self @@ -212,7 +211,7 @@ impl CreateBuilder { if self .actions .iter() - .any(|a| matches!(a, Action::metaData(_))) + .any(|a| matches!(a, Action::Metadata(_))) { return 
Err(CreateError::MetadataSpecified.into()); } @@ -242,9 +241,9 @@ impl CreateBuilder { let protocol = self .actions .iter() - .find(|a| matches!(a, Action::protocol(_))) + .find(|a| matches!(a, Action::Protocol(_))) .map(|a| match a { - Action::protocol(p) => p.clone(), + Action::Protocol(p) => p.clone(), _ => unreachable!(), }) .unwrap_or_else(|| Protocol { @@ -258,7 +257,7 @@ impl CreateBuilder { self.name, self.comment, None, - SchemaTypeStruct::new(self.columns), + StructType::new(self.columns), self.partition_columns.unwrap_or_default(), self.configuration, ); @@ -271,13 +270,13 @@ impl CreateBuilder { }; let mut actions = vec![ - Action::protocol(protocol), - Action::metaData(MetaData::try_from(metadata)?), + Action::Protocol(protocol), + Action::Metadata(Metadata::try_from(metadata)?), ]; actions.extend( self.actions .into_iter() - .filter(|a| !matches!(a, Action::protocol(_))), + .filter(|a| !matches!(a, Action::Protocol(_))), ); Ok((table, actions, operation)) @@ -340,7 +339,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -360,7 +359,7 @@ mod tests { .await .unwrap() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -378,7 +377,7 @@ mod tests { ); let table = CreateBuilder::new() .with_location(format!("./{relative_path}")) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -389,7 +388,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -406,8 +405,8 @@ mod tests { }; let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.get_fields().clone()) - .with_actions(vec![Action::protocol(protocol)]) + .with_columns(schema.fields().clone()) + .with_actions(vec![Action::Protocol(protocol)]) .await .unwrap(); assert_eq!(table.get_min_reader_version(), 0); @@ -415,7 +414,7 @@ mod tests { let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_configuration_property(DeltaConfigKey::AppendOnly, Some("true")) .await .unwrap(); @@ -438,7 +437,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location(tmp_dir.path().to_str().unwrap()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -449,7 +448,7 @@ mod tests { // Check an error is raised when a table exists at location let table = CreateBuilder::new() .with_object_store(object_store.clone()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_save_mode(SaveMode::ErrorIfExists) .await; assert!(table.is_err()); @@ -457,7 +456,7 @@ mod tests { // Check current table is returned when ignore option is chosen. 
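(Aside; the create tests continue below.) Every call site in this patch moves from the old lowercase `Action` variants (`Action::add`, `Action::metaData`, ...) to the PascalCase kernel variants. A small illustrative helper for reviewing the mechanical renames, assuming only the `deltalake_core::kernel::Action` path and the variant set visible in this diff:

```rust
use deltalake_core::kernel::Action;

/// Count file-level changes in a commit, ignoring protocol/metadata/commitInfo entries.
fn count_file_actions(actions: &[Action]) -> (usize, usize) {
    let mut adds = 0;
    let mut removes = 0;
    for action in actions {
        match action {
            Action::Add(_) => adds += 1,
            Action::Remove(_) => removes += 1,
            // Protocol, Metadata, Txn and CommitInfo do not reference data files.
            _ => {}
        }
    }
    (adds, removes)
}
```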
let table = CreateBuilder::new() .with_object_store(object_store.clone()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -466,7 +465,7 @@ mod tests { // Check table is overwritten let table = CreateBuilder::new() .with_object_store(object_store.clone()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().iter().cloned()) .with_save_mode(SaveMode::Overwrite) .await .unwrap(); diff --git a/crates/deltalake-core/src/operations/delete.rs b/crates/deltalake-core/src/operations/delete.rs index d387024673..7f8be1f293 100644 --- a/crates/deltalake-core/src/operations/delete.rs +++ b/crates/deltalake-core/src/operations/delete.rs @@ -17,11 +17,10 @@ //! .await?; //! ```` +use std::collections::HashMap; use std::sync::Arc; use std::time::{Instant, SystemTime, UNIX_EPOCH}; -use crate::delta_datafusion::expr::fmt_expr_to_sql; -use crate::protocol::{Action, Add, Remove}; use datafusion::execution::context::{SessionContext, SessionState}; use datafusion::physical_expr::create_physical_expr; use datafusion::physical_plan::filter::FilterExec; @@ -32,11 +31,12 @@ use datafusion_common::DFSchema; use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::Map; use serde_json::Value; +use crate::delta_datafusion::expr::fmt_expr_to_sql; use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; use crate::errors::{DeltaResult, DeltaTableError}; +use crate::kernel::{Action, Add, Remove}; use crate::operations::transaction::commit; use crate::operations::write::write_execution_plan; use crate::protocol::DeltaOperation; @@ -60,7 +60,7 @@ pub struct DeleteBuilder { /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, } #[derive(Default, Debug, Serialize)] @@ -112,7 +112,7 @@ impl DeleteBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -191,7 +191,7 @@ async fn execute( snapshot: &DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + app_metadata: Option>, ) -> DeltaResult<((Vec, i64), DeleteMetrics)> { let exec_start = Instant::now(); let mut metrics = DeleteMetrics::default(); @@ -226,21 +226,23 @@ async fn execute( .unwrap() .as_millis() as i64; - let mut actions: Vec = add.into_iter().map(Action::add).collect(); + let mut actions: Vec = add.into_iter().map(Action::Add).collect(); let mut version = snapshot.version(); metrics.num_removed_files = remove.len(); metrics.num_added_files = actions.len(); for action in remove { - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: action.path, deletion_timestamp: Some(deletion_timestamp), data_change: true, extended_file_metadata: Some(true), partition_values: Some(action.partition_values), size: Some(action.size), - deletion_vector: None, + deletion_vector: action.deletion_vector, tags: None, + base_row_id: action.base_row_id, + default_row_commit_version: action.default_row_commit_version, })) } @@ -334,7 +336,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions.unwrap_or_default()) 
.await .unwrap(); diff --git a/crates/deltalake-core/src/operations/filesystem_check.rs b/crates/deltalake-core/src/operations/filesystem_check.rs index 83af12b57c..65716bbfe1 100644 --- a/crates/deltalake-core/src/operations/filesystem_check.rs +++ b/crates/deltalake-core/src/operations/filesystem_check.rs @@ -26,8 +26,9 @@ use serde::Serialize; use url::{ParseError, Url}; use crate::errors::{DeltaResult, DeltaTableError}; +use crate::kernel::{Action, Add, Remove}; use crate::operations::transaction::commit; -use crate::protocol::{Action, Add, DeltaOperation, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::DeltaObjectStore; use crate::table::state::DeltaTableState; use crate::DeltaTable; @@ -140,7 +141,7 @@ impl FileSystemCheckPlan { let deletion_time = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); let deletion_time = deletion_time.as_millis() as i64; removed_file_paths.push(file.path.clone()); - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: file.path, deletion_timestamp: Some(deletion_time), data_change: true, @@ -149,6 +150,8 @@ impl FileSystemCheckPlan { size: Some(file.size), deletion_vector: None, tags: file.tags, + base_row_id: file.base_row_id, + default_row_commit_version: file.default_row_commit_version, })); } diff --git a/crates/deltalake-core/src/operations/merge.rs b/crates/deltalake-core/src/operations/merge.rs index a51e7649fc..57621cb316 100644 --- a/crates/deltalake-core/src/operations/merge.rs +++ b/crates/deltalake-core/src/operations/merge.rs @@ -61,21 +61,19 @@ use datafusion_physical_expr::{create_physical_expr, expressions, PhysicalExpr}; use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::{Map, Value}; +use serde_json::Value; use super::datafusion_utils::{into_expr, maybe_into_expr, Expression}; use super::transaction::commit; use crate::delta_datafusion::expr::{fmt_expr_to_sql, parse_predicate_expression}; use crate::delta_datafusion::{register_store, DeltaScanBuilder}; +use crate::kernel::{Action, Remove}; use crate::operations::datafusion_utils::MetricObserverExec; -use crate::{ - operations::write::write_execution_plan, - storage::{DeltaObjectStore, ObjectStoreRef}, - DeltaResult, DeltaTable, DeltaTableError, -}; - -use crate::protocol::{Action, DeltaOperation, MergePredicate, Remove}; +use crate::operations::write::write_execution_plan; +use crate::protocol::{DeltaOperation, MergePredicate}; +use crate::storage::{DeltaObjectStore, ObjectStoreRef}; use crate::table::state::DeltaTableState; +use crate::{DeltaResult, DeltaTable, DeltaTableError}; const OPERATION_COLUMN: &str = "__delta_rs_operation"; const DELETE_COLUMN: &str = "__delta_rs_delete"; @@ -115,7 +113,7 @@ pub struct MergeBuilder { /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, /// safe_cast determines how data types that do not match the underlying table are handled /// By default an error is returned safe_cast: bool, @@ -343,7 +341,7 @@ impl MergeBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -567,7 +565,7 @@ async fn execute( snapshot: &DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + app_metadata: Option>, safe_cast: bool, source_alias: 
Option, target_alias: Option, @@ -843,7 +841,7 @@ async fn execute( let mut projection_map = HashMap::new(); let mut f = project_schema_df.fields().clone(); - for delta_field in snapshot.schema().unwrap().get_fields() { + for delta_field in snapshot.schema().unwrap().fields() { let mut when_expr = Vec::with_capacity(operations_size); let mut then_expr = Vec::with_capacity(operations_size); @@ -853,7 +851,7 @@ async fn execute( }), None => TableReference::none(), }; - let name = delta_field.get_name(); + let name = delta_field.name(); let column = Column::new(qualifier.clone(), name); let field = project_schema_df.field_with_name(qualifier.as_ref(), name)?; @@ -882,8 +880,8 @@ async fn execute( state.execution_props(), )?; - projection_map.insert(delta_field.get_name(), expressions.len()); - let name = "__delta_rs_c_".to_owned() + delta_field.get_name(); + projection_map.insert(delta_field.name(), expressions.len()); + let name = "__delta_rs_c_".to_owned() + delta_field.name(); f.push(DFField::new_unqualified( &name, @@ -1143,12 +1141,12 @@ async fn execute( .unwrap() .as_millis() as i64; - let mut actions: Vec = add_actions.into_iter().map(Action::add).collect(); + let mut actions: Vec = add_actions.into_iter().map(Action::Add).collect(); metrics.num_target_files_added = actions.len(); for action in snapshot.files() { metrics.num_target_files_removed += 1; - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: action.path.clone(), deletion_timestamp: Some(deletion_timestamp), data_change: true, @@ -1157,6 +1155,8 @@ async fn execute( deletion_vector: action.deletion_vector.clone(), size: Some(action.size), tags: None, + base_row_id: action.base_row_id, + default_row_commit_version: action.default_row_commit_version, })) } @@ -1270,7 +1270,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); diff --git a/crates/deltalake-core/src/operations/optimize.rs b/crates/deltalake-core/src/operations/optimize.rs index ae9ab6cd65..7feecd1e56 100644 --- a/crates/deltalake-core/src/operations/optimize.rs +++ b/crates/deltalake-core/src/operations/optimize.rs @@ -37,12 +37,12 @@ use parquet::basic::{Compression, ZstdLevel}; use parquet::errors::ParquetError; use parquet::file::properties::WriterProperties; use serde::{Deserialize, Serialize}; -use serde_json::Map; use super::transaction::commit; use super::writer::{PartitionWriter, PartitionWriterConfig}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{self, Action, DeltaOperation}; +use crate::kernel::{Action, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::ObjectStoreRef; use crate::table::state::DeltaTableState; use crate::writer::utils::arrow_schema_without_partitions; @@ -311,7 +311,7 @@ fn create_remove( let deletion_time = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); let deletion_time = deletion_time.as_millis() as i64; - Ok(Action::remove(protocol::Remove { + Ok(Action::Remove(Remove { path: path.to_string(), deletion_timestamp: Some(deletion_time), data_change: false, @@ -320,6 +320,8 @@ fn create_remove( size: Some(size), deletion_vector: None, tags: None, + base_row_id: None, + default_row_commit_version: None, })) } @@ -450,7 +452,7 @@ impl MergePlan { partial_metrics.files_added.max = std::cmp::max(partial_metrics.files_added.max, size); partial_metrics.files_added.min = 
std::cmp::min(partial_metrics.files_added.min, size); - Action::add(add) + Action::Add(add) }); partial_actions.extend(add_actions); @@ -703,7 +705,7 @@ impl MergePlan { last_commit = now; buffered_metrics.preserve_insertion_order = true; - let mut metadata = Map::new(); + let mut metadata = HashMap::new(); metadata.insert("readVersion".to_owned(), self.read_table_version.into()); let maybe_map_metrics = serde_json::to_value(std::mem::replace( &mut buffered_metrics, @@ -791,12 +793,14 @@ pub fn create_merge_plan( let input_parameters = OptimizeInput { target_size }; let file_schema = arrow_schema_without_partitions( - &Arc::new(>::try_from( - &snapshot - .current_metadata() - .ok_or(DeltaTableError::NoMetadata)? - .schema, - )?), + &Arc::new( + >::try_from( + &snapshot + .current_metadata() + .ok_or(DeltaTableError::NoMetadata)? + .schema, + )?, + ), partitions_keys, ); @@ -943,9 +947,9 @@ fn build_zorder_plan( .current_metadata() .unwrap() .schema - .get_fields() + .fields() .iter() - .map(|field| field.get_name().to_string()) + .map(|field| field.name().to_string()) .collect_vec(); let unknown_columns = zorder_columns .iter() diff --git a/crates/deltalake-core/src/operations/restore.rs b/crates/deltalake-core/src/operations/restore.rs index 1f4de3a06c..a356b5b312 100644 --- a/crates/deltalake-core/src/operations/restore.rs +++ b/crates/deltalake-core/src/operations/restore.rs @@ -30,8 +30,9 @@ use object_store::path::Path; use object_store::ObjectStore; use serde::Serialize; +use crate::kernel::{Action, Add, Protocol, Remove}; use crate::operations::transaction::{prepare_commit, try_commit_transaction, TransactionError}; -use crate::protocol::{Action, Add, DeltaOperation, Protocol, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::ObjectStoreRef; use crate::table::state::DeltaTableState; use crate::{DeltaResult, DeltaTable, DeltaTableConfig, DeltaTableError, ObjectStoreError}; @@ -187,6 +188,8 @@ async fn execute( size: Some(a.size), tags: a.tags, deletion_vector: a.deletion_vector, + base_row_id: a.base_row_id, + default_row_commit_version: a.default_row_commit_version, } }) .collect(); @@ -230,9 +233,9 @@ async fn execute( reader_features: snapshot.reader_features().cloned(), } }; - actions.push(Action::protocol(protocol)); - actions.extend(files_to_add.into_iter().map(Action::add)); - actions.extend(files_to_remove.into_iter().map(Action::remove)); + actions.push(Action::Protocol(protocol)); + actions.extend(files_to_add.into_iter().map(Action::Add)); + actions.extend(files_to_remove.into_iter().map(Action::Remove)); let commit = prepare_commit( object_store.as_ref(), diff --git a/crates/deltalake-core/src/operations/transaction/conflict_checker.rs b/crates/deltalake-core/src/operations/transaction/conflict_checker.rs index 6bbc2a9d45..3a0bf0526d 100644 --- a/crates/deltalake-core/src/operations/transaction/conflict_checker.rs +++ b/crates/deltalake-core/src/operations/transaction/conflict_checker.rs @@ -6,7 +6,8 @@ use object_store::ObjectStore; use super::CommitInfo; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, Add, DeltaOperation, MetaData, Protocol, Remove}; +use crate::kernel::{Action, Add, Metadata, Protocol, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::commit_uri_from_version; use crate::table::config::IsolationLevel; use crate::table::state::DeltaTableState; @@ -169,7 +170,7 @@ impl<'a> TransactionInfo<'a> { pub fn metadata_changed(&self) -> bool { self.actions .iter() - .any(|a| matches!(a, 
Action::metaData(_))) + .any(|a| matches!(a, Action::Metadata(_))) } #[cfg(feature = "datafusion")] @@ -236,9 +237,9 @@ impl WinningCommitSummary { let commit_info = actions .iter() - .find(|action| matches!(action, Action::commitInfo(_))) + .find(|action| matches!(action, Action::CommitInfo(_))) .map(|action| match action { - Action::commitInfo(info) => info.clone(), + Action::CommitInfo(info) => info.clone(), _ => unreachable!(), }); @@ -248,12 +249,12 @@ impl WinningCommitSummary { }) } - pub fn metadata_updates(&self) -> Vec { + pub fn metadata_updates(&self) -> Vec { self.actions .iter() .cloned() .filter_map(|action| match action { - Action::metaData(metadata) => Some(metadata), + Action::Metadata(metadata) => Some(metadata), _ => None, }) .collect() @@ -264,7 +265,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::txn(txn) => Some(txn.app_id), + Action::Txn(txn) => Some(txn.app_id), _ => None, }) .collect() @@ -275,7 +276,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::protocol(protocol) => Some(protocol), + Action::Protocol(protocol) => Some(protocol), _ => None, }) .collect() @@ -286,7 +287,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::remove(remove) => Some(remove), + Action::Remove(remove) => Some(remove), _ => None, }) .collect() @@ -297,7 +298,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::add(add) => Some(add), + Action::Add(add) => Some(add), _ => None, }) .collect() @@ -414,7 +415,7 @@ impl<'a> ConflictChecker<'a> { .txn_info .actions .iter() - .any(|a| matches!(a, Action::protocol(_))) + .any(|a| matches!(a, Action::Protocol(_))) { return Err(CommitConflictError::ProtocolChanged( "protocol changed".into(), @@ -546,7 +547,7 @@ impl<'a> ConflictChecker<'a> { .iter() .cloned() .filter_map(|action| match action { - Action::remove(remove) => Some(remove.path), + Action::Remove(remove) => Some(remove.path), _ => None, }) .collect(); @@ -620,8 +621,8 @@ pub(super) fn can_downgrade_to_snapshot_isolation<'a>( let mut has_non_file_actions = false; for action in actions { match action { - Action::add(act) if act.data_change => data_changed = true, - Action::remove(rem) if rem.data_change => data_changed = true, + Action::Add(act) if act.data_change => data_changed = true, + Action::Remove(rem) if rem.data_change => data_changed = true, _ => has_non_file_actions = true, } } @@ -644,7 +645,7 @@ mod tests { use super::super::test_utils as tu; use super::super::test_utils::init_table_actions; use super::*; - use crate::protocol::Action; + use crate::kernel::Action; #[cfg(feature = "datafusion")] use datafusion_expr::{col, lit}; use serde_json::json; diff --git a/crates/deltalake-core/src/operations/transaction/mod.rs b/crates/deltalake-core/src/operations/transaction/mod.rs index 738ae404ec..c31c349fd7 100644 --- a/crates/deltalake-core/src/operations/transaction/mod.rs +++ b/crates/deltalake-core/src/operations/transaction/mod.rs @@ -1,13 +1,16 @@ //! 
Delta transactions +use std::collections::HashMap; + use chrono::Utc; use conflict_checker::ConflictChecker; use object_store::path::Path; use object_store::{Error as ObjectStoreError, ObjectStore}; -use serde_json::{Map, Value}; +use serde_json::Value; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, CommitInfo, DeltaOperation}; +use crate::kernel::{Action, CommitInfo}; +use crate::protocol::DeltaOperation; use crate::storage::commit_uri_from_version; use crate::table::state::DeltaTableState; @@ -79,7 +82,7 @@ fn log_entry_from_actions<'a>( let mut jsons = Vec::::new(); for action in actions { if append_only { - if let Action::remove(remove) = action { + if let Action::Remove(remove) = action { if remove.data_change { return Err(TransactionError::DeltaTableAppendOnly); } @@ -96,24 +99,24 @@ pub(crate) fn get_commit_bytes( operation: &DeltaOperation, actions: &Vec, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, ) -> Result { - if !actions.iter().any(|a| matches!(a, Action::commitInfo(..))) { - let mut extra_info = Map::::new(); + if !actions.iter().any(|a| matches!(a, Action::CommitInfo(..))) { + let mut extra_info = HashMap::::new(); let mut commit_info = operation.get_commit_info(); commit_info.timestamp = Some(Utc::now().timestamp_millis()); extra_info.insert( "clientVersion".to_string(), Value::String(format!("delta-rs.{}", crate_version())), ); - if let Some(mut meta) = app_metadata { - extra_info.append(&mut meta) + if let Some(meta) = app_metadata { + extra_info.extend(meta) } commit_info.info = extra_info; Ok(bytes::Bytes::from(log_entry_from_actions( actions .iter() - .chain(std::iter::once(&Action::commitInfo(commit_info))), + .chain(std::iter::once(&Action::CommitInfo(commit_info))), read_snapshot, )?)) } else { @@ -132,7 +135,7 @@ pub(crate) async fn prepare_commit<'a>( operation: &DeltaOperation, actions: &Vec, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, ) -> Result { // Serialize all actions that are part of this log entry. 
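(Aside; `prepare_commit` continues below.) Because commit `app_metadata` is now a `HashMap<String, Value>` instead of a `serde_json::Map`, user metadata is merged into the commitInfo with `extend` rather than `Map::append`. The snippet below is a standalone restatement of that merge, not the module's actual `get_commit_bytes`; the version string, the `jobId` key and the `nightly-42` value are placeholders.

```rust
use std::collections::HashMap;

use serde_json::{json, Value};

fn build_extra_info(app_metadata: Option<HashMap<String, Value>>) -> HashMap<String, Value> {
    let mut extra_info: HashMap<String, Value> = HashMap::new();
    extra_info.insert(
        "clientVersion".to_string(),
        Value::String("delta-rs.0.0.0".to_string()), // placeholder version string
    );
    // With HashMap, user-supplied metadata is merged via `extend`.
    if let Some(meta) = app_metadata {
        extra_info.extend(meta);
    }
    extra_info
}

fn main() {
    let user_meta = HashMap::from([("jobId".to_string(), json!("nightly-42"))]);
    let info = build_extra_info(Some(user_meta));
    assert_eq!(info["jobId"], json!("nightly-42"));
    assert!(info.contains_key("clientVersion"));
}
```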
let log_entry = get_commit_bytes(operation, actions, read_snapshot, app_metadata)?; @@ -180,7 +183,7 @@ pub async fn commit( actions: &Vec, operation: DeltaOperation, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, ) -> DeltaResult { commit_with_retries(storage, actions, operation, read_snapshot, app_metadata, 15).await } @@ -194,7 +197,7 @@ pub async fn commit_with_retries( actions: &Vec, operation: DeltaOperation, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, max_retries: usize, ) -> DeltaResult { let tmp_commit = diff --git a/crates/deltalake-core/src/operations/transaction/state.rs b/crates/deltalake-core/src/operations/transaction/state.rs index bb9c3ff35e..a209b7369d 100644 --- a/crates/deltalake-core/src/operations/transaction/state.rs +++ b/crates/deltalake-core/src/operations/transaction/state.rs @@ -20,7 +20,7 @@ use crate::delta_datafusion::{ get_null_of_arrow_type, logical_expr_to_physical_expr, to_correct_scalar_value, }; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::Add; +use crate::kernel::Add; use crate::table::state::DeltaTableState; impl DeltaTableState { @@ -33,15 +33,15 @@ impl DeltaTableState { let meta = self.current_metadata().ok_or(DeltaTableError::NoMetadata)?; let fields = meta .schema - .get_fields() + .fields() .iter() - .filter(|f| !meta.partition_columns.contains(&f.get_name().to_string())) + .filter(|f| !meta.partition_columns.contains(&f.name().to_string())) .map(|f| f.try_into()) .chain( meta.schema - .get_fields() + .fields() .iter() - .filter(|f| meta.partition_columns.contains(&f.get_name().to_string())) + .filter(|f| meta.partition_columns.contains(&f.name().to_string())) .map(|f| { let field = ArrowField::try_from(f)?; let corrected = if wrap_partitions { diff --git a/crates/deltalake-core/src/operations/transaction/test_utils.rs b/crates/deltalake-core/src/operations/transaction/test_utils.rs index e4ae14b2ed..b52b1a1c7b 100644 --- a/crates/deltalake-core/src/operations/transaction/test_utils.rs +++ b/crates/deltalake-core/src/operations/transaction/test_utils.rs @@ -1,31 +1,49 @@ #![allow(unused)] use std::collections::HashMap; -use super::{prepare_commit, try_commit_transaction, CommitInfo}; -use crate::protocol::{Action, Add, DeltaOperation, MetaData, Protocol, Remove, SaveMode}; +use super::{prepare_commit, try_commit_transaction}; +use crate::kernel::{ + Action, Add, CommitInfo, DataType, Metadata, PrimitiveType, Protocol, Remove, StructField, + StructType, +}; +use crate::protocol::{DeltaOperation, SaveMode}; use crate::table::state::DeltaTableState; use crate::table::DeltaTableMetaData; -use crate::{DeltaTable, DeltaTableBuilder, Schema, SchemaDataType, SchemaField}; +use crate::{DeltaTable, DeltaTableBuilder}; pub fn create_add_action( path: impl Into, data_change: bool, stats: Option, ) -> Action { - Action::add(Add { + Action::Add(Add { path: path.into(), size: 100, data_change, stats, - ..Default::default() + modification_time: -1, + partition_values: Default::default(), + partition_values_parsed: None, + stats_parsed: None, + base_row_id: None, + default_row_commit_version: None, + tags: None, + deletion_vector: None, }) } pub fn create_remove_action(path: impl Into, data_change: bool) -> Action { - Action::remove(Remove { + Action::Remove(Remove { path: path.into(), data_change, - ..Default::default() + size: None, + deletion_timestamp: None, + deletion_vector: None, + partition_values: Default::default(), + extended_file_metadata: None, + 
base_row_id: None, + default_row_commit_version: None, + tags: None, }) } @@ -36,31 +54,28 @@ pub fn create_protocol_action(max_reader: Option, max_writer: Option) writer_features: None, reader_features: None, }; - Action::protocol(protocol) + Action::Protocol(protocol) } pub fn create_metadata_action( parttiton_columns: Option>, configuration: Option>>, ) -> Action { - let table_schema = Schema::new(vec![ - SchemaField::new( + let table_schema = StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]); let metadata = DeltaTableMetaData::new( @@ -71,7 +86,7 @@ pub fn create_metadata_action( parttiton_columns.unwrap_or_default(), configuration.unwrap_or_default(), ); - Action::metaData(MetaData::try_from(metadata).unwrap()) + Action::Metadata(Metadata::try_from(metadata).unwrap()) } pub fn init_table_actions(configuration: Option>>) -> Vec { @@ -96,7 +111,7 @@ pub fn init_table_actions(configuration: Option>> let commit_info = serde_json::from_str::(raw).unwrap(); vec![ - Action::commitInfo(commit_info), + Action::CommitInfo(commit_info), create_protocol_action(None, None), create_metadata_action(None, configuration), ] @@ -109,24 +124,21 @@ pub async fn create_initialized_table( let storage = DeltaTableBuilder::from_uri("memory://") .build_storage() .unwrap(); - let table_schema = Schema::new(vec![ - SchemaField::new( + let table_schema = StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]); let state = DeltaTableState::from_actions(init_table_actions(None), 0).unwrap(); diff --git a/crates/deltalake-core/src/operations/update.rs b/crates/deltalake-core/src/operations/update.rs index 8b7ec915f3..1723d287a2 100644 --- a/crates/deltalake-core/src/operations/update.rs +++ b/crates/deltalake-core/src/operations/update.rs @@ -41,21 +41,18 @@ use datafusion_physical_expr::{ use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::{Map, Value}; - -use crate::{ - delta_datafusion::{expr::fmt_expr_to_sql, find_files, register_store, DeltaScanBuilder}, - protocol::{Action, DeltaOperation, Remove}, - storage::{DeltaObjectStore, ObjectStoreRef}, - table::state::DeltaTableState, - DeltaResult, DeltaTable, DeltaTableError, -}; - -use super::{ - datafusion_utils::{Expression, MetricObserverExec}, - transaction::commit, - write::write_execution_plan, -}; +use serde_json::Value; + +use super::datafusion_utils::{Expression, MetricObserverExec}; +use super::transaction::commit; +use super::write::write_execution_plan; +use 
crate::delta_datafusion::expr::fmt_expr_to_sql; +use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; +use crate::kernel::{Action, Remove}; +use crate::protocol::DeltaOperation; +use crate::storage::{DeltaObjectStore, ObjectStoreRef}; +use crate::table::state::DeltaTableState; +use crate::{DeltaResult, DeltaTable, DeltaTableError}; /// Updates records in the Delta Table. /// See this module's documentation for more information @@ -73,7 +70,7 @@ pub struct UpdateBuilder { /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, /// safe_cast determines how data types that do not match the underlying table are handled /// By default an error is returned safe_cast: bool, @@ -138,7 +135,7 @@ impl UpdateBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -171,7 +168,7 @@ async fn execute( snapshot: &DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + app_metadata: Option>, safe_cast: bool, ) -> DeltaResult<((Vec, i64), UpdateMetrics)> { // Validate the predicate and update expressions. @@ -384,13 +381,13 @@ async fn execute( .duration_since(UNIX_EPOCH) .unwrap() .as_millis() as i64; - let mut actions: Vec = add_actions.into_iter().map(Action::add).collect(); + let mut actions: Vec = add_actions.into_iter().map(Action::Add).collect(); metrics.num_added_files = actions.len(); metrics.num_removed_files = candidates.candidates.len(); for action in candidates.candidates { - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: action.path, deletion_timestamp: Some(deletion_timestamp), data_change: true, @@ -399,6 +396,8 @@ async fn execute( size: Some(action.size), deletion_vector: action.deletion_vector, tags: None, + base_row_id: None, + default_row_commit_version: None, })) } @@ -480,7 +479,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); diff --git a/crates/deltalake-core/src/operations/vacuum.rs b/crates/deltalake-core/src/operations/vacuum.rs index 684e6f6d0a..47f7c1d5c9 100644 --- a/crates/deltalake-core/src/operations/vacuum.rs +++ b/crates/deltalake-core/src/operations/vacuum.rs @@ -21,7 +21,7 @@ //! let (table, metrics) = VacuumBuilder::new(table.object_store(). table.state).await?; //! 
```` -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -31,12 +31,13 @@ use futures::{StreamExt, TryStreamExt}; use object_store::Error; use object_store::{path::Path, ObjectStore}; use serde::Serialize; -use serde_json::{Map, Value}; +use serde_json::Value; use super::transaction::commit; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, DeltaOperation}; // Txn CommitInfo +use crate::kernel::Action; +use crate::protocol::DeltaOperation; use crate::storage::DeltaObjectStore; use crate::table::state::DeltaTableState; use crate::DeltaTable; @@ -288,7 +289,7 @@ impl VacuumPlan { // Begin VACUUM START COMMIT let mut commit_info = start_operation.get_commit_info(); - let mut extra_info = Map::::new(); + let mut extra_info = HashMap::::new(); commit_info.timestamp = Some(Utc::now().timestamp_millis()); extra_info.insert( @@ -300,7 +301,7 @@ impl VacuumPlan { } commit_info.info = extra_info; - let start_actions = vec![Action::commitInfo(commit_info)]; + let start_actions = vec![Action::CommitInfo(commit_info)]; commit(store, &start_actions, start_operation, snapshot, None).await?; // Finish VACUUM START COMMIT @@ -327,7 +328,7 @@ impl VacuumPlan { // Begin VACUUM END COMMIT let mut commit_info = end_operation.get_commit_info(); - let mut extra_info = Map::::new(); + let mut extra_info = HashMap::::new(); commit_info.timestamp = Some(Utc::now().timestamp_millis()); extra_info.insert( @@ -339,7 +340,7 @@ impl VacuumPlan { } commit_info.info = extra_info; - let end_actions = vec![Action::commitInfo(commit_info)]; + let end_actions = vec![Action::CommitInfo(commit_info)]; commit(store, &end_actions, end_operation, snapshot, None).await?; // Finish VACUUM END COMMIT diff --git a/crates/deltalake-core/src/operations/write.rs b/crates/deltalake-core/src/operations/write.rs index 31723cc235..45bdaaeff5 100644 --- a/crates/deltalake-core/src/operations/write.rs +++ b/crates/deltalake-core/src/operations/write.rs @@ -37,15 +37,14 @@ use datafusion::physical_plan::{memory::MemoryExec, ExecutionPlan}; use futures::future::BoxFuture; use futures::StreamExt; use parquet::file::properties::WriterProperties; -use serde_json::Map; use super::writer::{DeltaWriter, WriterConfig}; use super::MAX_SUPPORTED_WRITER_VERSION; use super::{transaction::commit, CreateBuilder}; use crate::delta_datafusion::DeltaDataChecker; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, Add, DeltaOperation, Remove, SaveMode}; -use crate::schema::Schema; +use crate::kernel::{Action, Add, Remove, StructType}; +use crate::protocol::{DeltaOperation, SaveMode}; use crate::storage::{DeltaObjectStore, ObjectStoreRef}; use crate::table::state::DeltaTableState; use crate::writer::record_batch::divide_by_partition_values; @@ -113,7 +112,7 @@ pub struct WriteBuilder { /// Parquet writer properties writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, } impl WriteBuilder { @@ -206,7 +205,7 @@ impl WriteBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -226,7 +225,7 @@ impl WriteBuilder { } } false => { - let schema: Schema = if let Some(plan) = &self.input { + let schema: StructType = if let Some(plan) = &self.input { Ok(plan.schema().try_into()?) 
} else if let Some(batches) = &self.batches { if batches.is_empty() { @@ -238,7 +237,7 @@ impl WriteBuilder { }?; let mut builder = CreateBuilder::new() .with_object_store(self.store.clone()) - .with_columns(schema.get_fields().clone()); + .with_columns(schema.fields().clone()); if let Some(partition_columns) = self.partition_columns.as_ref() { builder = builder.with_partition_columns(partition_columns.clone()) } @@ -426,7 +425,7 @@ impl std::future::IntoFuture for WriteBuilder { this.safe_cast, ) .await?; - actions.extend(add_actions.into_iter().map(Action::add)); + actions.extend(add_actions.into_iter().map(Action::Add)); // Collect remove actions if we are overwriting the table if matches!(this.mode, SaveMode::Overwrite) { @@ -437,7 +436,7 @@ impl std::future::IntoFuture for WriteBuilder { .as_millis() as i64; let to_remove_action = |add: &Add| { - Action::remove(Remove { + Action::Remove(Remove { path: add.path.clone(), deletion_timestamp: Some(deletion_timestamp), data_change: true, @@ -447,6 +446,8 @@ impl std::future::IntoFuture for WriteBuilder { // TODO add file metadata to remove action (tags missing) tags: None, deletion_vector: add.deletion_vector.clone(), + base_row_id: add.base_row_id, + default_row_commit_version: add.default_row_commit_version, }) }; @@ -599,14 +600,14 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); assert_eq!(table.state.commit_infos().len(), 1); // write some data - let metadata = Map::from_iter(vec![("k1".to_string(), json!("v1.1"))]); + let metadata = HashMap::from_iter(vec![("k1".to_string(), json!("v1.1"))]); let mut table = DeltaOps(table) .write(vec![batch.clone()]) .with_save_mode(SaveMode::Append) @@ -623,12 +624,13 @@ mod tests { .clone() .into_iter() .filter(|(k, _)| k != "clientVersion") - .collect::>(), + .collect::>(), metadata ); // append some data - let metadata: Map = Map::from_iter(vec![("k1".to_string(), json!("v1.2"))]); + let metadata: HashMap = + HashMap::from_iter(vec![("k1".to_string(), json!("v1.2"))]); let mut table = DeltaOps(table) .write(vec![batch.clone()]) .with_save_mode(SaveMode::Append) @@ -645,12 +647,13 @@ mod tests { .clone() .into_iter() .filter(|(k, _)| k != "clientVersion") - .collect::>(), + .collect::>(), metadata ); // overwrite table - let metadata: Map = Map::from_iter(vec![("k2".to_string(), json!("v2.1"))]); + let metadata: HashMap = + HashMap::from_iter(vec![("k2".to_string(), json!("v2.1"))]); let mut table = DeltaOps(table) .write(vec![batch]) .with_save_mode(SaveMode::Overwrite) @@ -667,7 +670,7 @@ mod tests { .clone() .into_iter() .filter(|(k, _)| k != "clientVersion") - .collect::>(), + .collect::>(), metadata ); } @@ -807,7 +810,7 @@ mod tests { #[tokio::test] async fn test_check_invariants() { let batch = get_record_batch(None, false); - let schema: Schema = serde_json::from_value(json!({ + let schema: StructType = serde_json::from_value(json!({ "type": "struct", "fields": [ {"name": "id", "type": "string", "nullable": true, "metadata": {}}, @@ -821,7 +824,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -829,7 +832,7 @@ mod tests { let table = DeltaOps(table).write(vec![batch.clone()]).await.unwrap(); assert_eq!(table.version(), 1); - let schema: 
Schema = serde_json::from_value(json!({ + let schema: StructType = serde_json::from_value(json!({ "type": "struct", "fields": [ {"name": "id", "type": "string", "nullable": true, "metadata": {}}, @@ -843,7 +846,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -859,7 +862,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); diff --git a/crates/deltalake-core/src/operations/writer.rs b/crates/deltalake-core/src/operations/writer.rs index 05bda44ae6..0bba167e33 100644 --- a/crates/deltalake-core/src/operations/writer.rs +++ b/crates/deltalake-core/src/operations/writer.rs @@ -13,7 +13,7 @@ use parquet::file::properties::WriterProperties; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::Add; +use crate::kernel::Add; use crate::storage::ObjectStoreRef; use crate::writer::record_batch::{divide_by_partition_values, PartitionResult}; use crate::writer::stats::create_add; diff --git a/crates/deltalake-core/src/protocol/checkpoints.rs b/crates/deltalake-core/src/protocol/checkpoints.rs index 5667b4e1b2..fc23c1d28b 100644 --- a/crates/deltalake-core/src/protocol/checkpoints.rs +++ b/crates/deltalake-core/src/protocol/checkpoints.rs @@ -4,9 +4,8 @@ use std::collections::HashMap; use std::convert::TryFrom; use std::iter::Iterator; -use arrow::datatypes::Schema as ArrowSchema; -use arrow::error::ArrowError; use arrow::json::ReaderBuilder; +use arrow_schema::{ArrowError, Schema as ArrowSchema}; use chrono::{Datelike, Utc}; use futures::{StreamExt, TryStreamExt}; @@ -18,9 +17,12 @@ use parquet::errors::ParquetError; use regex::Regex; use serde_json::Value; -use super::{time_utils, Action, Add as AddAction, MetaData, Protocol, ProtocolError, Txn}; -use crate::arrow_convert::delta_log_schema_for_table; -use crate::schema::*; +use super::{time_utils, ProtocolError}; +use crate::kernel::actions::arrow::delta_log_schema_for_table; +use crate::kernel::{ + Action, Add as AddAction, DataType, Metadata, PrimitiveType, Protocol, StructField, StructType, + Txn, +}; use crate::storage::DeltaObjectStore; use crate::table::state::DeltaTableState; use crate::table::{CheckPoint, CheckPointBuilder}; @@ -201,8 +203,11 @@ fn parquet_bytes_from_state( let partition_col_data_types = current_metadata.get_partition_col_data_types(); // Collect a map of paths that require special stats conversion. 
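(Aside; the checkpoint writer continues below.) The checkpoint code relies on converting a kernel `StructType` into an Arrow schema before building `delta_log_schema_for_table`. A hedged sketch of that conversion in isolation, assuming the `TryFrom<&StructType>` impl this patch wires up in the arrow conversion module and the `arrow_schema` crate import used in this file:

```rust
use arrow_schema::Schema as ArrowSchema;
use deltalake_core::kernel::{DataType, PrimitiveType, StructField, StructType};

fn to_arrow(schema: &StructType) -> Result<ArrowSchema, Box<dyn std::error::Error>> {
    // Uses the TryFrom<&StructType> for ArrowSchema conversion touched by this patch.
    let arrow: ArrowSchema = schema.try_into()?;
    Ok(arrow)
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = StructType::new(vec![StructField::new(
        "ts".to_string(),
        // Delta "timestamp" is microsecond precision, adjusted to UTC.
        DataType::Primitive(PrimitiveType::Timestamp),
        true,
    )]);
    println!("{:?}", to_arrow(&schema)?);
    Ok(())
}
```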
- let mut stats_conversions: Vec<(SchemaPath, SchemaDataType)> = Vec::new(); - collect_stats_conversions(&mut stats_conversions, current_metadata.schema.get_fields()); + let mut stats_conversions: Vec<(SchemaPath, DataType)> = Vec::new(); + collect_stats_conversions( + &mut stats_conversions, + current_metadata.schema.fields().as_slice(), + ); let mut tombstones = state.unexpired_tombstones().cloned().collect::>(); @@ -226,14 +231,14 @@ fn parquet_bytes_from_state( } // protocol - let jsons = std::iter::once(Action::protocol(Protocol { + let jsons = std::iter::once(Action::Protocol(Protocol { min_reader_version: state.min_reader_version(), min_writer_version: state.min_writer_version(), writer_features: None, reader_features: None, })) // metaData - .chain(std::iter::once(Action::metaData(MetaData::try_from( + .chain(std::iter::once(Action::Metadata(Metadata::try_from( current_metadata.clone(), )?))) // txns @@ -242,7 +247,7 @@ fn parquet_bytes_from_state( .app_transaction_version() .iter() .map(|(app_id, version)| { - Action::txn(Txn { + Action::Txn(Txn { app_id: app_id.clone(), version: *version, last_updated: None, @@ -259,7 +264,7 @@ fn parquet_bytes_from_state( r.extended_file_metadata = Some(false); } - Action::remove(r) + Action::Remove(r) })) .map(|a| serde_json::to_value(a).map_err(ProtocolError::from)) // adds @@ -269,7 +274,7 @@ fn parquet_bytes_from_state( // Create the arrow schema that represents the Checkpoint parquet file. let arrow_schema = delta_log_schema_for_table( - >::try_from(¤t_metadata.schema)?, + >::try_from(¤t_metadata.schema)?, current_metadata.partition_columns.as_slice(), use_extended_remove_schema, ); @@ -299,10 +304,10 @@ fn parquet_bytes_from_state( fn checkpoint_add_from_state( add: &AddAction, - partition_col_data_types: &[(&str, &SchemaDataType)], - stats_conversions: &[(SchemaPath, SchemaDataType)], + partition_col_data_types: &[(&String, &DataType)], + stats_conversions: &[(SchemaPath, DataType)], ) -> Result { - let mut v = serde_json::to_value(Action::add(add.clone())) + let mut v = serde_json::to_value(Action::Add(add.clone())) .map_err(|err| ArrowError::JsonError(err.to_string()))?; v["add"]["dataChange"] = Value::Bool(false); @@ -348,24 +353,27 @@ fn checkpoint_add_from_state( fn typed_partition_value_from_string( string_value: &str, - data_type: &SchemaDataType, + data_type: &DataType, ) -> Result { match data_type { - SchemaDataType::primitive(primitive_type) => match primitive_type.as_str() { - "string" | "binary" => Ok(string_value.to_owned().into()), - "long" | "integer" | "short" | "byte" => Ok(string_value + DataType::Primitive(primitive_type) => match primitive_type { + PrimitiveType::String | PrimitiveType::Binary => Ok(string_value.to_owned().into()), + PrimitiveType::Long + | PrimitiveType::Integer + | PrimitiveType::Short + | PrimitiveType::Byte => Ok(string_value .parse::() .map_err(|_| CheckpointError::PartitionValueNotParseable(string_value.to_owned()))? .into()), - "boolean" => Ok(string_value + PrimitiveType::Boolean => Ok(string_value .parse::() .map_err(|_| CheckpointError::PartitionValueNotParseable(string_value.to_owned()))? .into()), - "float" | "double" => Ok(string_value + PrimitiveType::Float | PrimitiveType::Double => Ok(string_value .parse::() .map_err(|_| CheckpointError::PartitionValueNotParseable(string_value.to_owned()))? 
.into()), - "date" => { + PrimitiveType::Date => { let d = chrono::naive::NaiveDate::parse_from_str(string_value, "%Y-%m-%d") .map_err(|_| { CheckpointError::PartitionValueNotParseable(string_value.to_owned()) @@ -373,7 +381,7 @@ fn typed_partition_value_from_string( // day 0 is 1970-01-01 (719163 days from ce) Ok((d.num_days_from_ce() - 719_163).into()) } - "timestamp" => { + PrimitiveType::Timestamp => { let ts = chrono::naive::NaiveDateTime::parse_from_str(string_value, "%Y-%m-%d %H:%M:%S") .map_err(|_| { @@ -395,7 +403,7 @@ fn typed_partition_value_from_string( fn typed_partition_value_from_option_string( string_value: &Option, - data_type: &SchemaDataType, + data_type: &DataType, ) -> Result { match string_value { Some(s) => { @@ -409,10 +417,7 @@ fn typed_partition_value_from_option_string( } } -fn collect_stats_conversions( - paths: &mut Vec<(SchemaPath, SchemaDataType)>, - fields: &[SchemaField], -) { +fn collect_stats_conversions(paths: &mut Vec<(SchemaPath, DataType)>, fields: &[StructField]) { let mut _path = SchemaPath::new(); fields .iter() @@ -421,20 +426,18 @@ fn collect_stats_conversions( fn collect_field_conversion( current_path: &mut SchemaPath, - all_paths: &mut Vec<(SchemaPath, SchemaDataType)>, - field: &SchemaField, + all_paths: &mut Vec<(SchemaPath, DataType)>, + field: &StructField, ) { - match field.get_type() { - SchemaDataType::primitive(type_name) => { - if let "timestamp" = type_name.as_str() { - let mut key_path = current_path.clone(); - key_path.push(field.get_name().to_owned()); - all_paths.push((key_path, field.get_type().to_owned())); - } + match field.data_type() { + DataType::Primitive(PrimitiveType::Timestamp) => { + let mut key_path = current_path.clone(); + key_path.push(field.name().to_owned()); + all_paths.push((key_path, field.data_type().to_owned())); } - SchemaDataType::r#struct(struct_field) => { - let struct_fields = struct_field.get_fields(); - current_path.push(field.get_name().to_owned()); + DataType::Struct(struct_field) => { + let struct_fields = struct_field.fields(); + current_path.push(field.name().to_owned()); struct_fields .iter() .for_each(|f| collect_field_conversion(current_path, all_paths, f)); @@ -447,11 +450,11 @@ fn collect_field_conversion( fn apply_stats_conversion( context: &mut serde_json::Map, path: &[String], - data_type: &SchemaDataType, + data_type: &DataType, ) { if path.len() == 1 { match data_type { - SchemaDataType::primitive(type_name) if type_name == "timestamp" => { + DataType::Primitive(PrimitiveType::Timestamp) => { let v = context.get_mut(&path[0]); if let Some(v) = v { @@ -488,7 +491,7 @@ mod tests { string_value, typed_partition_value_from_option_string( &Some("Hello World!".to_string()), - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ) .unwrap() ); @@ -498,7 +501,7 @@ mod tests { bool_value, typed_partition_value_from_option_string( &Some("true".to_string()), - &SchemaDataType::primitive("boolean".to_string()), + &DataType::Primitive(PrimitiveType::Boolean), ) .unwrap() ); @@ -508,7 +511,7 @@ mod tests { number_value, typed_partition_value_from_option_string( &Some("42".to_string()), - &SchemaDataType::primitive("integer".to_string()), + &DataType::Primitive(PrimitiveType::Integer), ) .unwrap() ); @@ -525,7 +528,7 @@ mod tests { date_value, typed_partition_value_from_option_string( &Some(s.to_string()), - &SchemaDataType::primitive("date".to_string()), + &DataType::Primitive(PrimitiveType::Date), ) .unwrap() ); @@ -543,7 +546,7 @@ mod tests { 
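The date and timestamp arms above keep the existing encoding: a `date` partition value becomes the number of days since 1970-01-01, hence the `719_163` days-from-CE offset. An illustrative stand-alone helper (not part of the patch) showing the same arithmetic:

```rust
use chrono::Datelike;

// Illustrative helper: convert a "date" partition string to days since the epoch,
// mirroring the `num_days_from_ce() - 719_163` arithmetic used above.
fn date_partition_value(s: &str) -> Option<i64> {
    let d = chrono::NaiveDate::parse_from_str(s, "%Y-%m-%d").ok()?;
    Some((d.num_days_from_ce() - 719_163) as i64)
}

fn main() {
    assert_eq!(date_partition_value("1970-01-01"), Some(0));
    assert_eq!(date_partition_value("2017-12-31"), Some(17_531));
}
```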
timestamp_value, typed_partition_value_from_option_string( &Some(s.to_string()), - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ) .unwrap() ); @@ -554,7 +557,7 @@ mod tests { binary_value, typed_partition_value_from_option_string( &Some("₁₂₃₄".to_string()), - &SchemaDataType::primitive("binary".to_string()), + &DataType::Primitive(PrimitiveType::Binary), ) .unwrap() ); @@ -566,7 +569,7 @@ mod tests { Value::Null, typed_partition_value_from_option_string( &None, - &SchemaDataType::primitive("integer".to_string()), + &DataType::Primitive(PrimitiveType::Integer), ) .unwrap() ); @@ -576,7 +579,7 @@ mod tests { Value::Null, typed_partition_value_from_option_string( &Some("".to_string()), - &SchemaDataType::primitive("integer".to_string()), + &DataType::Primitive(PrimitiveType::Integer), ) .unwrap() ); @@ -584,8 +587,8 @@ mod tests { #[test] fn collect_stats_conversions_test() { - let delta_schema: Schema = serde_json::from_value(SCHEMA.clone()).unwrap(); - let fields = delta_schema.get_fields(); + let delta_schema: StructType = serde_json::from_value(SCHEMA.clone()).unwrap(); + let fields = delta_schema.fields(); let mut paths = Vec::new(); collect_stats_conversions(&mut paths, fields.as_slice()); @@ -594,14 +597,14 @@ mod tests { assert_eq!( ( vec!["some_struct".to_string(), "struct_timestamp".to_string()], - SchemaDataType::primitive("timestamp".to_string()) + DataType::Primitive(PrimitiveType::Timestamp) ), paths[0] ); assert_eq!( ( vec!["some_timestamp".to_string()], - SchemaDataType::primitive("timestamp".to_string()) + DataType::Primitive(PrimitiveType::Timestamp) ), paths[1] ); @@ -616,22 +619,22 @@ mod tests { apply_stats_conversion( min_values, &["some_struct".to_string(), "struct_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( min_values, &["some_struct".to_string(), "struct_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); apply_stats_conversion( min_values, &["some_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( min_values, &["some_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); let max_values = stats.get_mut("maxValues").unwrap().as_object_mut().unwrap(); @@ -639,22 +642,22 @@ mod tests { apply_stats_conversion( max_values, &["some_struct".to_string(), "struct_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( max_values, &["some_struct".to_string(), "struct_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); apply_stats_conversion( max_values, &["some_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( max_values, &["some_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); // minValues diff --git a/crates/deltalake-core/src/protocol/mod.rs b/crates/deltalake-core/src/protocol/mod.rs index 66f06b13a1..47e24cd959 100644 --- a/crates/deltalake-core/src/protocol/mod.rs +++ b/crates/deltalake-core/src/protocol/mod.rs @@ 
-8,29 +8,27 @@ pub mod checkpoints; pub mod parquet2_read; #[cfg(feature = "parquet")] mod parquet_read; -mod serde_path; mod time_utils; #[cfg(feature = "arrow")] use arrow_schema::ArrowError; use futures::StreamExt; use lazy_static::lazy_static; -use log::*; +use log::debug; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; use regex::Regex; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::Value; use std::borrow::Borrow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::mem::take; -use std::str::FromStr; use crate::errors::DeltaResult; +use crate::kernel::{Add, CommitInfo, Metadata, Protocol, Remove}; use crate::storage::ObjectStoreRef; -use crate::table::config::IsolationLevel; +use crate::table::CheckPoint; use crate::table::DeltaTableMetaData; -use crate::{schema::*, table::CheckPoint}; /// Error returned when an invalid Delta log action is encountered. #[allow(missing_docs)] @@ -105,6 +103,12 @@ pub enum ProtocolError { #[from] source: std::io::Error, }, + + #[error("Kernel: {source}")] + Kernel { + #[from] + source: crate::kernel::Error, + }, } /// Struct used to represent minValues and maxValues in add action statistics. @@ -244,170 +248,6 @@ pub struct StatsParsed { pub null_count: HashMap, } -/// Delta AddCDCFile action that describes a parquet CDC data file. -#[derive(Serialize, Deserialize, Clone, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct AddCDCFile { - /// A relative path, from the root of the table, or an - /// absolute path to a CDC file - #[serde(with = "serde_path")] - pub path: String, - /// The size of this file in bytes - pub size: i64, - /// A map from partition column to value for this file - pub partition_values: HashMap>, - /// Should always be set to false because they do not change the underlying data of the table - pub data_change: bool, - /// Map containing metadata about this file - pub tags: Option>>, -} - -///Storage type of deletion vector -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] -#[serde()] -pub enum StorageType { - /// Stored at relative path derived from a UUID. - #[serde(rename = "u")] - UuidRelativePath, - /// Stored as inline string. - #[serde(rename = "i")] - Inline, - /// Stored at an absolute path. - #[serde(rename = "p")] - AbsolutePath, -} - -impl Default for StorageType { - fn default() -> Self { - Self::UuidRelativePath // seems to be used by Databricks and therefore most common - } -} - -impl FromStr for StorageType { - type Err = ProtocolError; - - fn from_str(s: &str) -> Result { - match s { - "u" => Ok(Self::UuidRelativePath), - "i" => Ok(Self::Inline), - "p" => Ok(Self::AbsolutePath), - _ => Err(ProtocolError::InvalidDeletionVectorStorageType( - s.to_string(), - )), - } - } -} - -impl ToString for StorageType { - fn to_string(&self) -> String { - match self { - Self::UuidRelativePath => "u".to_string(), - Self::Inline => "i".to_string(), - Self::AbsolutePath => "p".to_string(), - } - } -} - -/// Describes deleted rows of a parquet file as part of an add or remove action -#[derive(Serialize, Deserialize, Clone, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct DeletionVector { - ///storageType of the deletion vector. 
p = Absolute Path, i = Inline, u = UUid Relative Path - pub storage_type: StorageType, - - ///If storageType = 'u' then - ///If storageType = 'i' then of the deletion vector data - ///If storageType = 'p' then - pub path_or_inline_dv: String, - - ///Start of the data for this DV in number of bytes from the beginning of the file it is stored in. Always None (absent in JSON) when storageType = 'i'. - pub offset: Option, - - ///Size of the serialized DV in bytes (raw data size, i.e. before base85 encoding, if inline). - pub size_in_bytes: i32, - - ///Number of rows the given DV logically removes from the file. - pub cardinality: i64, -} - -impl PartialEq for DeletionVector { - fn eq(&self, other: &Self) -> bool { - self.storage_type == other.storage_type - && self.path_or_inline_dv == other.path_or_inline_dv - && self.offset == other.offset - && self.size_in_bytes == other.size_in_bytes - && self.cardinality == other.cardinality - } -} - -impl Eq for DeletionVector {} - -/// Delta log action that describes a parquet data file that is part of the table. -#[derive(Serialize, Deserialize, Clone, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct Add { - /// A relative path, from the root of the table, to a file that should be added to the table - #[serde(with = "serde_path")] - pub path: String, - /// The size of this file in bytes - pub size: i64, - /// A map from partition column to value for this file - pub partition_values: HashMap>, - /// Partition values stored in raw parquet struct format. In this struct, the column names - /// correspond to the partition columns and the values are stored in their corresponding data - /// type. This is a required field when the table is partitioned and the table property - /// delta.checkpoint.writeStatsAsStruct is set to true. If the table is not partitioned, this - /// column can be omitted. - /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet")] - #[serde(skip_serializing, skip_deserializing)] - pub partition_values_parsed: Option, - /// Partition values stored in raw parquet struct format. In this struct, the column names - /// correspond to the partition columns and the values are stored in their corresponding data - /// type. This is a required field when the table is partitioned and the table property - /// delta.checkpoint.writeStatsAsStruct is set to true. If the table is not partitioned, this - /// column can be omitted. - /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet2")] - #[serde(skip_serializing, skip_deserializing)] - pub partition_values_parsed: Option, - /// The time this file was created, as milliseconds since the epoch - pub modification_time: i64, - /// When false the file must already be present in the table or the records in the added file - /// must be contained in one or more remove actions in the same version - /// - /// streaming queries that are tailing the transaction log can use this flag to skip actions - /// that would not affect the final results. - pub data_change: bool, - /// Contains statistics (e.g., count, min/max values for columns) about the data in this file - pub stats: Option, - /// Contains statistics (e.g., count, min/max values for columns) about the data in this file in - /// raw parquet format. This field needs to be written when statistics are available and the - /// table property: delta.checkpoint.writeStatsAsStruct is set to true. 
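The deletion vector descriptor being relocated in this hunk documents its storage as single-letter codes. An illustrative stand-alone mapping of those codes (not the library's type, which this patch moves to `kernel::DeletionVectorDescriptor`):

```rust
// Illustrative only: the single-letter storage codes for deletion vectors,
// as documented on the struct being moved into the kernel module.
#[derive(Debug, PartialEq)]
enum DvStorage {
    UuidRelativePath, // "u": path derived from a UUID, relative to the table root
    Inline,           // "i": DV bytes carried inline (base85-encoded) in the action
    AbsolutePath,     // "p": absolute path to the DV file
}

fn parse_storage_type(code: &str) -> Option<DvStorage> {
    match code {
        "u" => Some(DvStorage::UuidRelativePath),
        "i" => Some(DvStorage::Inline),
        "p" => Some(DvStorage::AbsolutePath),
        _ => None,
    }
}

fn main() {
    assert_eq!(parse_storage_type("i"), Some(DvStorage::Inline));
    assert_eq!(parse_storage_type("x"), None);
}
```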
- /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet")] - #[serde(skip_serializing, skip_deserializing)] - pub stats_parsed: Option, - /// Contains statistics (e.g., count, min/max values for columns) about the data in this file in - /// raw parquet format. This field needs to be written when statistics are available and the - /// table property: delta.checkpoint.writeStatsAsStruct is set to true. - /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet2")] - #[serde(skip_serializing, skip_deserializing)] - pub stats_parsed: Option, - /// Map containing metadata about this file - #[serde(skip_serializing_if = "Option::is_none")] - pub tags: Option>>, - - /// Metadata about deletion vector - #[serde(skip_serializing_if = "Option::is_none")] - pub deletion_vector: Option, -} - impl Hash for Add { fn hash(&self, state: &mut H) { self.path.hash(state); @@ -468,127 +308,6 @@ impl Add { } } -/// Describes the data format of files in the table. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] -pub struct Format { - /// Name of the encoding for files in this table. - provider: String, - /// A map containing configuration options for the format. - options: HashMap>, -} - -impl Format { - /// Allows creation of a new action::Format - pub fn new(provider: String, options: Option>>) -> Self { - let options = options.unwrap_or_default(); - Self { provider, options } - } - - /// Return the Format provider - pub fn get_provider(self) -> String { - self.provider - } -} - -// Assuming this is a more appropriate default than derived Default -impl Default for Format { - fn default() -> Self { - Self { - provider: "parquet".to_string(), - options: Default::default(), - } - } -} - -/// Return a default empty schema to be used for edge-cases when a schema is missing -fn default_schema() -> String { - warn!("A `metaData` action was missing a `schemaString` and has been given an empty schema"); - r#"{"type":"struct", "fields": []}"#.into() -} - -/// Action that describes the metadata of the table. -/// This is a top-level action in Delta log entries. -#[derive(Serialize, Deserialize, Debug, Default, Clone)] -#[serde(rename_all = "camelCase")] -pub struct MetaData { - /// Unique identifier for this table - pub id: Guid, - /// User-provided identifier for this table - pub name: Option, - /// User-provided description for this table - pub description: Option, - /// Specification of the encoding for the files stored in the table - pub format: Format, - /// Schema of the table - #[serde(default = "default_schema")] - pub schema_string: String, - /// An array containing the names of columns by which the data should be partitioned - pub partition_columns: Vec, - /// The time when this metadata action is created, in milliseconds since the Unix epoch - pub created_time: Option, - /// A map containing configuration options for the table - pub configuration: HashMap>, -} - -impl MetaData { - /// Returns the table schema from the embedded schema string contained within the metadata - /// action. 
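The `get_schema` helper removed below parsed `schema_string` on demand; with the kernel types the same JSON deserializes directly into `StructType`. A sketch (not part of the patch), assuming the `kernel` re-exports used elsewhere in this diff:

```rust
use deltalake_core::kernel::{Metadata, StructType};

// Sketch: the schemaString carried by a metaData action is the JSON form of the
// kernel StructType, so it can be parsed with serde_json directly.
fn table_schema(metadata: &Metadata) -> Result<StructType, serde_json::Error> {
    serde_json::from_str(&metadata.schema_string)
}
```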
- pub fn get_schema(&self) -> Result { - serde_json::from_str(&self.schema_string) - } -} - -impl TryFrom for MetaData { - type Error = ProtocolError; - - fn try_from(metadata: DeltaTableMetaData) -> Result { - let schema_string = serde_json::to_string(&metadata.schema) - .map_err(|source| ProtocolError::SerializeOperation { source })?; - Ok(Self { - id: metadata.id, - name: metadata.name, - description: metadata.description, - format: metadata.format, - schema_string, - partition_columns: metadata.partition_columns, - created_time: metadata.created_time, - configuration: metadata.configuration, - }) - } -} - -/// Represents a tombstone (deleted file) in the Delta log. -/// This is a top-level action in Delta log entries. -#[derive(Serialize, Deserialize, Clone, Eq, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct Remove { - /// The path of the file that is removed from the table. - #[serde(with = "serde_path")] - pub path: String, - /// The timestamp when the remove was added to table state. - pub deletion_timestamp: Option, - /// Whether data is changed by the remove. A table optimize will report this as false for - /// example, since it adds and removes files by combining many files into one. - pub data_change: bool, - /// When true the fields partitionValues, size, and tags are present - /// - /// NOTE: Although it's defined as required in scala delta implementation, but some writes - /// it's still nullable so we keep it as Option<> for compatibly. - pub extended_file_metadata: Option, - /// A map from partition column to value for this file. - #[serde(skip_serializing_if = "Option::is_none")] - pub partition_values: Option>>, - /// Size of this file in bytes - #[serde(skip_serializing_if = "Option::is_none")] - pub size: Option, - /// Map containing metadata about this file - #[serde(skip_serializing_if = "Option::is_none")] - pub tags: Option>>, - /// Metadata about deletion vector - #[serde(skip_serializing_if = "Option::is_none")] - pub deletion_vector: Option, -} - impl Hash for Remove { fn hash(&self, state: &mut H) { self.path.hash(state); @@ -616,296 +335,21 @@ impl PartialEq for Remove { } } -/// Action used by streaming systems to track progress using application-specific versions to -/// enable idempotency. -#[derive(Serialize, Deserialize, Debug, Default, Clone)] -#[serde(rename_all = "camelCase")] -pub struct Txn { - /// A unique identifier for the application performing the transaction. - pub app_id: String, - /// An application-specific numeric identifier for this transaction. - pub version: i64, - /// The time when this transaction action was created in milliseconds since the Unix epoch. - pub last_updated: Option, -} - -/// Action used to increase the version of the Delta protocol required to read or write to the -/// table. -#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -pub struct Protocol { - /// Minimum version of the Delta read protocol a client must implement to correctly read the - /// table. - pub min_reader_version: i32, - /// Minimum version of the Delta write protocol a client must implement to correctly read the - /// table. 
- pub min_writer_version: i32, - /// Table features are missing from older versions - /// The table features this reader supports - #[serde(skip_serializing_if = "Option::is_none")] - pub reader_features: Option>, - /// Table features are missing from older versions - /// The table features this writer supports - #[serde(skip_serializing_if = "Option::is_none")] - pub writer_features: Option>, -} - -/// Features table readers can support as well as let users know -/// what is supported -#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] -pub enum ReaderFeatures { - /// Mapping of one column to another - #[serde(alias = "columnMapping")] - COLUMN_MAPPING, - /// Deletion vectors for merge, update, delete - #[serde(alias = "deletionVectors")] - DELETION_VECTORS, - /// timestamps without timezone support - #[serde(alias = "timestampNtz")] - TIMESTAMP_WITHOUT_TIMEZONE, - /// version 2 of checkpointing - #[serde(alias = "v2Checkpoint")] - V2_CHECKPOINT, - /// If we do not match any other reader features - #[serde(untagged)] - OTHER(String), -} - -#[allow(clippy::from_over_into)] -impl Into for ReaderFeatures { - fn into(self) -> usize { - match self { - ReaderFeatures::OTHER(_) => 0, - ReaderFeatures::COLUMN_MAPPING => 2, - ReaderFeatures::DELETION_VECTORS - | ReaderFeatures::TIMESTAMP_WITHOUT_TIMEZONE - | ReaderFeatures::V2_CHECKPOINT => 3, - } - } -} - -#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] -impl From<&parquet::record::Field> for ReaderFeatures { - fn from(value: &parquet::record::Field) -> Self { - match value { - parquet::record::Field::Str(feature) => match feature.as_str() { - "columnMapping" => ReaderFeatures::COLUMN_MAPPING, - "deletionVectors" => ReaderFeatures::DELETION_VECTORS, - "timestampNtz" => ReaderFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "v2Checkpoint" => ReaderFeatures::V2_CHECKPOINT, - f => ReaderFeatures::OTHER(f.to_string()), - }, - f => ReaderFeatures::OTHER(f.to_string()), - } - } -} - -impl From for ReaderFeatures { - fn from(value: String) -> Self { - match value.as_str() { - "columnMapping" => ReaderFeatures::COLUMN_MAPPING, - "deletionVectors" => ReaderFeatures::DELETION_VECTORS, - "timestampNtz" => ReaderFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "v2Checkpoint" => ReaderFeatures::V2_CHECKPOINT, - f => ReaderFeatures::OTHER(f.to_string()), - } - } -} - -/// Features table writers can support as well as let users know -/// what is supported -#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] -pub enum WriterFeatures { - /// Append Only Tables - #[serde(alias = "appendOnly")] - APPEND_ONLY, - /// Table invariants - #[serde(alias = "invariants")] - INVARIANTS, - /// Check constraints on columns - #[serde(alias = "checkConstraints")] - CHECK_CONSTRAINTS, - /// CDF on a table - #[serde(alias = "changeDataFeed")] - CHANGE_DATA_FEED, - /// Columns with generated values - #[serde(alias = "generatedColumns")] - GENERATED_COLUMNS, - /// Mapping of one column to another - #[serde(alias = "columnMapping")] - COLUMN_MAPPING, - /// ID Columns - #[serde(alias = "identityColumns")] - IDENTITY_COLUMNS, - /// Deletion vectors for merge, update, delete - #[serde(alias = "deletionVectors")] - DELETION_VECTORS, - /// Row tracking on tables - #[serde(alias = "rowTracking")] - ROW_TRACKING, - /// timestamps without timezone support - #[serde(alias = "timestampNtz")] - TIMESTAMP_WITHOUT_TIMEZONE, - /// domain specific metadata - #[serde(alias = "domainMetadata")] - DOMAIN_METADATA, - /// version 2 of checkpointing - #[serde(alias = 
"v2Checkpoint")] - V2_CHECKPOINT, - /// Iceberg compatability support - #[serde(alias = "icebergCompatV1")] - ICEBERG_COMPAT_V1, - /// If we do not match any other reader features - #[serde(untagged)] - OTHER(String), -} - -#[allow(clippy::from_over_into)] -impl Into for WriterFeatures { - fn into(self) -> usize { - match self { - WriterFeatures::OTHER(_) => 0, - WriterFeatures::APPEND_ONLY | WriterFeatures::INVARIANTS => 2, - WriterFeatures::CHECK_CONSTRAINTS => 3, - WriterFeatures::CHANGE_DATA_FEED | WriterFeatures::GENERATED_COLUMNS => 4, - WriterFeatures::COLUMN_MAPPING => 5, - WriterFeatures::IDENTITY_COLUMNS - | WriterFeatures::DELETION_VECTORS - | WriterFeatures::ROW_TRACKING - | WriterFeatures::TIMESTAMP_WITHOUT_TIMEZONE - | WriterFeatures::DOMAIN_METADATA - | WriterFeatures::V2_CHECKPOINT - | WriterFeatures::ICEBERG_COMPAT_V1 => 7, - } - } -} - -impl From for WriterFeatures { - fn from(value: String) -> Self { - match value.as_str() { - "appendOnly" => WriterFeatures::APPEND_ONLY, - "invariants" => WriterFeatures::INVARIANTS, - "checkConstraints" => WriterFeatures::CHECK_CONSTRAINTS, - "changeDataFeed" => WriterFeatures::CHANGE_DATA_FEED, - "generatedColumns" => WriterFeatures::GENERATED_COLUMNS, - "columnMapping" => WriterFeatures::COLUMN_MAPPING, - "identityColumns" => WriterFeatures::IDENTITY_COLUMNS, - "deletionVectors" => WriterFeatures::DELETION_VECTORS, - "rowTracking" => WriterFeatures::ROW_TRACKING, - "timestampNtz" => WriterFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "domainMetadata" => WriterFeatures::DOMAIN_METADATA, - "v2Checkpoint" => WriterFeatures::V2_CHECKPOINT, - "icebergCompatV1" => WriterFeatures::ICEBERG_COMPAT_V1, - f => WriterFeatures::OTHER(f.to_string()), - } - } -} - -#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] -impl From<&parquet::record::Field> for WriterFeatures { - fn from(value: &parquet::record::Field) -> Self { - match value { - parquet::record::Field::Str(feature) => match feature.as_str() { - "appendOnly" => WriterFeatures::APPEND_ONLY, - "invariants" => WriterFeatures::INVARIANTS, - "checkConstraints" => WriterFeatures::CHECK_CONSTRAINTS, - "changeDataFeed" => WriterFeatures::CHANGE_DATA_FEED, - "generatedColumns" => WriterFeatures::GENERATED_COLUMNS, - "columnMapping" => WriterFeatures::COLUMN_MAPPING, - "identityColumns" => WriterFeatures::IDENTITY_COLUMNS, - "deletionVectors" => WriterFeatures::DELETION_VECTORS, - "rowTracking" => WriterFeatures::ROW_TRACKING, - "timestampNtz" => WriterFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "domainMetadata" => WriterFeatures::DOMAIN_METADATA, - "v2Checkpoint" => WriterFeatures::V2_CHECKPOINT, - "icebergCompatV1" => WriterFeatures::ICEBERG_COMPAT_V1, - f => WriterFeatures::OTHER(f.to_string()), - }, - f => WriterFeatures::OTHER(f.to_string()), - } - } -} - -/// The commitInfo is a fairly flexible action within the delta specification, where arbitrary data can be stored. -/// However the reference implementation as well as delta-rs store useful information that may for instance -/// allow us to be more permissive in commit conflict resolution. 
-#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)] -#[serde(rename_all = "camelCase")] -pub struct CommitInfo { - /// Timestamp in millis when the commit was created - #[serde(skip_serializing_if = "Option::is_none")] - pub timestamp: Option, - /// Id of the user invoking the commit - #[serde(skip_serializing_if = "Option::is_none")] - pub user_id: Option, - /// Name of the user invoking the commit - #[serde(skip_serializing_if = "Option::is_none")] - pub user_name: Option, - /// The operation performed during the - #[serde(skip_serializing_if = "Option::is_none")] - pub operation: Option, - /// Parameters used for table operation - #[serde(skip_serializing_if = "Option::is_none")] - pub operation_parameters: Option>, - /// Version of the table when the operation was started - #[serde(skip_serializing_if = "Option::is_none")] - pub read_version: Option, - /// The isolation level of the commit - #[serde(skip_serializing_if = "Option::is_none")] - pub isolation_level: Option, - /// TODO - #[serde(skip_serializing_if = "Option::is_none")] - pub is_blind_append: Option, - /// Delta engine which created the commit. - #[serde(skip_serializing_if = "Option::is_none")] - pub engine_info: Option, - /// Additional provenance information for the commit - #[serde(flatten, default)] - pub info: Map, -} - -/// The domain metadata action contains a configuration (string) for a named metadata domain -#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)] -#[serde(rename_all = "camelCase")] -pub struct DomainMetaData { - /// Identifier for this domain (system or user-provided) - pub domain: String, - /// String containing configuration for the metadata domain - pub configuration: String, - /// When `true` the action serves as a tombstone - pub removed: bool, -} - -/// Represents an action in the Delta log. The Delta log is an aggregate of all actions performed -/// on the table, so the full list of actions is required to properly read a table. -#[derive(Serialize, Deserialize, Debug, Clone)] -pub enum Action { - /// Changes the current metadata of the table. Must be present in the first version of a table. - /// Subsequent `metaData` actions completely overwrite previous metadata. - metaData(MetaData), - /// Adds CDC a file to the table state. - cdc(AddCDCFile), - /// Adds a file to the table state. - add(Add), - /// Removes a file from the table state. - remove(Remove), - /// Used by streaming systems to track progress externally with application specific version - /// identifiers. - txn(Txn), - /// Describes the minimum reader and writer versions required to read or write to the table. - protocol(Protocol), - /// Describes commit provenance information for the table. 
- commitInfo(CommitInfo), - /// Describe s the configuration for a named metadata domain - domainMetadata(DomainMetaData), -} +impl TryFrom for Metadata { + type Error = ProtocolError; -impl Action { - /// Create a commit info from a map - pub fn commit_info(info: Map) -> Self { - Self::commitInfo(CommitInfo { - info, - ..Default::default() + fn try_from(metadata: DeltaTableMetaData) -> Result { + let schema_string = serde_json::to_string(&metadata.schema) + .map_err(|source| ProtocolError::SerializeOperation { source })?; + Ok(Self { + id: metadata.id, + name: metadata.name, + description: metadata.description, + format: metadata.format, + schema_string, + partition_columns: metadata.partition_columns, + created_time: metadata.created_time, + configuration: metadata.configuration, }) } } @@ -1232,6 +676,7 @@ pub(crate) async fn find_latest_check_point_for_version( #[cfg(test)] mod tests { use super::*; + use crate::kernel::Action; #[test] fn test_load_table_stats() { @@ -1245,7 +690,17 @@ mod tests { }) .to_string(), ), - ..Default::default() + path: Default::default(), + data_change: true, + deletion_vector: None, + partition_values: Default::default(), + partition_values_parsed: None, + stats_parsed: None, + tags: None, + size: 0, + modification_time: 0, + base_row_id: None, + default_row_commit_version: None, }; let stats = action.get_stats().unwrap().unwrap(); @@ -1310,7 +765,17 @@ mod tests { }) .to_string(), ), - ..Default::default() + path: Default::default(), + data_change: true, + deletion_vector: None, + partition_values: Default::default(), + partition_values_parsed: None, + stats_parsed: None, + tags: None, + size: 0, + modification_time: 0, + base_row_id: None, + default_row_commit_version: None, }; let stats = action.get_stats().unwrap().unwrap(); diff --git a/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs b/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs index 474a61a153..e68971be42 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs @@ -4,7 +4,7 @@ use parquet2::page::DataPage; use super::validity::ValidityRowIndexIter; use super::{split_page, ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; /// Parquet dictionary primitive value reader pub struct SomeBooleanValueIter<'a> { diff --git a/crates/deltalake-core/src/protocol/parquet2_read/map.rs b/crates/deltalake-core/src/protocol/parquet2_read/map.rs index 0739feae2d..df4dc94ab7 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/map.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/map.rs @@ -3,7 +3,7 @@ use parquet2::page::{DataPage, DictPage}; use super::string::for_each_repeated_string_field_value_with_idx; use super::{ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; #[derive(Default)] pub struct MapState { diff --git a/crates/deltalake-core/src/protocol/parquet2_read/mod.rs b/crates/deltalake-core/src/protocol/parquet2_read/mod.rs index ae5461d2b6..3314559e43 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/mod.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/mod.rs @@ -10,9 +10,10 @@ use parquet2::read::decompress; use parquet2::read::get_page_iterator; use parquet2::read::levels::get_bit_width; -use super::{ProtocolError, ReaderFeatures, WriterFeatures}; -use crate::protocol::{Action, Add, CommitInfo, MetaData, Protocol, Remove, Txn}; -use crate::schema::Guid; +use super::ProtocolError; +use 
crate::kernel::{ + Action, Add, CommitInfo, Metadata, Protocol, ReaderFeatures, Remove, Txn, WriterFeatures, +}; use boolean::for_each_boolean_field_value; use map::for_each_map_field_value; use primitive::for_each_primitive_field_value; @@ -138,12 +139,12 @@ impl ActionVariant for Add { type Variant = Add; fn default_action() -> Action { - Action::add(Self::default()) + Action::Add(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::add(v) => Ok(v), + Action::Add(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect Add action, got: {:?}", a @@ -156,7 +157,7 @@ impl ActionVariant for Remove { type Variant = Remove; fn default_action() -> Action { - Action::remove(Self { + Action::Remove(Self { data_change: true, extended_file_metadata: Some(false), ..Default::default() @@ -165,7 +166,7 @@ impl ActionVariant for Remove { fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::remove(v) => Ok(v), + Action::Remove(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect remove action, got: {:?}", a @@ -174,16 +175,16 @@ impl ActionVariant for Remove { } } -impl ActionVariant for MetaData { - type Variant = MetaData; +impl ActionVariant for Metadata { + type Variant = Metadata; fn default_action() -> Action { - Action::metaData(Self::default()) + Action::Metadata(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::metaData(v) => Ok(v), + Action::Metadata(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect metadata action, got: {:?}", a @@ -196,12 +197,12 @@ impl ActionVariant for Txn { type Variant = Txn; fn default_action() -> Action { - Action::txn(Self::default()) + Action::Txn(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::txn(v) => Ok(v), + Action::Txn(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect txn action, got: {:?}", a @@ -214,12 +215,12 @@ impl ActionVariant for Protocol { type Variant = Protocol; fn default_action() -> Action { - Action::protocol(Self::default()) + Action::Protocol(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::protocol(v) => Ok(v), + Action::Protocol(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect protocol action, got: {:?}", a @@ -232,12 +233,12 @@ impl ActionVariant for CommitInfo { type Variant = CommitInfo; fn default_action() -> Action { - Action::commitInfo(CommitInfo::default()) + Action::CommitInfo(CommitInfo::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::commitInfo(v) => Ok(v), + Action::CommitInfo(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect commitInfo action, got: {:?}", a @@ -485,7 +486,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: Guid| action.id = v, + |action: &mut Metadata, v: String| action.id = v, )?; } "name" => { @@ -494,7 +495,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.name = Some(v), + |action: &mut Metadata, v: String| action.name = Some(v), )?; } "description" => { @@ -503,7 +504,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.description = Some(v), + |action: &mut Metadata, v: String| action.description = Some(v), )?; } "format" => { @@ -515,7 +516,7 @@ fn 
deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.format.provider = v, + |action: &mut Metadata, v: String| action.format.provider = v, )?; } "options" => { @@ -526,7 +527,7 @@ fn deserialize_metadata_column_page( dict, descriptor, &mut state.metadata_fromat_options, - |action: &mut MetaData, v: (Vec, Vec>)| { + |action: &mut Metadata, v: (Vec, Vec>)| { action.format.options = hashmap_from_kvpairs(v.0, v.1); }, )?; @@ -545,7 +546,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.schema_string = v, + |action: &mut Metadata, v: String| action.schema_string = v, )?; } "partitionColumns" => { @@ -554,7 +555,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: Vec| action.partition_columns = v, + |action: &mut Metadata, v: Vec| action.partition_columns = v, )?; } "createdTime" => { @@ -563,7 +564,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: i64| action.created_time = Some(v), + |action: &mut Metadata, v: i64| action.created_time = Some(v), )?; } "configuration" => { @@ -574,7 +575,7 @@ fn deserialize_metadata_column_page( dict, descriptor, &mut state.metadata_configuration, - |action: &mut MetaData, v: (Vec, Vec>)| { + |action: &mut Metadata, v: (Vec, Vec>)| { action.configuration = hashmap_from_kvpairs(v.0, v.1); }, )?; @@ -762,20 +763,20 @@ mod tests { for row_group in meta_data.row_groups { let actions = actions_from_row_group(row_group, &mut reader).unwrap(); match &actions[0] { - Action::protocol(protocol) => { + Action::Protocol(protocol) => { assert_eq!(protocol.min_reader_version, 1,); assert_eq!(protocol.min_writer_version, 2,); } _ => panic!("expect protocol action"), } match &actions[1] { - Action::metaData(meta_data) => { + Action::Metadata(meta_data) => { assert_eq!(meta_data.id, "22ef18ba-191c-4c36-a606-3dad5cdf3830"); assert_eq!(meta_data.name, None); assert_eq!(meta_data.description, None); assert_eq!( meta_data.format, - crate::protocol::Format::new("parquet".to_string(), None), + crate::kernel::Format::new("parquet".to_string(), None), ); assert_eq!(meta_data.schema_string, "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}"); assert_eq!(meta_data.partition_columns.len(), 0); @@ -786,7 +787,7 @@ mod tests { } match &actions[2] { - Action::txn(txn) => { + Action::Txn(txn) => { assert_eq!(txn.app_id, "e4a20b59-dd0e-4c50-b074-e8ae4786df30"); assert_eq!(txn.version, 0); assert_eq!(txn.last_updated, Some(1564524299648)); @@ -794,7 +795,7 @@ mod tests { _ => panic!("expect txn action, got: {:?}", &actions[1]), } match &actions[3] { - Action::remove(remove) => { + Action::Remove(remove) => { assert_eq!( remove.path, "part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet" @@ -809,7 +810,7 @@ mod tests { _ => panic!("expect remove action, got: {:?}", &actions[2]), } match &actions[9] { - Action::add(add_action) => { + Action::Add(add_action) => { assert_eq!( add_action.path, "part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet" @@ -837,20 +838,20 @@ mod tests { for row_group in metadata.row_groups { let actions = actions_from_row_group(row_group, &mut reader).unwrap(); match &actions[0] { - Action::protocol(protocol) => { + Action::Protocol(protocol) => { assert_eq!(protocol.min_reader_version, 1,); assert_eq!(protocol.min_writer_version, 2,); } _ => panic!("expect protocol action"), } match 
&actions[1] { - Action::metaData(meta_data) => { + Action::Metadata(meta_data) => { assert_eq!(meta_data.id, "94ba8468-c676-4468-b326-adde3ab9dcd2"); assert_eq!(meta_data.name, None); assert_eq!(meta_data.description, None); assert_eq!( meta_data.format, - crate::protocol::Format::new("parquet".to_string(), None), + crate::kernel::Format::new("parquet".to_string(), None), ); assert_eq!( meta_data.schema_string, @@ -864,7 +865,7 @@ mod tests { } match &actions[2] { - Action::add(add_action) => { + Action::Add(add_action) => { assert_eq!(add_action.path, "f62d8868-d952-4f9d-8bb2-fd4e011ebf36"); assert_eq!(add_action.size, 100); assert_eq!(add_action.modification_time, 1661662807080); @@ -880,7 +881,7 @@ mod tests { _ => panic!("expect add action, got: {:?}", &actions[9]), } match &actions[3] { - Action::add(add_action) => { + Action::Add(add_action) => { assert_eq!(add_action.path, "8ac7d8e1-daab-48ef-9d05-ec22fb4b0d2f"); assert_eq!(add_action.size, 100); assert_eq!(add_action.modification_time, 1661662807097); diff --git a/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs b/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs index 29147ea8ca..16cb850f05 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs @@ -12,7 +12,7 @@ use parquet2::types::NativeType; use super::dictionary; use super::validity::ValidityRowIndexIter; use super::{split_page, ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; struct ExactChunksIter<'a, T: NativeType> { chunks: std::slice::ChunksExact<'a, u8>, diff --git a/crates/deltalake-core/src/protocol/parquet2_read/string.rs b/crates/deltalake-core/src/protocol/parquet2_read/string.rs index fc0ec574e0..391a9b9390 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/string.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/string.rs @@ -9,7 +9,7 @@ use super::dictionary; use super::dictionary::binary::BinaryPageDict; use super::validity::{ValidityRepeatedRowIndexIter, ValidityRowIndexIter}; use super::{split_page, split_page_nested, ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; pub trait StringValueIter<'a>: Iterator> { fn try_from_encoded_values( diff --git a/crates/deltalake-core/src/protocol/parquet_read/mod.rs b/crates/deltalake-core/src/protocol/parquet_read/mod.rs index d5e3e708b6..e89c73d4bd 100644 --- a/crates/deltalake-core/src/protocol/parquet_read/mod.rs +++ b/crates/deltalake-core/src/protocol/parquet_read/mod.rs @@ -6,12 +6,10 @@ use num_traits::cast::ToPrimitive; use parquet::record::{Field, ListAccessor, MapAccessor, RowAccessor}; use serde_json::json; -use crate::protocol::{ - Action, Add, AddCDCFile, ColumnCountStat, ColumnValueStat, DeletionVector, MetaData, Protocol, - ProtocolError, Remove, Stats, Txn, +use crate::kernel::{ + Action, Add, AddCDCFile, DeletionVectorDescriptor, Metadata, Protocol, Remove, StorageType, Txn, }; - -use super::StorageType; +use crate::protocol::{ColumnCountStat, ColumnValueStat, ProtocolError, Stats}; fn populate_hashmap_with_option_from_parquet_map( map: &mut HashMap>, @@ -46,10 +44,14 @@ impl AddCDCFile { } } -impl DeletionVector { +impl DeletionVectorDescriptor { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + cardinality: -1, + offset: None, + path_or_inline_dv: "".to_string(), + size_in_bytes: -1, + storage_type: StorageType::default(), }; for (i, (name, 
_)) in record.get_column_iter().enumerate() { match name.as_str() { @@ -99,7 +101,18 @@ impl DeletionVector { impl Add { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + path: "".to_string(), + size: -1, + modification_time: -1, + data_change: true, + partition_values_parsed: None, + partition_values: HashMap::new(), + stats: None, + stats_parsed: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + tags: None, }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -182,7 +195,8 @@ impl Add { }, "deletionVector" => match record.get_group(i) { Ok(row) => { - re.deletion_vector = Some(DeletionVector::from_parquet_record(row)?); + re.deletion_vector = + Some(DeletionVectorDescriptor::from_parquet_record(row)?); } _ => { re.deletion_vector = None; @@ -364,10 +378,17 @@ fn convert_date_to_string(value: i32) -> Result { Ok(format!("{}", dt.format("%Y-%m-%d"))) } -impl MetaData { +impl Metadata { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + id: "".to_string(), + name: None, + description: None, + partition_columns: vec![], + schema_string: "".to_string(), + created_time: None, + configuration: HashMap::new(), + format: Default::default(), }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -480,7 +501,14 @@ impl Remove { let mut re = Self { data_change: true, extended_file_metadata: Some(false), - ..Default::default() + deletion_timestamp: None, + deletion_vector: None, + partition_values: None, + path: "".to_string(), + size: None, + tags: None, + base_row_id: None, + default_row_commit_version: None, }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -595,7 +623,10 @@ impl Txn { impl Protocol { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + min_reader_version: -1, + min_writer_version: -1, + reader_features: None, + writer_features: None, }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -673,12 +704,12 @@ impl Action { let field = &fields[col_idx]; Ok(match field.get_basic_info().name() { - "add" => Action::add(Add::from_parquet_record(col_data)?), - "metaData" => Action::metaData(MetaData::from_parquet_record(col_data)?), - "remove" => Action::remove(Remove::from_parquet_record(col_data)?), - "txn" => Action::txn(Txn::from_parquet_record(col_data)?), - "protocol" => Action::protocol(Protocol::from_parquet_record(col_data)?), - "cdc" => Action::cdc(AddCDCFile::from_parquet_record(col_data)?), + "add" => Action::Add(Add::from_parquet_record(col_data)?), + "metaData" => Action::Metadata(Metadata::from_parquet_record(col_data)?), + "remove" => Action::Remove(Remove::from_parquet_record(col_data)?), + "txn" => Action::Txn(Txn::from_parquet_record(col_data)?), + "protocol" => Action::Protocol(Protocol::from_parquet_record(col_data)?), + "cdc" => Action::Cdc(AddCDCFile::from_parquet_record(col_data)?), name => { return Err(ProtocolError::InvalidField(format!( "Unexpected action from checkpoint: {name}", diff --git a/crates/deltalake-core/src/schema/arrow_convert.rs b/crates/deltalake-core/src/schema/arrow_convert.rs index 2b37b05c4a..d292362604 100644 --- a/crates/deltalake-core/src/schema/arrow_convert.rs +++ b/crates/deltalake-core/src/schema/arrow_convert.rs @@ -1,44 +1,41 @@ -//! 
Conversion between Delta Table schema and Arrow schema +use std::sync::Arc; -use crate::schema; -use arrow::datatypes::{ - DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, +use arrow_schema::{ + ArrowError, DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, }; -use arrow::error::ArrowError; use lazy_static::lazy_static; -use regex::Regex; -use std::convert::TryFrom; -use std::sync::Arc; -impl TryFrom<&schema::Schema> for ArrowSchema { +use super::super::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; + +impl TryFrom<&StructType> for ArrowSchema { type Error = ArrowError; - fn try_from(s: &schema::Schema) -> Result { + fn try_from(s: &StructType) -> Result { let fields = s - .get_fields() + .fields() .iter() - .map(>::try_from) + .map(>::try_from) .collect::, ArrowError>>()?; Ok(ArrowSchema::new(fields)) } } -impl TryFrom<&schema::SchemaField> for ArrowField { +impl TryFrom<&StructField> for ArrowField { type Error = ArrowError; - fn try_from(f: &schema::SchemaField) -> Result { + fn try_from(f: &StructField) -> Result { let metadata = f - .get_metadata() + .metadata() .iter() .map(|(key, val)| Ok((key.clone(), serde_json::to_string(val)?))) .collect::>() .map_err(|err| ArrowError::JsonError(err.to_string()))?; let field = ArrowField::new( - f.get_name(), - ArrowDataType::try_from(f.get_type())?, + f.name(), + ArrowDataType::try_from(f.data_type())?, f.is_nullable(), ) .with_metadata(metadata); @@ -47,119 +44,113 @@ impl TryFrom<&schema::SchemaField> for ArrowField { } } -impl TryFrom<&schema::SchemaTypeArray> for ArrowField { +impl TryFrom<&ArrayType> for ArrowField { type Error = ArrowError; - fn try_from(a: &schema::SchemaTypeArray) -> Result { + fn try_from(a: &ArrayType) -> Result { Ok(ArrowField::new( "item", - ArrowDataType::try_from(a.get_element_type())?, + ArrowDataType::try_from(a.element_type())?, a.contains_null(), )) } } -impl TryFrom<&schema::SchemaTypeMap> for ArrowField { +impl TryFrom<&MapType> for ArrowField { type Error = ArrowError; - fn try_from(a: &schema::SchemaTypeMap) -> Result { - Ok(ArrowField::new_map( - "map", + fn try_from(a: &MapType) -> Result { + Ok(ArrowField::new( "entries", - ArrowField::new("key", ArrowDataType::try_from(a.get_key_type())?, false), - ArrowField::new( - "value", - ArrowDataType::try_from(a.get_value_type())?, - a.get_value_contains_null(), + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::try_from(a.key_type())?, false), + ArrowField::new( + "value", + ArrowDataType::try_from(a.value_type())?, + a.value_contains_null(), + ), + ] + .into(), ), - false, - false, + false, // always non-null )) } } -impl TryFrom<&schema::SchemaDataType> for ArrowDataType { +impl TryFrom<&DataType> for ArrowDataType { type Error = ArrowError; - fn try_from(t: &schema::SchemaDataType) -> Result { + fn try_from(t: &DataType) -> Result { match t { - schema::SchemaDataType::primitive(p) => { - lazy_static! 
{ - static ref DECIMAL_REGEX: Regex = - Regex::new(r"\((\d{1,2}),(\d{1,2})\)").unwrap(); - } - match p.as_str() { - "string" => Ok(ArrowDataType::Utf8), - "long" => Ok(ArrowDataType::Int64), // undocumented type - "integer" => Ok(ArrowDataType::Int32), - "short" => Ok(ArrowDataType::Int16), - "byte" => Ok(ArrowDataType::Int8), - "float" => Ok(ArrowDataType::Float32), - "double" => Ok(ArrowDataType::Float64), - "boolean" => Ok(ArrowDataType::Boolean), - "binary" => Ok(ArrowDataType::Binary), - decimal if DECIMAL_REGEX.is_match(decimal) => { - let extract = DECIMAL_REGEX.captures(decimal).ok_or_else(|| { + DataType::Primitive(p) => { + match p { + PrimitiveType::String => Ok(ArrowDataType::Utf8), + PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type + PrimitiveType::Integer => Ok(ArrowDataType::Int32), + PrimitiveType::Short => Ok(ArrowDataType::Int16), + PrimitiveType::Byte => Ok(ArrowDataType::Int8), + PrimitiveType::Float => Ok(ArrowDataType::Float32), + PrimitiveType::Double => Ok(ArrowDataType::Float64), + PrimitiveType::Boolean => Ok(ArrowDataType::Boolean), + PrimitiveType::Binary => Ok(ArrowDataType::Binary), + PrimitiveType::Decimal(precision, scale) => { + let precision = u8::try_from(*precision).map_err(|_| { ArrowError::SchemaError(format!( - "Invalid decimal type for Arrow: {decimal}" + "Invalid precision for decimal: {}", + precision )) })?; - let precision = extract.get(1).and_then(|v| v.as_str().parse::().ok()); - let scale = extract.get(2).and_then(|v| v.as_str().parse::().ok()); - match (precision, scale) { - // TODO how do we decide which variant (128 / 256) to use? - (Some(p), Some(s)) => Ok(ArrowDataType::Decimal128(p, s)), - _ => Err(ArrowError::SchemaError(format!( - "Invalid precision or scale decimal type for Arrow: {decimal}" - ))), + let scale = i8::try_from(*scale).map_err(|_| { + ArrowError::SchemaError(format!("Invalid scale for decimal: {}", scale)) + })?; + + if precision <= 38 { + Ok(ArrowDataType::Decimal128(precision, scale)) + } else if precision <= 76 { + Ok(ArrowDataType::Decimal256(precision, scale)) + } else { + Err(ArrowError::SchemaError(format!( + "Precision too large to be represented in Arrow: {}", + precision + ))) } } - "date" => { + PrimitiveType::Date => { // A calendar date, represented as a year-month-day triple without a - // timezone. Stored as 4 bytes integer representing days sinece 1970-01-01 + // timezone. Stored as 4 bytes integer representing days since 1970-01-01 Ok(ArrowDataType::Date32) } - "timestamp" => { + PrimitiveType::Timestamp => { // Issue: https://github.com/delta-io/delta/issues/643 Ok(ArrowDataType::Timestamp(TimeUnit::Microsecond, None)) } - s => Err(ArrowError::SchemaError(format!( - "Invalid data type for Arrow: {s}" - ))), } } - schema::SchemaDataType::r#struct(s) => Ok(ArrowDataType::Struct( - s.get_fields() + DataType::Struct(s) => Ok(ArrowDataType::Struct( + s.fields() .iter() - .map(>::try_from) + .map(>::try_from) .collect::, ArrowError>>()? 
.into(), )), - schema::SchemaDataType::array(a) => { - Ok(ArrowDataType::List(Arc::new(>::try_from( - a - )?))) - } - schema::SchemaDataType::map(m) => Ok(ArrowDataType::Map( + DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(>::try_from(a)?))), + DataType::Map(m) => Ok(ArrowDataType::Map( Arc::new(ArrowField::new( "entries", ArrowDataType::Struct( vec![ ArrowField::new( "keys", - >::try_from( - m.get_key_type(), - )?, + >::try_from(m.key_type())?, false, ), ArrowField::new( "values", - >::try_from( - m.get_value_type(), - )?, - m.get_value_contains_null(), + >::try_from(m.value_type())?, + m.value_contains_null(), ), ] .into(), @@ -172,19 +163,20 @@ impl TryFrom<&schema::SchemaDataType> for ArrowDataType { } } -impl TryFrom<&ArrowSchema> for schema::Schema { +impl TryFrom<&ArrowSchema> for StructType { type Error = ArrowError; + fn try_from(arrow_schema: &ArrowSchema) -> Result { - let new_fields: Result, _> = arrow_schema + let new_fields: Result, _> = arrow_schema .fields() .iter() .map(|field| field.as_ref().try_into()) .collect(); - Ok(schema::Schema::new(new_fields?)) + Ok(StructType::new(new_fields?)) } } -impl TryFrom for schema::Schema { +impl TryFrom for StructType { type Error = ArrowError; fn try_from(arrow_schema: ArrowSchemaRef) -> Result { @@ -192,99 +184,86 @@ impl TryFrom for schema::Schema { } } -impl TryFrom<&ArrowField> for schema::SchemaField { +impl TryFrom<&ArrowField> for StructField { type Error = ArrowError; + fn try_from(arrow_field: &ArrowField) -> Result { - Ok(schema::SchemaField::new( + Ok(StructField::new( arrow_field.name().clone(), arrow_field.data_type().try_into()?, arrow_field.is_nullable(), - arrow_field - .metadata() - .iter() - .map(|(k, v)| (k.clone(), serde_json::Value::String(v.clone()))) - .collect(), - )) + ) + .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) } } -impl TryFrom<&ArrowDataType> for schema::SchemaDataType { +impl TryFrom<&ArrowDataType> for DataType { type Error = ArrowError; + fn try_from(arrow_datatype: &ArrowDataType) -> Result { match arrow_datatype { - ArrowDataType::Utf8 => Ok(schema::SchemaDataType::primitive("string".to_string())), - ArrowDataType::LargeUtf8 => Ok(schema::SchemaDataType::primitive("string".to_string())), - ArrowDataType::Int64 => Ok(schema::SchemaDataType::primitive("long".to_string())), // undocumented type - ArrowDataType::Int32 => Ok(schema::SchemaDataType::primitive("integer".to_string())), - ArrowDataType::Int16 => Ok(schema::SchemaDataType::primitive("short".to_string())), - ArrowDataType::Int8 => Ok(schema::SchemaDataType::primitive("byte".to_string())), - ArrowDataType::UInt64 => Ok(schema::SchemaDataType::primitive("long".to_string())), // undocumented type - ArrowDataType::UInt32 => Ok(schema::SchemaDataType::primitive("integer".to_string())), - ArrowDataType::UInt16 => Ok(schema::SchemaDataType::primitive("short".to_string())), - ArrowDataType::UInt8 => Ok(schema::SchemaDataType::primitive("byte".to_string())), - ArrowDataType::Float32 => Ok(schema::SchemaDataType::primitive("float".to_string())), - ArrowDataType::Float64 => Ok(schema::SchemaDataType::primitive("double".to_string())), - ArrowDataType::Boolean => Ok(schema::SchemaDataType::primitive("boolean".to_string())), - ArrowDataType::Binary => Ok(schema::SchemaDataType::primitive("binary".to_string())), - ArrowDataType::FixedSizeBinary(_) => { - Ok(schema::SchemaDataType::primitive("binary".to_string())) - } - ArrowDataType::LargeBinary => { - Ok(schema::SchemaDataType::primitive("binary".to_string())) - } - 
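The decimal handling above replaces the old regex parse of `decimal(p,s)` strings with the typed `PrimitiveType::Decimal(precision, scale)`, and now also falls back to `Decimal256` for precisions beyond the 128-bit range. A stand-alone sketch of that width selection:

```rust
use arrow_schema::{ArrowError, DataType as ArrowDataType};

// Sketch of the precision-based choice between Arrow decimal widths used above:
// precision <= 38 fits Decimal128, precision <= 76 fits Decimal256, larger is an error.
fn arrow_decimal(precision: u8, scale: i8) -> Result<ArrowDataType, ArrowError> {
    if precision <= 38 {
        Ok(ArrowDataType::Decimal128(precision, scale))
    } else if precision <= 76 {
        Ok(ArrowDataType::Decimal256(precision, scale))
    } else {
        Err(ArrowError::SchemaError(format!(
            "Precision too large to be represented in Arrow: {precision}"
        )))
    }
}

fn main() {
    assert!(matches!(arrow_decimal(20, 2), Ok(ArrowDataType::Decimal128(20, 2))));
    assert!(matches!(arrow_decimal(50, 2), Ok(ArrowDataType::Decimal256(50, 2))));
}
```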
ArrowDataType::Decimal128(p, s) => Ok(schema::SchemaDataType::primitive(format!( - "decimal({p},{s})" + ArrowDataType::Utf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::LargeUtf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::Int64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::Int32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::Int16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::Int8 => Ok(DataType::Primitive(PrimitiveType::Byte)), + ArrowDataType::UInt64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::UInt32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::UInt16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::UInt8 => Ok(DataType::Primitive(PrimitiveType::Boolean)), + ArrowDataType::Float32 => Ok(DataType::Primitive(PrimitiveType::Float)), + ArrowDataType::Float64 => Ok(DataType::Primitive(PrimitiveType::Double)), + ArrowDataType::Boolean => Ok(DataType::Primitive(PrimitiveType::Boolean)), + ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::Decimal128(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, ))), - ArrowDataType::Decimal256(p, s) => Ok(schema::SchemaDataType::primitive(format!( - "decimal({p},{s})" + ArrowDataType::Decimal256(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, ))), - ArrowDataType::Date32 => Ok(schema::SchemaDataType::primitive("date".to_string())), - ArrowDataType::Date64 => Ok(schema::SchemaDataType::primitive("date".to_string())), + ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)), + ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)), ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { - Ok(schema::SchemaDataType::primitive("timestamp".to_string())) + Ok(DataType::Primitive(PrimitiveType::Timestamp)) } ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) if tz.eq_ignore_ascii_case("utc") => { - Ok(schema::SchemaDataType::primitive("timestamp".to_string())) + Ok(DataType::Primitive(PrimitiveType::Timestamp)) } ArrowDataType::Struct(fields) => { - let converted_fields: Result, _> = fields + let converted_fields: Result, _> = fields .iter() .map(|field| field.as_ref().try_into()) .collect(); - Ok(schema::SchemaDataType::r#struct( - schema::SchemaTypeStruct::new(converted_fields?), - )) - } - ArrowDataType::List(field) => { - Ok(schema::SchemaDataType::array(schema::SchemaTypeArray::new( - Box::new((*field).data_type().try_into()?), - (*field).is_nullable(), - ))) - } - ArrowDataType::LargeList(field) => { - Ok(schema::SchemaDataType::array(schema::SchemaTypeArray::new( - Box::new((*field).data_type().try_into()?), - (*field).is_nullable(), - ))) - } - ArrowDataType::FixedSizeList(field, _) => { - Ok(schema::SchemaDataType::array(schema::SchemaTypeArray::new( - Box::new((*field).data_type().try_into()?), - (*field).is_nullable(), - ))) + Ok(DataType::Struct(Box::new(StructType::new( + converted_fields?, + )))) } + ArrowDataType::List(field) => Ok(DataType::Array(Box::new(ArrayType::new( + (*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( + 
(*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( + ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), + ))), ArrowDataType::Map(field, _) => { if let ArrowDataType::Struct(struct_fields) = field.data_type() { let key_type = struct_fields[0].data_type().try_into()?; let value_type = struct_fields[1].data_type().try_into()?; let value_type_nullable = struct_fields[1].is_nullable(); - Ok(schema::SchemaDataType::map(schema::SchemaTypeMap::new( - Box::new(key_type), - Box::new(value_type), + Ok(DataType::Map(Box::new(MapType::new( + key_type, + value_type, value_type_nullable, - ))) + )))) } else { panic!("DataType::Map should contain a struct field child"); } @@ -635,7 +614,6 @@ fn null_count_schema_for_fields(dest: &mut Vec, f: &ArrowField) { #[cfg(test)] mod tests { use arrow::array::ArrayData; - use arrow::datatypes::DataType; use arrow_array::Array; use arrow_array::{make_array, ArrayRef, MapArray, StringArray, StructArray}; use arrow_buffer::{Buffer, ToByteSlice}; @@ -790,33 +768,18 @@ mod tests { fn test_arrow_from_delta_decimal_type() { let precision = 20; let scale = 2; - let decimal_type = format!["decimal({precision},{scale})"]; - let decimal_field = crate::SchemaDataType::primitive(decimal_type); + let decimal_field = DataType::Primitive(PrimitiveType::Decimal(precision, scale)); assert_eq!( - >::try_from(&decimal_field).unwrap(), - ArrowDataType::Decimal128(precision, scale) + >::try_from(&decimal_field).unwrap(), + ArrowDataType::Decimal128(precision as u8, scale as i8) ); } - #[test] - fn test_arrow_from_delta_wrong_decimal_type() { - let precision = 20; - let scale = "wrong"; - let decimal_type = format!["decimal({precision},{scale})"]; - let _error = format!("Invalid precision or scale decimal type for Arrow: {scale}"); - let decimal_field = crate::SchemaDataType::primitive(decimal_type); - assert!(matches!( - >::try_from(&decimal_field) - .unwrap_err(), - arrow::error::ArrowError::SchemaError(_error), - )); - } - #[test] fn test_arrow_from_delta_timestamp_type() { - let timestamp_field = crate::SchemaDataType::primitive("timestamp".to_string()); + let timestamp_field = DataType::Primitive(PrimitiveType::Timestamp); assert_eq!( - >::try_from(×tamp_field).unwrap(), + >::try_from(×tamp_field).unwrap(), ArrowDataType::Timestamp(TimeUnit::Microsecond, None) ); } @@ -825,8 +788,8 @@ mod tests { fn test_delta_from_arrow_timestamp_type() { let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); assert_eq!( - >::try_from(×tamp_field).unwrap(), - crate::SchemaDataType::primitive("timestamp".to_string()) + >::try_from(×tamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) ); } @@ -835,8 +798,8 @@ mod tests { let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())); assert_eq!( - >::try_from(×tamp_field).unwrap(), - crate::SchemaDataType::primitive("timestamp".to_string()) + >::try_from(×tamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) ); } @@ -856,15 +819,15 @@ mod tests { )), false, ); - let converted_map: crate::SchemaDataType = (&arrow_map).try_into().unwrap(); + let converted_map: DataType = (&arrow_map).try_into().unwrap(); assert_eq!( converted_map, - crate::SchemaDataType::map(crate::SchemaTypeMap::new( - Box::new(crate::SchemaDataType::primitive("byte".to_string())), - Box::new(crate::SchemaDataType::primitive("binary".to_string())), + 
DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::Byte), + DataType::Primitive(PrimitiveType::Binary), true, - )) + ))) ); } @@ -891,7 +854,7 @@ mod tests { let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); let keys_data = StringArray::from_iter_values(keys); - let keys_field = Arc::new(Field::new("keys", DataType::Utf8, false)); + let keys_field = Arc::new(Field::new("keys", ArrowDataType::Utf8, false)); let values_field = Arc::new(Field::new( "values", values.data_type().clone(), @@ -903,7 +866,7 @@ mod tests { (values_field, make_array(values.to_data())), ]); - let map_data_type = DataType::Map( + let map_data_type = ArrowDataType::Map( Arc::new(Field::new( "entries", entry_struct.data_type().clone(), @@ -928,19 +891,19 @@ mod tests { ) .expect("Could not create a map array"); - let schema = >::try_from( - &crate::Schema::new(vec![crate::SchemaField::new( - "example".to_string(), - crate::SchemaDataType::map(crate::SchemaTypeMap::new( - Box::new(crate::SchemaDataType::primitive("string".to_string())), - Box::new(crate::SchemaDataType::primitive("binary".to_string())), + let schema = + >::try_from(&StructType::new(vec![ + StructField::new( + "example".to_string(), + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::String), + DataType::Primitive(PrimitiveType::Binary), + false, + ))), false, - )), - false, - HashMap::new(), - )]), - ) - .expect("Could not get schema"); + ), + ])) + .expect("Could not get schema"); let record_batch = arrow::record_batch::RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) @@ -1081,6 +1044,6 @@ mod tests { ), true, )); - let _converted: schema::SchemaField = field.as_ref().try_into().unwrap(); + let _converted: StructField = field.as_ref().try_into().unwrap(); } } diff --git a/crates/deltalake-core/src/schema/mod.rs b/crates/deltalake-core/src/schema/mod.rs index a853725fc6..54d73ec664 100644 --- a/crates/deltalake-core/src/schema/mod.rs +++ b/crates/deltalake-core/src/schema/mod.rs @@ -1,379 +1,2 @@ //! Delta Table schema implementation. -#![allow(non_snake_case, non_camel_case_types)] - -use serde::{Deserialize, Serialize}; -use serde_json::Value; -use std::borrow::Cow; -use std::collections::HashMap; - -use crate::errors::DeltaTableError; - -#[cfg(all(feature = "arrow", feature = "parquet"))] -pub mod arrow_convert; pub mod partitions; - -/// Type alias for a string expected to match a GUID/UUID format -pub type Guid = String; - -static STRUCT_TAG: &str = "struct"; -static ARRAY_TAG: &str = "array"; -static MAP_TAG: &str = "map"; - -/// An invariant for a column that is enforced on all writes to a Delta table. -#[derive(Eq, PartialEq, Debug, Default, Clone)] -pub struct Invariant { - /// The full path to the field. - pub field_name: String, - /// The SQL string that must always evaluate to true. - pub invariant_sql: String, -} - -impl Invariant { - /// Create a new invariant - pub fn new(field_name: &str, invariant_sql: &str) -> Self { - Self { - field_name: field_name.to_string(), - invariant_sql: invariant_sql.to_string(), - } - } -} - -/// Represents a struct field defined in the Delta table schema. 
-// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format -#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] -pub struct SchemaTypeStruct { - r#type: Cow<'static, str>, - fields: Vec, -} - -impl SchemaTypeStruct { - /// Create a new Schema using a vector of SchemaFields - pub fn new(fields: Vec) -> Self { - let tag = Cow::Borrowed(STRUCT_TAG); - Self { - r#type: tag, - fields, - } - } - - /// Returns the list of fields contained within the column struct. - pub fn get_fields(&self) -> &Vec { - &self.fields - } - - /// Returns an immutable reference of a specific `Field` instance selected by name. - pub fn get_field_with_name(&self, name: &str) -> Result<&SchemaField, DeltaTableError> { - Ok(&self.fields[self.index_of(name)?]) - } - - /// Find the index of the column with the given name. - pub fn index_of(&self, name: &str) -> Result { - for i in 0..self.fields.len() { - if self.fields[i].get_name() == name { - return Ok(i); - } - } - let valid_fields: Vec = self.fields.iter().map(|f| f.name.clone()).collect(); - Err(DeltaTableError::Generic(format!( - "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" - ))) - } - - /// Get all invariants in the schemas - pub fn get_invariants(&self) -> Result, DeltaTableError> { - let mut remaining_fields: Vec<(String, SchemaField)> = self - .get_fields() - .iter() - .map(|field| (field.name.clone(), field.clone())) - .collect(); - let mut invariants: Vec = Vec::new(); - - let add_segment = |prefix: &str, segment: &str| -> String { - if prefix.is_empty() { - segment.to_owned() - } else { - format!("{prefix}.{segment}") - } - }; - - while let Some((field_path, field)) = remaining_fields.pop() { - match field.r#type { - SchemaDataType::r#struct(inner) => { - remaining_fields.extend( - inner - .get_fields() - .iter() - .map(|field| { - let new_prefix = add_segment(&field_path, &field.name); - (new_prefix, field.clone()) - }) - .collect::>(), - ); - } - SchemaDataType::array(inner) => { - let element_field_name = add_segment(&field_path, "element"); - remaining_fields.push(( - element_field_name, - SchemaField::new("".to_string(), *inner.elementType, false, HashMap::new()), - )); - } - SchemaDataType::map(inner) => { - let key_field_name = add_segment(&field_path, "key"); - remaining_fields.push(( - key_field_name, - SchemaField::new("".to_string(), *inner.keyType, false, HashMap::new()), - )); - let value_field_name = add_segment(&field_path, "value"); - remaining_fields.push(( - value_field_name, - SchemaField::new("".to_string(), *inner.valueType, false, HashMap::new()), - )); - } - _ => {} - } - // JSON format: {"expression": {"expression": ""} } - if let Some(Value::String(invariant_json)) = field.metadata.get("delta.invariants") { - let json: Value = serde_json::from_str(invariant_json).map_err(|e| { - DeltaTableError::InvalidInvariantJson { - json_err: e, - line: invariant_json.to_string(), - } - })?; - if let Value::Object(json) = json { - if let Some(Value::Object(expr1)) = json.get("expression") { - if let Some(Value::String(sql)) = expr1.get("expression") { - invariants.push(Invariant::new(&field_path, sql)); - } - } - } - } - } - Ok(invariants) - } -} - -/// Describes a specific field of the Delta table schema. 
-#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -pub struct SchemaField { - // Name of this (possibly nested) column - name: String, - r#type: SchemaDataType, - // Boolean denoting whether this field can be null - nullable: bool, - // A JSON map containing information about this column. Keys prefixed with Delta are reserved - // for the implementation. - metadata: HashMap, -} - -impl SchemaField { - /// Create a new SchemaField from scratch - pub fn new( - name: String, - r#type: SchemaDataType, - nullable: bool, - metadata: HashMap, - ) -> Self { - Self { - name, - r#type, - nullable, - metadata, - } - } - - /// The column name of the schema field. - pub fn get_name(&self) -> &str { - &self.name - } - - /// The data type of the schema field. SchemaDataType defines the possible values. - pub fn get_type(&self) -> &SchemaDataType { - &self.r#type - } - - /// Whether the column/field is nullable. - pub fn is_nullable(&self) -> bool { - self.nullable - } - - /// Additional metadata about the column/field. - pub fn get_metadata(&self) -> &HashMap { - &self.metadata - } -} - -/// Schema definition for array type fields. -#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -pub struct SchemaTypeArray { - // type field is always the string "array", so we are ignoring it here - r#type: String, - // The type of element stored in this array represented as a string containing the name of a - // primitive type, a struct definition, an array definition or a map definition - elementType: Box, - // Boolean denoting whether this array can contain one or more null values - containsNull: bool, -} - -impl SchemaTypeArray { - /// Create a new SchemaTypeArray - pub fn new(elementType: Box, containsNull: bool) -> Self { - Self { - r#type: String::from(ARRAY_TAG), - elementType, - containsNull, - } - } - - /// The data type of each element contained in the array. - pub fn get_element_type(&self) -> &SchemaDataType { - &self.elementType - } - - /// Whether the column/field is allowed to contain null elements. - pub fn contains_null(&self) -> bool { - self.containsNull - } -} - -/// Schema definition for map type fields. -#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -pub struct SchemaTypeMap { - r#type: String, - keyType: Box, - valueType: Box, - valueContainsNull: bool, -} - -impl SchemaTypeMap { - /// Create a new SchemaTypeMap - pub fn new( - keyType: Box, - valueType: Box, - valueContainsNull: bool, - ) -> Self { - Self { - r#type: String::from(MAP_TAG), - keyType, - valueType, - valueContainsNull, - } - } - - /// The type of element used for the key of this map, represented as a string containing the - /// name of a primitive type, a struct definition, an array definition or a map definition - pub fn get_key_type(&self) -> &SchemaDataType { - &self.keyType - } - - /// The type of element contained in the value of this map, represented as a string containing the - /// name of a primitive type, a struct definition, an array definition or a map definition - pub fn get_value_type(&self) -> &SchemaDataType { - &self.valueType - } - - /// Whether the value field is allowed to contain null elements. - pub fn get_value_contains_null(&self) -> bool { - self.valueContainsNull - } -} - -/// Enum with variants for each top level schema data type. -#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -#[serde(untagged)] -pub enum SchemaDataType { - /// Variant representing non-array, non-map, non-struct fields. 
Wrapped value will contain the - /// the string name of the primitive type. - /// - /// Valid values are: - /// * string: utf8 - /// * long // undocumented, i64? - /// * integer: i32 - /// * short: i16 - /// * byte: i8 - /// * float: f32 - /// * double: f64 - /// * boolean: bool - /// * binary: a sequence of binary data - /// * date: A calendar date, represented as a year-month-day triple without a timezone - /// * timestamp: Microsecond precision timestamp without a timezone - /// * decimal: Signed decimal number with fixed precision (maximum number of digits) and scale (number of digits on right side of dot), where the precision and scale can be up to 38 - primitive(String), - /// Variant representing a struct. - r#struct(SchemaTypeStruct), - /// Variant representing an array. - array(SchemaTypeArray), - /// Variant representing a map. - map(SchemaTypeMap), -} - -/// Represents the schema of the delta table. -pub type Schema = SchemaTypeStruct; - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_get_invariants() { - let schema: Schema = serde_json::from_value(json!({ - "type": "struct", - "fields": [{"name": "x", "type": "string", "nullable": true, "metadata": {}}] - })) - .unwrap(); - let invariants = schema.get_invariants().unwrap(); - assert_eq!(invariants.len(), 0); - - let schema: Schema = serde_json::from_value(json!({ - "type": "struct", - "fields": [ - {"name": "x", "type": "integer", "nullable": true, "metadata": { - "delta.invariants": "{\"expression\": { \"expression\": \"x > 2\"} }" - }}, - {"name": "y", "type": "integer", "nullable": true, "metadata": { - "delta.invariants": "{\"expression\": { \"expression\": \"y < 4\"} }" - }} - ] - })) - .unwrap(); - let invariants = schema.get_invariants().unwrap(); - assert_eq!(invariants.len(), 2); - assert!(invariants.contains(&Invariant::new("x", "x > 2"))); - assert!(invariants.contains(&Invariant::new("y", "y < 4"))); - - let schema: Schema = serde_json::from_value(json!({ - "type": "struct", - "fields": [{ - "name": "a_map", - "type": { - "type": "map", - "keyType": "string", - "valueType": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [{ - "name": "d", - "type": "integer", - "metadata": { - "delta.invariants": "{\"expression\": { \"expression\": \"a_map.value.element.d < 4\"} }" - }, - "nullable": false - }] - }, - "containsNull": false - }, - "valueContainsNull": false - }, - "nullable": false, - "metadata": {} - }] - })).unwrap(); - let invariants = schema.get_invariants().unwrap(); - assert_eq!(invariants.len(), 1); - assert_eq!( - invariants[0], - Invariant::new("a_map.value.element.d", "a_map.value.element.d < 4") - ); - } -} diff --git a/crates/deltalake-core/src/schema/partitions.rs b/crates/deltalake-core/src/schema/partitions.rs index 3750038b3a..c2db1903fa 100644 --- a/crates/deltalake-core/src/schema/partitions.rs +++ b/crates/deltalake-core/src/schema/partitions.rs @@ -2,8 +2,8 @@ use std::convert::TryFrom; -use super::SchemaDataType; use crate::errors::DeltaTableError; +use crate::kernel::{DataType, PrimitiveType}; use std::cmp::Ordering; use std::collections::HashMap; @@ -40,18 +40,21 @@ pub struct PartitionFilter { fn compare_typed_value( partition_value: &str, filter_value: &str, - data_type: &SchemaDataType, + data_type: &DataType, ) -> Option { match data_type { - SchemaDataType::primitive(primitive_type) => match primitive_type.as_str() { - "long" | "integer" | "short" | "byte" => match filter_value.parse::() { + 
DataType::Primitive(primitive_type) => match primitive_type { + PrimitiveType::Long + | PrimitiveType::Integer + | PrimitiveType::Short + | PrimitiveType::Byte => match filter_value.parse::() { Ok(parsed_filter_value) => { let parsed_partition_value = partition_value.parse::().unwrap(); parsed_partition_value.partial_cmp(&parsed_filter_value) } _ => None, }, - "float" | "double" => match filter_value.parse::() { + PrimitiveType::Float | PrimitiveType::Double => match filter_value.parse::() { Ok(parsed_filter_value) => { let parsed_partition_value = partition_value.parse::().unwrap(); parsed_partition_value.partial_cmp(&parsed_filter_value) @@ -67,11 +70,7 @@ fn compare_typed_value( /// Partition filters methods for filtering the DeltaTable partitions. impl PartitionFilter { /// Indicates if a DeltaTable partition matches with the partition filter by key and value. - pub fn match_partition( - &self, - partition: &DeltaTablePartition, - data_type: &SchemaDataType, - ) -> bool { + pub fn match_partition(&self, partition: &DeltaTablePartition, data_type: &DataType) -> bool { if self.key != partition.key { return false; } @@ -109,12 +108,9 @@ impl PartitionFilter { pub fn match_partitions( &self, partitions: &[DeltaTablePartition], - partition_col_data_types: &HashMap<&str, &SchemaDataType>, + partition_col_data_types: &HashMap<&String, &DataType>, ) -> bool { - let data_type = partition_col_data_types - .get(self.key.as_str()) - .unwrap() - .to_owned(); + let data_type = partition_col_data_types.get(&self.key).unwrap().to_owned(); partitions .iter() .any(|partition| self.match_partition(partition, data_type)) diff --git a/crates/deltalake-core/src/storage/utils.rs b/crates/deltalake-core/src/storage/utils.rs index 80710efd9b..7e516c7217 100644 --- a/crates/deltalake-core/src/storage/utils.rs +++ b/crates/deltalake-core/src/storage/utils.rs @@ -9,7 +9,7 @@ use object_store::path::Path; use object_store::{DynObjectStore, ObjectMeta, Result as ObjectStoreResult}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::Add; +use crate::kernel::Add; use crate::table::builder::DeltaTableBuilder; /// Copies the contents from the `from` location into the `to` location @@ -109,7 +109,15 @@ mod tests { .to_string(), size: 123, modification_time: 123456789, - ..Default::default() + data_change: true, + stats: None, + partition_values: Default::default(), + tags: Default::default(), + base_row_id: None, + default_row_commit_version: None, + deletion_vector: None, + partition_values_parsed: None, + stats_parsed: None, }; let meta: ObjectMeta = (&add).try_into().unwrap(); diff --git a/crates/deltalake-core/src/table/config.rs b/crates/deltalake-core/src/table/config.rs index 60498767ab..3fa021ce6e 100644 --- a/crates/deltalake-core/src/table/config.rs +++ b/crates/deltalake-core/src/table/config.rs @@ -387,12 +387,12 @@ fn parse_int(value: &str) -> Result { #[cfg(test)] mod tests { use super::*; + use crate::kernel::StructType; use crate::table::DeltaTableMetaData; - use crate::Schema; use std::collections::HashMap; fn dummy_metadata() -> DeltaTableMetaData { - let schema = Schema::new(Vec::new()); + let schema = StructType::new(Vec::new()); DeltaTableMetaData::new(None, None, None, schema, Vec::new(), HashMap::new()) } diff --git a/crates/deltalake-core/src/table/mod.rs b/crates/deltalake-core/src/table/mod.rs index 0a1e3116f1..2b011ff608 100644 --- a/crates/deltalake-core/src/table/mod.rs +++ b/crates/deltalake-core/src/table/mod.rs @@ -22,13 +22,14 @@ use uuid::Uuid; use 
self::builder::DeltaTableConfig; use self::state::DeltaTableState; use crate::errors::DeltaTableError; +use crate::kernel::{ + Action, Add, CommitInfo, DataType, Format, Metadata, ReaderFeatures, Remove, StructType, + WriterFeatures, +}; use crate::partitions::PartitionFilter; use crate::protocol::{ - self, find_latest_check_point_for_version, get_last_checkpoint, Action, ReaderFeatures, - WriterFeatures, + find_latest_check_point_for_version, get_last_checkpoint, ProtocolError, Stats, }; -use crate::protocol::{Add, ProtocolError, Stats}; -use crate::schema::*; use crate::storage::{commit_uri_from_version, ObjectStoreRef}; pub mod builder; @@ -133,16 +134,17 @@ impl Eq for CheckPoint {} /// Delta table metadata #[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] pub struct DeltaTableMetaData { + // TODO make this a UUID? /// Unique identifier for this table - pub id: Guid, + pub id: String, /// User-provided identifier for this table pub name: Option, /// User-provided description for this table pub description: Option, /// Specification of the encoding for the files stored in the table - pub format: protocol::Format, + pub format: Format, /// Schema of the table - pub schema: Schema, + pub schema: StructType, /// An array containing the names of columns by which the data should be partitioned pub partition_columns: Vec, /// The time when this metadata action is created, in milliseconds since the Unix epoch @@ -156,8 +158,8 @@ impl DeltaTableMetaData { pub fn new( name: Option, description: Option, - format: Option, - schema: Schema, + format: Option, + schema: StructType, partition_columns: Vec, configuration: HashMap>, ) -> Self { @@ -181,19 +183,19 @@ impl DeltaTableMetaData { } /// Return partition fields along with their data type from the current schema. - pub fn get_partition_col_data_types(&self) -> Vec<(&str, &SchemaDataType)> { + pub fn get_partition_col_data_types(&self) -> Vec<(&String, &DataType)> { // JSON add actions contain a `partitionValues` field which is a map. // When loading `partitionValues_parsed` we have to convert the stringified partition values back to the correct data type. self.schema - .get_fields() + .fields() .iter() .filter_map(|f| { if self .partition_columns .iter() - .any(|s| s.as_str() == f.get_name()) + .any(|s| s.as_str() == f.name()) { - Some((f.get_name(), f.get_type())) + Some((f.name(), f.data_type())) } else { None } @@ -212,16 +214,16 @@ impl fmt::Display for DeltaTableMetaData { } } -impl TryFrom for DeltaTableMetaData { +impl TryFrom for DeltaTableMetaData { type Error = ProtocolError; - fn try_from(action_metadata: protocol::MetaData) -> Result { - let schema = action_metadata.get_schema()?; + fn try_from(action_metadata: Metadata) -> Result { + let schema = action_metadata.schema()?; Ok(Self { id: action_metadata.id, name: action_metadata.name, description: action_metadata.description, - format: action_metadata.format, + format: Format::default(), schema, partition_columns: action_metadata.partition_columns, created_time: action_metadata.created_time, @@ -667,7 +669,7 @@ impl DeltaTable { pub async fn history( &mut self, limit: Option, - ) -> Result, DeltaTableError> { + ) -> Result, DeltaTableError> { let mut version = match limit { Some(l) => max(self.version() - l as i64 + 1, 0), None => self.get_earliest_delta_log_version().await?, @@ -800,7 +802,7 @@ impl DeltaTable { } /// Returns a vector of active tombstones (i.e. `Remove` actions present in the current delta log). 
- pub fn get_tombstones(&self) -> impl Iterator { + pub fn get_tombstones(&self) -> impl Iterator { self.state.unexpired_tombstones() } @@ -833,13 +835,13 @@ impl DeltaTable { /// Return table schema parsed from transaction log. Return None if table hasn't been loaded or /// no metadata was found in the log. - pub fn schema(&self) -> Option<&Schema> { + pub fn schema(&self) -> Option<&StructType> { self.state.schema() } /// Return table schema parsed from transaction log. Return `DeltaTableError` if table hasn't /// been loaded or no metadata was found in the log. - pub fn get_schema(&self) -> Result<&Schema, DeltaTableError> { + pub fn get_schema(&self) -> Result<&StructType, DeltaTableError> { self.schema().ok_or(DeltaTableError::NoSchema) } @@ -923,13 +925,14 @@ impl std::fmt::Debug for DeltaTable { #[cfg(test)] mod tests { + use pretty_assertions::assert_eq; + use tempdir::TempDir; + use super::*; + use crate::kernel::{DataType, PrimitiveType, StructField}; use crate::operations::create::CreateBuilder; #[cfg(any(feature = "s3", feature = "s3-native-tls"))] use crate::table::builder::DeltaTableBuilder; - use pretty_assertions::assert_eq; - use std::collections::HashMap; - use tempdir::TempDir; #[tokio::test] async fn table_round_trip() { @@ -966,17 +969,15 @@ mod tests { .with_table_name("Test Table Create") .with_comment("This table is made to test the create function for a DeltaTable") .with_columns(vec![ - SchemaField::new( + StructField::new( "Id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "Name".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]) .await diff --git a/crates/deltalake-core/src/table/state.rs b/crates/deltalake-core/src/table/state.rs index af4ff6369d..26becd0703 100644 --- a/crates/deltalake-core/src/table/state.rs +++ b/crates/deltalake-core/src/table/state.rs @@ -12,13 +12,15 @@ use serde::{Deserialize, Serialize}; use super::config::TableConfig; use crate::errors::DeltaTableError; +use crate::kernel::{ + Action, Add, CommitInfo, DataType, DomainMetadata, ReaderFeatures, Remove, StructType, + WriterFeatures, +}; use crate::partitions::{DeltaTablePartition, PartitionFilter}; -use crate::protocol::{self, Action, Add, ProtocolError, ReaderFeatures, WriterFeatures}; -use crate::schema::SchemaDataType; +use crate::protocol::ProtocolError; use crate::storage::commit_uri_from_version; use crate::table::DeltaTableMetaData; use crate::DeltaTable; -use crate::Schema; #[cfg(any(feature = "parquet", feature = "parquet2"))] use super::{CheckPoint, DeltaTableConfig}; @@ -31,13 +33,13 @@ pub struct DeltaTableState { version: i64, // A remove action should remain in the state of the table as a tombstone until it has expired. 
// A tombstone expires when the creation timestamp of the delta file exceeds the expiration - tombstones: HashSet, + tombstones: HashSet, // active files for table state - files: Vec, + files: Vec, // Information added to individual commits - commit_infos: Vec, + commit_infos: Vec, // Domain metadatas provided by the system or user - domain_metadatas: Vec, + domain_metadatas: Vec, app_transaction_version: HashMap, min_reader_version: i32, min_writer_version: i32, @@ -78,7 +80,7 @@ impl DeltaTableState { let mut new_state = DeltaTableState::with_version(version); for line in reader.lines() { - let action: protocol::Action = serde_json::from_str(line?.as_str())?; + let action: Action = serde_json::from_str(line?.as_str())?; new_state.process_action( action, table.config.require_tombstones, @@ -112,13 +114,13 @@ impl DeltaTableState { let preader = SerializedFileReader::new(data)?; let schema = preader.metadata().file_metadata().schema(); if !schema.is_group() { - return Err(DeltaTableError::from(protocol::ProtocolError::Generic( + return Err(DeltaTableError::from(ProtocolError::Generic( "Action record in checkpoint should be a struct".to_string(), ))); } for record in preader.get_row_iter(None)? { self.process_action( - protocol::Action::from_parquet_record(schema, &record.unwrap())?, + Action::from_parquet_record(schema, &record.unwrap())?, table_config.require_tombstones, table_config.require_files, )?; @@ -134,8 +136,8 @@ impl DeltaTableState { let metadata = read_metadata(&mut reader)?; for row_group in metadata.row_groups { - for action in actions_from_row_group(row_group, &mut reader) - .map_err(protocol::ProtocolError::from)? + for action in + actions_from_row_group(row_group, &mut reader).map_err(ProtocolError::from)? { self.process_action( action, @@ -167,7 +169,7 @@ impl DeltaTableState { } /// List of commit info maps. - pub fn commit_infos(&self) -> &Vec { + pub fn commit_infos(&self) -> &Vec { &self.commit_infos } @@ -187,13 +189,13 @@ impl DeltaTableState { } /// Full list of tombstones (remove actions) representing files removed from table state). - pub fn all_tombstones(&self) -> &HashSet { + pub fn all_tombstones(&self) -> &HashSet { &self.tombstones } /// List of unexpired tombstones (remove actions) representing files removed from table state. /// The retention period is set by `deletedFileRetentionDuration` with default value of 1 week. - pub fn unexpired_tombstones(&self) -> impl Iterator { + pub fn unexpired_tombstones(&self) -> impl Iterator { let retention_timestamp = Utc::now().timestamp_millis() - self.tombstone_retention_millis; self.tombstones .iter() @@ -202,7 +204,7 @@ impl DeltaTableState { /// Full list of add actions representing all parquet files that are part of the current /// delta table state. - pub fn files(&self) -> &Vec { + pub fn files(&self) -> &Vec { self.files.as_ref() } @@ -247,7 +249,7 @@ impl DeltaTableState { } /// The table schema - pub fn schema(&self) -> Option<&Schema> { + pub fn schema(&self) -> Option<&StructType> { self.current_metadata.as_ref().map(|m| &m.schema) } @@ -339,30 +341,30 @@ impl DeltaTableState { /// Process given action by updating current state. 
fn process_action( &mut self, - action: protocol::Action, + action: Action, require_tombstones: bool, require_files: bool, ) -> Result<(), ProtocolError> { match action { // TODO: optionally load CDC into TableState - protocol::Action::cdc(_v) => {} - protocol::Action::add(v) => { + Action::Cdc(_v) => {} + Action::Add(v) => { if require_files { self.files.push(v); } } - protocol::Action::remove(v) => { + Action::Remove(v) => { if require_tombstones && require_files { self.tombstones.insert(v); } } - protocol::Action::protocol(v) => { + Action::Protocol(v) => { self.min_reader_version = v.min_reader_version; self.min_writer_version = v.min_writer_version; self.reader_features = v.reader_features; self.writer_features = v.writer_features; } - protocol::Action::metaData(v) => { + Action::Metadata(v) => { let md = DeltaTableMetaData::try_from(v)?; let table_config = TableConfig(&md.configuration); self.tombstone_retention_millis = @@ -372,16 +374,16 @@ impl DeltaTableState { self.enable_expired_log_cleanup = table_config.enable_expired_log_cleanup(); self.current_metadata = Some(md); } - protocol::Action::txn(v) => { + Action::Txn(v) => { *self .app_transaction_version .entry(v.app_id) .or_insert(v.version) = v.version; } - protocol::Action::commitInfo(v) => { + Action::CommitInfo(v) => { self.commit_infos.push(v); } - protocol::Action::domainMetadata(v) => { + Action::DomainMetadata(v) => { self.domain_metadatas.push(v); } } @@ -408,7 +410,7 @@ impl DeltaTableState { }); } - let partition_col_data_types: HashMap<&str, &SchemaDataType> = current_metadata + let partition_col_data_types: HashMap<&String, &DataType> = current_metadata .get_partition_col_data_types() .into_iter() .collect(); @@ -430,6 +432,7 @@ impl DeltaTableState { #[cfg(test)] mod tests { use super::*; + use crate::kernel::Txn; use pretty_assertions::assert_eq; #[test] @@ -478,7 +481,7 @@ mod tests { enable_expired_log_cleanup: true, }; - let txn_action = protocol::Action::txn(protocol::Txn { + let txn_action = Action::Txn(Txn { app_id: "abc".to_string(), version: 2, last_updated: Some(0), diff --git a/crates/deltalake-core/src/table/state_arrow.rs b/crates/deltalake-core/src/table/state_arrow.rs index 34f858f415..9d82c87326 100644 --- a/crates/deltalake-core/src/table/state_arrow.rs +++ b/crates/deltalake-core/src/table/state_arrow.rs @@ -18,9 +18,8 @@ use itertools::Itertools; use super::state::DeltaTableState; use crate::errors::DeltaTableError; +use crate::kernel::{DataType as DeltaDataType, StructType}; use crate::protocol::{ColumnCountStat, ColumnValueStat, Stats}; -use crate::SchemaDataType; -use crate::SchemaTypeStruct; impl DeltaTableState { /// Get an [arrow::record_batch::RecordBatch] containing add action data. @@ -152,8 +151,8 @@ impl DeltaTableState { .iter() .map( |name| -> Result { - let field = metadata.schema.get_field_with_name(name)?; - Ok(field.get_type().try_into()?) + let field = metadata.schema.field_with_name(name)?; + Ok(field.data_type().try_into()?) 
}, ) .collect::>()?; @@ -299,7 +298,7 @@ impl DeltaTableState { for add in self.files() { if let Some(value) = &add.deletion_vector { - storage_type.append_value(value.storage_type.to_string()); + storage_type.append_value(&value.storage_type); path_or_inline_div.append_value(value.path_or_inline_dv.clone()); if let Some(ofs) = value.offset { offset.append_value(ofs); @@ -415,7 +414,7 @@ impl DeltaTableState { }; let mut columnar_stats: Vec = SchemaLeafIterator::new(schema) - .filter(|(_path, datatype)| !matches!(datatype, SchemaDataType::r#struct(_))) + .filter(|(_path, datatype)| !matches!(datatype, DeltaDataType::Struct(_))) .map(|(path, datatype)| -> Result { let null_count = stats .iter() @@ -432,7 +431,7 @@ impl DeltaTableState { let arrow_type: arrow::datatypes::DataType = datatype.try_into()?; // Min and max are collected for primitive values, not list or maps - let min_values = if matches!(datatype, SchemaDataType::primitive(_)) { + let min_values = if matches!(datatype, DeltaDataType::Primitive(_)) { let min_values = stats .iter() .flat_map(|maybe_stat| { @@ -449,7 +448,7 @@ impl DeltaTableState { None }; - let max_values = if matches!(datatype, SchemaDataType::primitive(_)) { + let max_values = if matches!(datatype, DeltaDataType::Primitive(_)) { let max_values = stats .iter() .flat_map(|maybe_stat| { @@ -636,33 +635,33 @@ fn resolve_column_count_stat( } struct SchemaLeafIterator<'a> { - fields_remaining: VecDeque<(Vec<&'a str>, &'a SchemaDataType)>, + fields_remaining: VecDeque<(Vec<&'a str>, &'a DeltaDataType)>, } impl<'a> SchemaLeafIterator<'a> { - fn new(schema: &'a SchemaTypeStruct) -> Self { + fn new(schema: &'a StructType) -> Self { SchemaLeafIterator { fields_remaining: schema - .get_fields() + .fields() .iter() - .map(|field| (vec![field.get_name()], field.get_type())) + .map(|field| (vec![field.name().as_ref()], field.data_type())) .collect(), } } } impl<'a> std::iter::Iterator for SchemaLeafIterator<'a> { - type Item = (Vec<&'a str>, &'a SchemaDataType); + type Item = (Vec<&'a str>, &'a DeltaDataType); fn next(&mut self) -> Option { if let Some((path, datatype)) = self.fields_remaining.pop_front() { - if let SchemaDataType::r#struct(struct_type) = datatype { + if let DeltaDataType::Struct(struct_type) = datatype { // push child fields to front - for field in struct_type.get_fields() { + for field in struct_type.fields() { let mut new_path = path.clone(); - new_path.push(field.get_name()); + new_path.push(field.name()); self.fields_remaining - .push_front((new_path, field.get_type())); + .push_front((new_path, field.data_type())); } }; diff --git a/crates/deltalake-core/src/writer/json.rs b/crates/deltalake-core/src/writer/json.rs index f8d6d1a9e3..044ffc20e2 100644 --- a/crates/deltalake-core/src/writer/json.rs +++ b/crates/deltalake-core/src/writer/json.rs @@ -23,9 +23,10 @@ use super::utils::{ }; use super::{utils::PartitionPath, DeltaWriter, DeltaWriterError}; use crate::errors::DeltaTableError; +use crate::kernel::{Add, StructType}; use crate::table::builder::DeltaTableBuilder; use crate::table::DeltaTableMetaData; -use crate::{protocol::Add, DeltaTable, Schema}; +use crate::DeltaTable; use crate::{storage::DeltaObjectStore, writer::utils::ShareableBuffer}; type BadValue = (Value, ParquetError); @@ -33,7 +34,7 @@ type BadValue = (Value, ParquetError); /// Writes messages to a delta lake table. 
pub struct JsonWriter { storage: Arc, - arrow_schema_ref: Arc, + arrow_schema_ref: Arc, writer_properties: WriterProperties, partition_columns: Vec, arrow_writers: HashMap, @@ -206,7 +207,7 @@ impl JsonWriter { pub fn for_table(table: &DeltaTable) -> Result { // Initialize an arrow schema ref from the delta table schema let metadata = table.get_metadata()?; - let arrow_schema = >::try_from(&metadata.schema)?; + let arrow_schema = >::try_from(&metadata.schema)?; let arrow_schema_ref = Arc::new(arrow_schema); let partition_columns = metadata.partition_columns.clone(); @@ -232,7 +233,8 @@ impl JsonWriter { &mut self, metadata: &DeltaTableMetaData, ) -> Result { - let schema: ArrowSchema = >::try_from(&metadata.schema)?; + let schema: ArrowSchema = + >::try_from(&metadata.schema)?; let schema_updated = self.arrow_schema_ref.as_ref() != &schema || self.partition_columns != metadata.partition_columns; @@ -440,6 +442,11 @@ fn extract_partition_values( #[cfg(test)] mod tests { + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use std::fs::File; + use std::sync::Arc; + use super::*; use crate::arrow::array::Int32Array; use crate::arrow::datatypes::{ @@ -448,11 +455,6 @@ mod tests { use crate::writer::test_utils::get_delta_schema; use crate::writer::DeltaWriter; use crate::writer::JsonWriter; - use crate::Schema; - use parquet::file::reader::FileReader; - use parquet::file::serialized_reader::SerializedFileReader; - use std::fs::File; - use std::sync::Arc; #[tokio::test] async fn test_partition_not_written_to_parquet() { @@ -460,7 +462,7 @@ mod tests { let schema = get_delta_schema(); let path = table_dir.path().to_str().unwrap().to_string(); - let arrow_schema = >::try_from(&schema).unwrap(); + let arrow_schema = >::try_from(&schema).unwrap(); let mut writer = JsonWriter::try_new( path.clone(), Arc::new(arrow_schema), diff --git a/crates/deltalake-core/src/writer/mod.rs b/crates/deltalake-core/src/writer/mod.rs index 8c5512127f..478a0b11f2 100644 --- a/crates/deltalake-core/src/writer/mod.rs +++ b/crates/deltalake-core/src/writer/mod.rs @@ -8,8 +8,9 @@ use parquet::errors::ParquetError; use serde_json::Value; use crate::errors::DeltaTableError; +use crate::kernel::{Action, Add}; use crate::operations::transaction::commit; -use crate::protocol::{Action, Add, ColumnCountStat, DeltaOperation, SaveMode}; +use crate::protocol::{ColumnCountStat, DeltaOperation, SaveMode}; use crate::DeltaTable; pub use json::JsonWriter; @@ -133,7 +134,7 @@ pub trait DeltaWriter { /// Flush the internal write buffers to files in the delta table folder structure. /// and commit the changes to the Delta log, creating a new table version. 
async fn flush_and_commit(&mut self, table: &mut DeltaTable) -> Result { - let adds: Vec<_> = self.flush().await?.drain(..).map(Action::add).collect(); + let adds: Vec<_> = self.flush().await?.drain(..).map(Action::Add).collect(); let partition_cols = table.get_metadata()?.partition_columns.clone(); let partition_by = if !partition_cols.is_empty() { Some(partition_cols) diff --git a/crates/deltalake-core/src/writer/record_batch.rs b/crates/deltalake-core/src/writer/record_batch.rs index a6486ae109..b673146907 100644 --- a/crates/deltalake-core/src/writer/record_batch.rs +++ b/crates/deltalake-core/src/writer/record_batch.rs @@ -26,9 +26,10 @@ use super::utils::{ }; use super::{DeltaWriter, DeltaWriterError}; use crate::errors::DeltaTableError; +use crate::kernel::{Add, StructType}; use crate::table::builder::DeltaTableBuilder; use crate::table::DeltaTableMetaData; -use crate::{protocol::Add, storage::DeltaObjectStore, DeltaTable, Schema}; +use crate::{storage::DeltaObjectStore, DeltaTable}; /// Writes messages to a delta lake table. pub struct RecordBatchWriter { @@ -76,7 +77,8 @@ impl RecordBatchWriter { pub fn for_table(table: &DeltaTable) -> Result { // Initialize an arrow schema ref from the delta table schema let metadata = table.get_metadata()?; - let arrow_schema = >::try_from(&metadata.schema.clone())?; + let arrow_schema = + >::try_from(&metadata.schema.clone())?; let arrow_schema_ref = Arc::new(arrow_schema); let partition_columns = metadata.partition_columns.clone(); @@ -103,7 +105,8 @@ impl RecordBatchWriter { &mut self, metadata: &DeltaTableMetaData, ) -> Result { - let schema: ArrowSchema = >::try_from(&metadata.schema)?; + let schema: ArrowSchema = + >::try_from(&metadata.schema)?; let schema_updated = self.arrow_schema_ref.as_ref() != &schema || self.partition_columns != metadata.partition_columns; @@ -450,7 +453,7 @@ mod tests { */ #[tokio::test] async fn test_divide_record_batch_with_map_single_partition() { - use crate::{DeltaOps, SchemaTypeStruct}; + use crate::DeltaOps; let table = crate::writer::test_utils::create_bare_table(); let partition_cols = vec!["modified".to_string()]; @@ -466,13 +469,13 @@ mod tests { ] }"#; - let delta_schema: SchemaTypeStruct = + let delta_schema: StructType = serde_json::from_str(delta_schema).expect("Failed to parse schema"); let table = DeltaOps(table) .create() .with_partition_columns(partition_cols.to_vec()) - .with_columns(delta_schema.get_fields().clone()) + .with_columns(delta_schema.fields().clone()) .await .unwrap(); @@ -484,7 +487,7 @@ mod tests { .as_bytes(); let schema: ArrowSchema = - >::try_from(&delta_schema).unwrap(); + >::try_from(&delta_schema).unwrap(); // Using a batch size of two since the buf above only has two records let mut decoder = ReaderBuilder::new(Arc::new(schema)) diff --git a/crates/deltalake-core/src/writer/stats.rs b/crates/deltalake-core/src/writer/stats.rs index 6cd1961798..2e4f6ac177 100644 --- a/crates/deltalake-core/src/writer/stats.rs +++ b/crates/deltalake-core/src/writer/stats.rs @@ -11,7 +11,8 @@ use parquet::{ }; use super::*; -use crate::protocol::{Add, ColumnValueStat, Stats}; +use crate::kernel::Add; +use crate::protocol::{ColumnValueStat, Stats}; /// Creates an [`Add`] log action struct. 
pub fn create_add( @@ -32,13 +33,15 @@ pub fn create_add( path, size, partition_values: partition_values.to_owned(), - partition_values_parsed: None, modification_time, data_change: true, stats: Some(stats_string), - stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + stats_parsed: None, + partition_values_parsed: None, }) } diff --git a/crates/deltalake-core/src/writer/test_utils.rs b/crates/deltalake-core/src/writer/test_utils.rs index f140c2aa7b..d67931c096 100644 --- a/crates/deltalake-core/src/writer/test_utils.rs +++ b/crates/deltalake-core/src/writer/test_utils.rs @@ -7,10 +7,11 @@ use arrow::compute::take; use arrow_array::{Int32Array, Int64Array, RecordBatch, StringArray, StructArray, UInt32Array}; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; -use crate::operations::{create::CreateBuilder, DeltaOps}; -use crate::schema::{Schema, SchemaTypeStruct}; +use crate::kernel::{DataType as DeltaDataType, PrimitiveType, StructField, StructType}; +use crate::operations::create::CreateBuilder; +use crate::operations::DeltaOps; use crate::table::DeltaTableMetaData; -use crate::{DeltaConfigKey, DeltaTable, DeltaTableBuilder, SchemaDataType, SchemaField}; +use crate::{DeltaConfigKey, DeltaTable, DeltaTableBuilder}; pub type TestResult = Result<(), Box>; @@ -131,25 +132,22 @@ fn data_without_null() -> (Int32Array, StringArray, StringArray) { (base_int, base_str, base_mod) } -pub fn get_delta_schema() -> Schema { - Schema::new(vec![ - SchemaField::new( +pub fn get_delta_schema() -> StructType { + StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DeltaDataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]) } @@ -250,36 +248,31 @@ pub fn get_record_batch_with_nested_struct() -> RecordBatch { .unwrap() } -pub fn get_delta_schema_with_nested_struct() -> Schema { - Schema::new(vec![ - SchemaField::new( +pub fn get_delta_schema_with_nested_struct() -> StructType { + StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DeltaDataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( String::from("nested"), - SchemaDataType::r#struct(SchemaTypeStruct::new(vec![SchemaField::new( + DeltaDataType::Struct(Box::new(StructType::new(vec![StructField::new( String::from("count"), - SchemaDataType::primitive(String::from("integer")), + DeltaDataType::Primitive(PrimitiveType::Integer), true, - Default::default(), - )])), + )]))), true, - Default::default(), ), ]) } @@ -291,7 +284,7 @@ pub async fn setup_table_with_configuration( let table_schema = get_delta_schema(); DeltaOps::new_in_memory() 
.create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_configuration_property(key, value) .await .expect("Failed to create table") @@ -314,7 +307,7 @@ pub async fn create_initialized_table(partition_cols: &[String]) -> DeltaTable { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partition_cols) .await .unwrap() diff --git a/crates/deltalake-core/tests/checkpoint_writer.rs b/crates/deltalake-core/tests/checkpoint_writer.rs index 6788346ef8..b1fc28faed 100644 --- a/crates/deltalake-core/tests/checkpoint_writer.rs +++ b/crates/deltalake-core/tests/checkpoint_writer.rs @@ -211,7 +211,7 @@ mod checkpoints_with_tombstones { use super::*; use ::object_store::path::Path as ObjectStorePath; use chrono::Utc; - use deltalake_core::protocol::*; + use deltalake_core::kernel::*; use deltalake_core::table::config::DeltaConfigKey; use deltalake_core::*; use maplit::hashmap; @@ -346,6 +346,8 @@ mod checkpoints_with_tombstones { size: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, }) .collect(); @@ -357,8 +359,8 @@ mod checkpoints_with_tombstones { let actions = removes .iter() .cloned() - .map(Action::remove) - .chain(std::iter::once(Action::add(add.clone()))) + .map(Action::Remove) + .chain(std::iter::once(Action::Add(add.clone()))) .collect(); let operation = DeltaOperation::Optimize { predicate: None, @@ -389,7 +391,7 @@ mod checkpoints_with_tombstones { let actions = actions .iter() .filter_map(|a| match a { - Action::remove(r) => Some(r.clone()), + Action::Remove(r) => Some(r.clone()), _ => None, }) .collect(); @@ -408,6 +410,8 @@ mod checkpoints_with_tombstones { size: Some(100), tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, } } diff --git a/crates/deltalake-core/tests/command_optimize.rs b/crates/deltalake-core/tests/command_optimize.rs index 70d161d69e..a923d0064d 100644 --- a/crates/deltalake-core/tests/command_optimize.rs +++ b/crates/deltalake-core/tests/command_optimize.rs @@ -4,18 +4,19 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use std::{collections::HashMap, error::Error, sync::Arc}; use arrow_array::{Int32Array, RecordBatch, StringArray}; -use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use arrow_schema::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use arrow_select::concat::concat_batches; use deltalake_core::errors::DeltaTableError; +use deltalake_core::kernel::{Action, DataType, PrimitiveType, Remove, StructField}; use deltalake_core::operations::optimize::{ create_merge_plan, MetricDetails, Metrics, OptimizeType, }; use deltalake_core::operations::transaction::commit; use deltalake_core::operations::DeltaOps; -use deltalake_core::protocol::{Action, DeltaOperation, Remove}; +use deltalake_core::protocol::DeltaOperation; use deltalake_core::storage::ObjectStoreRef; use deltalake_core::writer::{DeltaWriter, RecordBatchWriter}; -use deltalake_core::{DeltaTable, PartitionFilter, Path, SchemaDataType, SchemaField}; +use deltalake_core::{DeltaTable, PartitionFilter, Path}; use futures::TryStreamExt; use object_store::ObjectStore; use parquet::arrow::async_reader::ParquetObjectReader; @@ -32,23 +33,20 @@ struct Context { async fn setup_test(partitioned: bool) -> Result> { let columns = vec![ - SchemaField::new( + 
StructField::new( "x".to_owned(), - SchemaDataType::primitive("integer".to_owned()), + DataType::Primitive(PrimitiveType::Integer), false, - HashMap::new(), ), - SchemaField::new( + StructField::new( "y".to_owned(), - SchemaDataType::primitive("integer".to_owned()), + DataType::Primitive(PrimitiveType::Integer), false, - HashMap::new(), ), - SchemaField::new( + StructField::new( "date".to_owned(), - SchemaDataType::primitive("string".to_owned()), + DataType::Primitive(PrimitiveType::String), false, - HashMap::new(), ), ]; @@ -92,9 +90,9 @@ fn generate_random_batch>( Ok(RecordBatch::try_new( Arc::new(ArrowSchema::new(vec![ - Field::new("x", DataType::Int32, false), - Field::new("y", DataType::Int32, false), - Field::new("date", DataType::Utf8, false), + Field::new("x", ArrowDataType::Int32, false), + Field::new("y", ArrowDataType::Int32, false), + Field::new("date", ArrowDataType::Utf8, false), ])), vec![Arc::new(x_array), Arc::new(y_array), Arc::new(date_array)], )?) @@ -121,9 +119,9 @@ fn tuples_to_batch>( Ok(RecordBatch::try_new( Arc::new(ArrowSchema::new(vec![ - Field::new("x", DataType::Int32, false), - Field::new("y", DataType::Int32, false), - Field::new("date", DataType::Utf8, false), + Field::new("x", ArrowDataType::Int32, false), + Field::new("y", ArrowDataType::Int32, false), + Field::new("date", ArrowDataType::Utf8, false), ])), vec![Arc::new(x_array), Arc::new(y_array), Arc::new(date_array)], )?) @@ -294,12 +292,14 @@ async fn test_conflict_for_remove_actions() -> Result<(), Box> { partition_values: Some(add.partition_values.clone()), tags: Some(HashMap::new()), deletion_vector: add.deletion_vector.clone(), + base_row_id: add.base_row_id, + default_row_commit_version: add.default_row_commit_version, }; let operation = DeltaOperation::Delete { predicate: None }; commit( other_dt.object_store().as_ref(), - &vec![Action::remove(remove)], + &vec![Action::Remove(remove)], operation, &other_dt.state, None, diff --git a/crates/deltalake-core/tests/command_restore.rs b/crates/deltalake-core/tests/command_restore.rs index ac9a37d73b..80c2083261 100644 --- a/crates/deltalake-core/tests/command_restore.rs +++ b/crates/deltalake-core/tests/command_restore.rs @@ -2,12 +2,12 @@ use arrow::datatypes::Schema as ArrowSchema; use arrow_array::{Int32Array, RecordBatch}; -use arrow_schema::{DataType, Field}; +use arrow_schema::{DataType as ArrowDataType, Field}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; +use deltalake_core::kernel::{DataType, PrimitiveType, StructField}; use deltalake_core::protocol::SaveMode; -use deltalake_core::{DeltaOps, DeltaTable, SchemaDataType, SchemaField}; +use deltalake_core::{DeltaOps, DeltaTable}; use rand::Rng; -use std::collections::HashMap; use std::error::Error; use std::fs; use std::sync::Arc; @@ -21,17 +21,15 @@ struct Context { async fn setup_test() -> Result> { let columns = vec![ - SchemaField::new( + StructField::new( "id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), ]; @@ -77,8 +75,8 @@ fn get_record_batch() -> RecordBatch { } let schema = ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, true), - Field::new("value", DataType::Int32, true), + Field::new("id", ArrowDataType::Int32, true), + Field::new("value", ArrowDataType::Int32, true), ]); let id_array = 
Int32Array::from(id_vec); diff --git a/crates/deltalake-core/tests/command_vacuum.rs b/crates/deltalake-core/tests/command_vacuum.rs index f44e1f86b1..0007f479d5 100644 --- a/crates/deltalake-core/tests/command_vacuum.rs +++ b/crates/deltalake-core/tests/command_vacuum.rs @@ -1,9 +1,9 @@ use chrono::Duration; use common::clock::TestClock; use common::TestContext; +use deltalake_core::kernel::StructType; use deltalake_core::operations::vacuum::Clock; use deltalake_core::operations::DeltaOps; -use deltalake_core::Schema; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; use serde_json::json; use std::sync::Arc; @@ -11,7 +11,7 @@ use std::sync::Arc; mod common; /// Basic schema -pub fn get_xy_date_schema() -> Schema { +pub fn get_xy_date_schema() -> StructType { serde_json::from_value(json!({ "type": "struct", "fields": [ @@ -24,8 +24,8 @@ pub fn get_xy_date_schema() -> Schema { } /// Schema that contains a column prefiexed with _ -pub fn get_vacuum_underscore_schema() -> Schema { - serde_json::from_value::(json!({ +pub fn get_vacuum_underscore_schema() -> StructType { + serde_json::from_value::(json!({ "type": "struct", "fields": [ {"name": "x", "type": "integer", "nullable": false, "metadata": {}}, diff --git a/crates/deltalake-core/tests/commit_info_format.rs b/crates/deltalake-core/tests/commit_info_format.rs index ba7d80a726..de69397e32 100644 --- a/crates/deltalake-core/tests/commit_info_format.rs +++ b/crates/deltalake-core/tests/commit_info_format.rs @@ -1,8 +1,9 @@ #![allow(dead_code)] mod fs_common; +use deltalake_core::kernel::Action; use deltalake_core::operations::transaction::commit; -use deltalake_core::protocol::{Action, DeltaOperation, SaveMode}; +use deltalake_core::protocol::{DeltaOperation, SaveMode}; use serde_json::json; use std::error::Error; use tempdir::TempDir; @@ -13,7 +14,7 @@ async fn test_operational_parameters() -> Result<(), Box> { let mut table = fs_common::create_table(path.path().to_str().unwrap(), None).await; let add = fs_common::add(0); - let actions = vec![Action::add(add)]; + let actions = vec![Action::Add(add)]; let operation = DeltaOperation::Write { mode: SaveMode::Append, partition_by: Some(vec!["some_partition".to_string()]), diff --git a/crates/deltalake-core/tests/common/mod.rs b/crates/deltalake-core/tests/common/mod.rs index a53d8b7641..80df899323 100644 --- a/crates/deltalake-core/tests/common/mod.rs +++ b/crates/deltalake-core/tests/common/mod.rs @@ -1,12 +1,13 @@ #![allow(dead_code, unused_variables)] use bytes::Bytes; +use deltalake_core::kernel::{Action, Add, Remove, StructType}; use deltalake_core::operations::create::CreateBuilder; use deltalake_core::operations::transaction::commit; -use deltalake_core::protocol::{self, Add, DeltaOperation, Remove, SaveMode}; +use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::DeltaObjectStore; +use deltalake_core::DeltaTable; use deltalake_core::DeltaTableBuilder; -use deltalake_core::{DeltaTable, Schema}; use object_store::{path::Path, ObjectStore}; use std::any::Any; use std::collections::HashMap; @@ -74,7 +75,7 @@ impl TestContext { //Create and set a new table from the provided schema pub async fn create_table_from_schema( &mut self, - schema: Schema, + schema: StructType, partitions: &[&str], ) -> DeltaTable { let p = partitions @@ -86,7 +87,7 @@ impl TestContext { .with_object_store(backend) .with_table_name("delta-rs_test_table") .with_comment("Table created by delta-rs tests") - .with_columns(schema.get_fields().clone()) + 
.with_columns(schema.fields().clone()) .with_partition_columns(p) .await .unwrap() @@ -133,14 +134,20 @@ pub async fn add_file( modification_time: create_time, partition_values: part_values, data_change: true, - ..Default::default() + stats: None, + stats_parsed: None, + partition_values_parsed: None, + tags: None, + default_row_commit_version: None, + base_row_id: None, + deletion_vector: None, }; let operation = DeltaOperation::Write { mode: SaveMode::Append, partition_by: None, predicate: None, }; - let actions = vec![protocol::Action::add(add)]; + let actions = vec![Action::Add(add)]; commit( table.object_store().as_ref(), &actions, @@ -170,10 +177,15 @@ pub async fn remove_file( deletion_timestamp: Some(deletion_timestamp), partition_values: Some(part_values), data_change: true, - ..Default::default() + extended_file_metadata: None, + size: None, + deletion_vector: None, + default_row_commit_version: None, + base_row_id: None, + tags: None, }; let operation = DeltaOperation::Delete { predicate: None }; - let actions = vec![protocol::Action::remove(remove)]; + let actions = vec![Action::Remove(remove)]; commit( table.object_store().as_ref(), &actions, diff --git a/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..3760ad9930 --- /dev/null +++ b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}} +{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}} +{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..e5bcdc1163 --- /dev/null +++ b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1677811194429,"operation":"DELETE","operationParameters":{"predicate":"[\"(spark_catalog.delta.`/tmp/table-with-dv-small`.value IN (0, 
9))\"]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"0","numRemovedBytes":"0","numCopiedRows":"0","numDeletionVectorsAdded":"1","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"0","executionTimeMs":"10364","numDeletedRows":"2","scanTimeMs":"9869","numAddedFiles":"0","numAddedBytes":"0","rewriteTimeMs":"479"},"engineInfo":"Databricks-Runtime/","txnId":"6d9555a2-0e3b-4c15-80c0-d5c3b0cf1277"}} +{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}} diff --git a/crates/deltalake-core/tests/data/table-with-dv-small/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin b/crates/deltalake-core/tests/data/table-with-dv-small/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin new file mode 100644 index 0000000000000000000000000000000000000000..f1a01e661cdcca08ff5d67e7d2de53381980735a GIT binary patch literal 45 lcmZQ%U|>+Wc-b+>@u6oHnRZaa}xk;gAOoS z#<6}t%0rSCNt@&m3E|*~#Cc*mq6q?Bjf0sEC&T3yyEjN{9KCms%M}*aS7r27rN~Tj zzA#1W7Im$F+m7qFUCSlpIkYWjA7Cc8`45BN8(r(ouK2DKhm&oqokhKfIaLJUzYWIu zPlHKl678-<$u_Z>3nwp@5?4qBej=(UKN%Q>#iA`8S!W3Kv+Rn6JI+Zl%15 zS5`$GRbJ1F6QviWH~GBwGEAG$c3l+NBa^IBOI45~tF^{Z6NZvi&--82o2)mRFB=fg z4w&qQd5e|0q&+nA%=X0kY0=qF(izCXnGE_3#gJBXUG{z7KkJ-?b)pv?9F9Xjj^mLU lg%~U_#xTV3XgCPMU~!JGNsB^M@u}mw^bwBeg)ZpZ{R3QVoUi}@ literal 0 HcmV?d00001 diff --git a/crates/deltalake-core/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json b/crates/deltalake-core/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..a7941cd087 --- /dev/null +++ b/crates/deltalake-core/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1678020185201,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"548"},"engineInfo":"Apache-Spark/3.3.0 Delta-Lake/2.3.0rc1","txnId":"07c0f996-3854-4456-b68b-d1e35e3888cd"}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"6524c99f-9a76-4ea1-8ad4-e428a7e065d7","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1678020184802}} 
+{"add":{"path":"part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet","partitionValues":{},"size":548,"modificationTime":1678020185157,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0}}"}} diff --git a/crates/deltalake-core/tests/data/table-without-dv-small/part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet b/crates/deltalake-core/tests/data/table-without-dv-small/part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7ce78a86b0bdb21b622b4297ad585b378397aa53 GIT binary patch literal 548 zcmZWn%}OId5Uw6GaRfaG(@i(bAv72iVv|g((I5-n#FKasL_|81js_=x^P@<}#oO-5 zSMa#7kKl1%z!%tO*)o}!gMIm`zptvkD!R{4E*cc*A3}c-`upwUryEcoi8A#7F1Z5$ zK7kFgxkeZvutMB*q)Kye=TM%*DsfLy!@Aj7D^P4Yf-gwGa3N4fFaQxSkG>O(0Eyb^ zdO9-6!)+Sf+rFU=(9)K}-A5DW7ML#r5mdc^mc+gXWl;#!VLtmu*gL6u$4i5yD}(JX z+a-(MxGCBKuos6i_LtBf@Y%?(df3dz>1v!0lRITb1_x5J+0>&=(%x{uU9GgL1K;_U zk&a`9Ym9T7sbu)1`ZhBilNo+Bk=gUe%9c#CB#$y#lii+;JE?44PvpS#Kc(5Ss1%v; zaBzpWK_17Z9b*Dk+L~PSlhdNT!nRygkVlS(mSrJ)+33M<>~ycc&W&8F4$=&*jI n_2Gey_QOa=U2_oWy>`=fwGJCP45OpH9T5ch9^zr=-Ga0E&LN~ zr_iBHB@|SZYRgzQRg8sDO>AOaS9KMXs4B2&EQ_EjfoT*KRBchkAj(jOI+P)_bFSaL z@A)}NpOda^h{~<6@4NRq=bm%!IrpCPy5;LzgG3;;q=C#{_~YLznursAI*3ycs^khm z5a91Mc>a_~MnrP%N3ZQ$X`ryVn^JHJ8-Y-tlFj=9qIl7#1S5f{8c}^}INs}P4MQs$ zZg2Mm+Lg}saBo{sjYNF0Kp^1HrIo(Ey?&T_f4`asK39`9WNdu;IXk1`5QG&08(oJ+ z3r01OW^(9>p=U+IsM^-TC_9d=ql;cc+~kSBo&OmJ1EH+!)L_oxH~axO9QV@sf4#As z!5PQhUGem#M@2fWjx>^=Km5pnO2as3lZJ;t$o;#Izc^CM2AH|G<0$0*sq-&A%5?>! zins^zFl@#~PTI)p-#<6)WN`4{1@WUD8pWf){X_0_e`>25%Vgs@_l7pFJC#UpSCaiI z{CdcpRP%W?>)zm2++KGgop)~th5}x9T*<4@8H{uUeSr>NAm|=|iOPf9?}!LOtUglH z-rnJ@k2Hq@!A@TwBxiQl(-cy6H%d+;6cUhm8g9Qc#faQXf{NtHdo~t&n ztXYZ8Y$fra(i)8d{SLL$*V@|}@I~7KQD18$5)8ycfruK2&0{5~u%BK&eby)#O|drN zLwk|J4*cY$H;e|LDGZDS7^WJ1>#=8yn&$&V7=RjmGOd*t2WTi7Vw%xuCRbaOA!PA$2 z$jJeY=3u&!ovS@xOIpa$CqEf7s;uF5%E!w>74V_^0NfuJAO7fJKGZBt!NCMzVc6*k za>z#B|KPW}PXX576W`Wk@Fv|w`MrI(C35*}e=HB1r8l7_aYuazHkgvaN7`QU-t^U> zM0ytt!5yJQlkf~Csjz|6yaGE*fsHpX;C-mR+nz{^JB0kQ-S%YW*w=(cy6wF6T`OvZ zbov3oz7w|$E_Uki-ybo$j3Q?sP@l1tY$Jz$_4IM09j=!(zbHELFAFlPBd#X%ia)kn zjosImf&G4G8n*u)B?)vqQvC5kfE)?exzto*(26N;lMf zLgM2Q<_{&<9>00(*O!drs#~aBn<*Gz;h2AZxbJnN{qZqK-c>KUWC1?J=8~LO{b_T_ z-GxX4@guOfP_uN1EHdk`!mN|U8JEnPY%;9H$-;}FX?QGxU|_dfCDy&vZ~&57B1DKW zYgW`^yuC?Li-6#YLRquOU}ze+EtWNm7r`(^k&Rf`N)ev9a6{Abfx;eB5#hvAzRc29IhanwB}R=*{n5B*vS^HIatY? 
zla=HfTJNxfPrK;g(~ebiT zQM0*3CSCYdom6u9yZbPX$LU-p9w)Vhx%NIKzgwFMuq!b}CGs$}a2h8zV%y{e*fzVo z>^^nxy%3fARRXcSW}P5cKo28J8KF;E4>DrUCBCZ`xTs5I;)&iwOhNQ*i4=4TFk(5L zJvQ!E(z{gT&7N0w<(5K~@{!9c`JBEgM^;T`_Na5MnTo3(_MOVug1M=_Qr;-Djp~DW zH62&ucO;VPH?@Nkvi;wY+cl5}KsfP)cRNU#)D?7YOK@XBY1#3l$dra1)p0c!%O?6z zA_gJ$X0jn{JsoMj0h8kC!1J02xxU&m8n%E?#)FpL$ zoL%kkqzR6t051sAJATqA4c3!Jf^H7Ru&vP zTHIaIW|(Kfd=)VduR+WcaGB$FgK9?~Pr2D7ZGop<`i((vf%y47Lhm&S;sR25ehs-z z@C-tx%1EIkbZ}~el|YT%Mf4U@EA2H>yNuL6rqm{Zn!xrDtj7?MD4pA+Z+Iqwjwrwf zkmp%doPTWk~f@j!Kq(K_Z=26?Q!zAJngn8>`6!FCcVNANZXa+ioK$j`dv8v)N zT1g`#UDjxYbjjv9R@EgR6_;BK!1%vrgzNWP9wKNCgMw|$MFCA*8zH}IJqbxXS=Rb3N8eVRSPm8#yWB$K5J}nXi z?4&3@EfVIM@M4tdUs0yVAOQ^VX$sQg%r9#)Ej>DO-#k?fzM17N^cMQ&(q1FUS@d78 zt;I-ivPSpK8pXNX^U0b8-1ATn;0~@s_k5)kZq6e;_xo26ScDJyef&1^`78n-rNF0F zaKEpyX)$=8T49O;rxEn*I*bDQA-Sp@2d)ZgiV6E`O|X*)J56CH0oJ5NJ%nTMpR5JN zv8<5ZLdBK#8U^w`m9z&HHwp>cV%`^ug`+D??0$ww;~td4^wp3|OP@Al4Tk{h62%&; z)13mp7QuCh>idu=4T+wyx-M(!Dr=)aXqJ*+*3gW?el7$XbN^S!F;VbL)X%Xldat7^ z!#l)8y-CEsL$RD_K@lHpm}9!}7RXT!HkeS}MwD|DWkN^k>fJD!3)iMbd_lh`@?|x%k?OUnQ~A~ zhBsrzeG+kBrMRaX=a6vGJ(mh*rb0Sn^PFz%lHY_g*^<(X3l@1;aR_UxUPdRGu~Hvm zQwvogHdp(#U~OsVz!!5>V6dPRapMnq(V>jrPzqa|jGDts?GvRzNT1x$HGf55{wH9L z^VSjOpJf!wdnlGsFZmt>7TG%c26wnk+5uDH=oEqvFhswNB(9)@MtpcCI|7L#m=fImmCk=mhFp!Tv4&n1_EnoYax^R!(_x{i{(UFw11U4{Bc z;a?y{t0jeBMiNJm#AiO@7d&ID=OIy)`r{r3e*Suv{c~N^8a#W*dO)>1$lkRh}`{2_WqJTn|Z*`UIUO{ zm#jn{NK*1GZ)H3!QSce~!a`Y2LR*gxP+qVy6hO*@*a>eKEg00i{6!1M9C2(b62Za? zpHQ|ycrLt9r{O@Ha!TogPRJ8(wb>CT zbZZr6a~y>-^P)p&QHi>g@^z88Zt;c^H|NEYuP#GrtZdE+a4|0sh|HyRJ{jK>S9igC z8E@|BP{V;(XCM*| xcebmswooh_>s8ug;nrw}+8Yjsg5gjo7HM4%uL{6BKG)$R?1gu%x4{3=_zw_Op~V0I literal 0 HcmV?d00001 diff --git a/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..f59c40dd67 --- /dev/null +++ b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json @@ -0,0 +1,3 @@ +{"add":{"path":"part-00000-a190be9e-e3df-439e-b366-06a863f51e99-c000.snappy.parquet","partitionValues":{},"size":976,"modificationTime":1674611458901,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"letter\":\"a\",\"int\":120,\"date\":\"1971-07-01\"},\"maxValues\":{\"letter\":\"c\",\"int\":667,\"date\":\"2018-02-01\"},\"nullCount\":{\"letter\":2,\"int\":0,\"date\":0}}"}} +{"remove":{"path":"part-00000-ad1a4bb7-07e8-4f40-b50b-49910d209e0c-c000.snappy.parquet","deletionTimestamp":1674611459307,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":965}} +{"commitInfo":{"timestamp":1674611459307,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"976"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"b08f5758-a8e9-4dd1-af7e-7b6e53928d7a"}} diff --git a/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..21a717332f --- /dev/null +++ b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json @@ -0,0 +1,3 @@ 
+{"add":{"path":"part-00000-70b1dcdf-0236-4f63-a072-124cdbafd8a0-c000.snappy.parquet","partitionValues":{},"size":1010,"modificationTime":1674611461541,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"letter\":\"a\",\"int\":93,\"date\":\"1975-06-01\"},\"maxValues\":{\"letter\":\"c\",\"int\":753,\"date\":\"2013-03-01\"},\"nullCount\":{\"letter\":1,\"int\":0,\"date\":0}}"}} +{"remove":{"path":"part-00000-a190be9e-e3df-439e-b366-06a863f51e99-c000.snappy.parquet","deletionTimestamp":1674611461982,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":976}} +{"commitInfo":{"timestamp":1674611461982,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":2,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"1010"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"0403bbaf-a6f2-4543-9e6c-bd068e76670f"}} diff --git a/crates/deltalake-core/tests/fs_common/mod.rs b/crates/deltalake-core/tests/fs_common/mod.rs index 61227ca46b..dc9ec2547a 100644 --- a/crates/deltalake-core/tests/fs_common/mod.rs +++ b/crates/deltalake-core/tests/fs_common/mod.rs @@ -1,9 +1,12 @@ use chrono::Utc; +use deltalake_core::kernel::{ + Action, Add, DataType, PrimitiveType, Remove, StructField, StructType, +}; use deltalake_core::operations::create::CreateBuilder; use deltalake_core::operations::transaction::commit; -use deltalake_core::protocol::{Action, Add, DeltaOperation, Remove, SaveMode}; +use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::{DeltaObjectStore, GetResult, ObjectStoreResult}; -use deltalake_core::{DeltaTable, Schema, SchemaDataType, SchemaField}; +use deltalake_core::DeltaTable; use object_store::path::Path as StorePath; use object_store::ObjectStore; use serde_json::Value; @@ -36,14 +39,14 @@ pub async fn create_table_from_json( std::fs::create_dir_all(path).unwrap(); std::fs::remove_dir_all(path).unwrap(); std::fs::create_dir_all(path).unwrap(); - let schema: Schema = serde_json::from_value(schema).unwrap(); + let schema: StructType = serde_json::from_value(schema).unwrap(); let config: HashMap> = serde_json::from_value(config).unwrap(); create_test_table(path, schema, partition_columns, config).await } pub async fn create_test_table( path: &str, - schema: Schema, + schema: StructType, partition_columns: Vec<&str>, config: HashMap>, ) -> DeltaTable { @@ -51,7 +54,7 @@ pub async fn create_test_table( .with_location(path) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_partition_columns(partition_columns) .with_configuration(config) .await @@ -66,11 +69,10 @@ pub async fn create_table( fs::create_dir_all(&log_dir).unwrap(); cleanup_dir_except(log_dir, vec![]); - let schema = Schema::new(vec![SchemaField::new( + let schema = StructType::new(vec![StructField::new( "id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), )]); create_test_table(path, schema, Vec::new(), config.unwrap_or_default()).await @@ -88,6 +90,8 @@ pub fn add(offset_millis: i64) -> Add { stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, } } @@ -97,13 +101,13 @@ pub async fn commit_add(table: &mut DeltaTable, add: &Add) -> i64 { partition_by: None, predicate: None, }; - commit_actions(table, 
vec![Action::add(add.clone())], operation).await + commit_actions(table, vec![Action::Add(add.clone())], operation).await } pub async fn commit_removes(table: &mut DeltaTable, removes: Vec<&Remove>) -> i64 { let vec = removes .iter() - .map(|r| Action::remove((*r).clone())) + .map(|r| Action::Remove((*r).clone())) .collect(); let operation = DeltaOperation::Delete { predicate: None }; commit_actions(table, vec, operation).await diff --git a/crates/deltalake-core/tests/integration_checkpoint.rs b/crates/deltalake-core/tests/integration_checkpoint.rs index 7b2f9ea026..9b5b0a73ff 100644 --- a/crates/deltalake-core/tests/integration_checkpoint.rs +++ b/crates/deltalake-core/tests/integration_checkpoint.rs @@ -2,11 +2,10 @@ use chrono::Utc; use deltalake_core::checkpoints::{cleanup_expired_logs_for, create_checkpoint}; +use deltalake_core::kernel::{DataType, PrimitiveType}; use deltalake_core::test_utils::{IntegrationContext, StorageIntegration, TestResult}; use deltalake_core::writer::{DeltaWriter, JsonWriter}; -use deltalake_core::{ - errors::DeltaResult, DeltaOps, DeltaTableBuilder, ObjectStore, SchemaDataType, -}; +use deltalake_core::{errors::DeltaResult, DeltaOps, DeltaTableBuilder, ObjectStore}; use object_store::path::Path; use serde_json::json; use serial_test::serial; @@ -121,7 +120,7 @@ async fn test_issue_1420_cleanup_expired_logs_for() -> DeltaResult<()> { .create() .with_column( "id", - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), false, None, ) diff --git a/crates/deltalake-core/tests/integration_concurrent_writes.rs b/crates/deltalake-core/tests/integration_concurrent_writes.rs index 0a6470d5d0..bef44d0693 100644 --- a/crates/deltalake-core/tests/integration_concurrent_writes.rs +++ b/crates/deltalake-core/tests/integration_concurrent_writes.rs @@ -1,10 +1,11 @@ #![cfg(feature = "integration_test")] +use deltalake_core::kernel::{Action, Add, DataType, PrimitiveType, StructField, StructType}; use deltalake_core::operations::transaction::commit; use deltalake_core::operations::DeltaOps; -use deltalake_core::protocol::{Action, Add, DeltaOperation, SaveMode}; +use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::test_utils::{IntegrationContext, StorageIntegration, TestResult, TestTables}; -use deltalake_core::{DeltaTable, DeltaTableBuilder, Schema, SchemaDataType, SchemaField}; +use deltalake_core::{DeltaTable, DeltaTableBuilder}; use std::collections::HashMap; use std::future::Future; use std::iter::FromIterator; @@ -49,11 +50,10 @@ async fn test_concurrent_writes(integration: StorageIntegration) -> TestResult { async fn prepare_table( context: &IntegrationContext, ) -> Result<(DeltaTable, String), Box> { - let schema = Schema::new(vec![SchemaField::new( + let schema = StructType::new(vec![StructField::new( "Id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), )]); let table_uri = context.uri_for_table(TestTables::Custom("concurrent_workers".into())); @@ -64,7 +64,7 @@ async fn prepare_table( let table = DeltaOps(table) .create() - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await?; assert_eq!(0, table.version()); @@ -153,7 +153,7 @@ impl Worker { partition_by: None, predicate: None, }; - let actions = vec![Action::add(Add { + let actions = vec![Action::Add(Add { path: format!("{}.parquet", name), size: 396, partition_values: HashMap::new(), @@ -164,6 +164,8 @@ impl Worker { 
stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, })]; let version = commit( self.table.object_store().as_ref(), diff --git a/crates/deltalake-core/tests/integration_datafusion.rs b/crates/deltalake-core/tests/integration_datafusion.rs index 4978ea2a11..3476de6839 100644 --- a/crates/deltalake-core/tests/integration_datafusion.rs +++ b/crates/deltalake-core/tests/integration_datafusion.rs @@ -10,11 +10,10 @@ use std::path::PathBuf; use std::sync::Arc; use arrow::array::*; -use arrow::datatypes::{ +use arrow::record_batch::RecordBatch; +use arrow_schema::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, TimeUnit, }; -use arrow::record_batch::RecordBatch; -use arrow_schema::{DataType, Field}; use datafusion::assert_batches_sorted_eq; use datafusion::datasource::physical_plan::ParquetExec; use datafusion::datasource::TableProvider; @@ -32,20 +31,22 @@ use datafusion_proto::bytes::{ use url::Url; use deltalake_core::delta_datafusion::{DeltaPhysicalCodec, DeltaScan}; +use deltalake_core::kernel::{DataType, MapType, PrimitiveType, StructField, StructType}; use deltalake_core::operations::create::CreateBuilder; use deltalake_core::protocol::SaveMode; use deltalake_core::storage::DeltaObjectStore; use deltalake_core::writer::{DeltaWriter, RecordBatchWriter}; use deltalake_core::{ + open_table, operations::{write::WriteBuilder, DeltaOps}, - DeltaTable, DeltaTableError, Schema, SchemaDataType, SchemaField, + DeltaTable, DeltaTableError, }; use std::error::Error; mod common; mod local { - use deltalake::{writer::JsonWriter, SchemaTypeMap}; + use deltalake_core::writer::JsonWriter; use super::*; #[tokio::test] @@ -96,14 +97,14 @@ mod local { let table_dir = tempfile::tempdir().unwrap(); let table_path = table_dir.path(); let table_uri = table_path.to_str().unwrap().to_string(); - let table_schema: Schema = batches[0].schema().try_into().unwrap(); + let table_schema: StructType = batches[0].schema().try_into().unwrap(); let mut table = DeltaOps::try_from_uri(table_uri) .await .unwrap() .create() .with_save_mode(SaveMode::Ignore) - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions) .await .unwrap(); @@ -153,7 +154,7 @@ mod local { #[tokio::test] async fn test_datafusion_simple_query_partitioned() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/delta-0.8.0-partitioned") + let table = open_table("./tests/data/delta-0.8.0-partitioned") .await .unwrap(); ctx.register_table("demo", Arc::new(table))?; @@ -182,7 +183,7 @@ mod local { let source_scan_bytes = { let ctx = SessionContext::new(); let state = ctx.state(); - let source_table = deltalake::open_table("./tests/data/delta-0.8.0-date").await?; + let source_table = open_table("./tests/data/delta-0.8.0-date").await?; let source_scan = source_table.scan(&state, None, &[], None).await?; physical_plan_to_bytes_with_extension_codec(source_scan, &DeltaPhysicalCodec {})? 
}; @@ -195,9 +196,9 @@ mod local { &ctx, &DeltaPhysicalCodec {}, )?; - let fields = Schema::try_from(source_scan.schema()) + let fields = StructType::try_from(source_scan.schema()) .unwrap() - .get_fields() + .fields() .clone(); // Create target Delta Table @@ -262,9 +263,7 @@ mod local { #[tokio::test] async fn test_datafusion_date_column() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/delta-0.8.0-date") - .await - .unwrap(); + let table = open_table("./tests/data/delta-0.8.0-date").await.unwrap(); ctx.register_table("dates", Arc::new(table))?; let batches = ctx @@ -283,9 +282,7 @@ mod local { #[tokio::test] async fn test_datafusion_stats() -> Result<()> { - let table = deltalake::open_table("./tests/data/delta-0.8.0") - .await - .unwrap(); + let table = open_table("./tests/data/delta-0.8.0").await.unwrap(); let statistics = table.state.datafusion_table_statistics(); assert_eq!(statistics.num_rows, Some(4),); @@ -735,7 +732,7 @@ mod local { assert_eq!(metrics.num_scanned_files(), 1); // Ensure that tables without stats and partition columns can be pruned for just partitions - // let table = deltalake::open_table("./tests/data/delta-0.8.0-null-partition").await?; + // let table = open_table("./tests/data/delta-0.8.0-null-partition").await?; /* // Logically this should prune. See above @@ -765,7 +762,7 @@ mod local { #[tokio::test] async fn test_datafusion_partitioned_types() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/delta-2.2.0-partitioned-types") + let table = open_table("./tests/data/delta-2.2.0-partitioned-types") .await .unwrap(); ctx.register_table("demo", Arc::new(table))?; @@ -814,7 +811,7 @@ mod local { #[tokio::test] async fn test_datafusion_scan_timestamps() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/table_with_edge_timestamps") + let table = open_table("./tests/data/table_with_edge_timestamps") .await .unwrap(); ctx.register_table("demo", Arc::new(table))?; @@ -838,9 +835,7 @@ mod local { #[tokio::test] async fn test_issue_1292_datafusion_sql_projection() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/http_requests") - .await - .unwrap(); + let table = open_table("./tests/data/http_requests").await.unwrap(); ctx.register_table("http_requests", Arc::new(table))?; let batches = ctx @@ -869,9 +864,7 @@ mod local { #[tokio::test] async fn test_issue_1291_datafusion_sql_partitioned_data() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/http_requests") - .await - .unwrap(); + let table = open_table("./tests/data/http_requests").await.unwrap(); ctx.register_table("http_requests", Arc::new(table))?; let batches = ctx @@ -902,9 +895,7 @@ mod local { #[tokio::test] async fn test_issue_1374() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/issue_1374") - .await - .unwrap(); + let table = open_table("./tests/data/issue_1374").await.unwrap(); ctx.register_table("t", Arc::new(table))?; let batches = ctx @@ -939,24 +930,24 @@ mod local { #[tokio::test] async fn test_issue_1619_parquet_panic_using_map_type() -> Result<()> { let _ = tokio::fs::remove_dir_all("./tests/data/issue-1619").await; - let fields: Vec = vec![SchemaField::new( + let fields: Vec = vec![StructField::new( "metadata".to_string(), - SchemaDataType::map(SchemaTypeMap::new( - 
Box::new(SchemaDataType::primitive("string".to_string())), - Box::new(SchemaDataType::primitive("string".to_string())), + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::String), + DataType::Primitive(PrimitiveType::String), true, - )), + ))), true, - HashMap::new(), )]; - let schema = deltalake::Schema::new(fields); - let table = deltalake::DeltaTableBuilder::from_uri("./tests/data/issue-1619").build()?; + let schema = StructType::new(fields); + let table = + deltalake_core::DeltaTableBuilder::from_uri("./tests/data/issue-1619").build()?; let _ = DeltaOps::from(table) .create() - .with_columns(schema.get_fields().to_owned()) + .with_columns(schema.fields().to_owned()) .await?; - let mut table = deltalake::open_table("./tests/data/issue-1619").await?; + let mut table = open_table("./tests/data/issue-1619").await?; let mut writer = JsonWriter::for_table(&table).unwrap(); writer @@ -1082,17 +1073,15 @@ mod date_partitions { async fn setup_test() -> Result> { let columns = vec![ - SchemaField::new( + StructField::new( "id".to_owned(), - SchemaDataType::primitive("integer".to_owned()), + DataType::Primitive(PrimitiveType::Integer), false, - HashMap::new(), ), - SchemaField::new( + StructField::new( "date".to_owned(), - SchemaDataType::primitive("date".to_owned()), + DataType::Primitive(PrimitiveType::Date), false, - HashMap::new(), ), ]; @@ -1114,8 +1103,8 @@ mod date_partitions { Ok(RecordBatch::try_new( Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("date", DataType::Date32, false), + ArrowField::new("id", ArrowDataType::Int32, false), + ArrowField::new("date", ArrowDataType::Date32, false), ])), vec![Arc::new(ids_array), Arc::new(date_array)], )?) diff --git a/crates/deltalake-core/tests/integration_read.rs b/crates/deltalake-core/tests/integration_read.rs index 3056a3263b..0e17d34397 100644 --- a/crates/deltalake-core/tests/integration_read.rs +++ b/crates/deltalake-core/tests/integration_read.rs @@ -60,7 +60,7 @@ mod local { assert_eq!(table.get_files(), vec![Path::from(a.path.clone())]); // Remove added file. 
- let r = deltalake::protocol::Remove { + let r = deltalake_core::kernel::Remove { path: a.path.clone(), deletion_timestamp: Some(chrono::Utc::now().timestamp_millis()), data_change: false, @@ -69,6 +69,8 @@ mod local { size: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, }; assert_eq!(2, fs_common::commit_removes(&mut table, vec![&r]).await); @@ -210,12 +212,17 @@ async fn read_simple_table(integration: &IntegrationContext) -> TestResult { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 31); - assert!(tombstones.contains(&deltalake::protocol::Remove { + assert!(tombstones.contains(&deltalake_core::kernel::Remove { path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1587968596250), data_change: true, extended_file_metadata: None, - ..Default::default() + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + partition_values: None, + tags: None, })); Ok(()) @@ -246,11 +253,17 @@ async fn read_simple_table_with_version(integration: &IntegrationContext) -> Tes ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 29); - assert!(tombstones.contains(&deltalake::protocol::Remove { + assert!(tombstones.contains(&deltalake_core::kernel::Remove { path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1587968596250), data_change: true, - ..Default::default() + tags: None, + partition_values: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + deletion_vector: None, + extended_file_metadata: None, })); Ok(()) @@ -291,7 +304,7 @@ mod gcs { #[tokio::test] async fn test_gcs_simple() { let bucket = std::env::var("GCS_DELTA_BUCKET").unwrap(); - let table = deltalake::open_table(format!("gs://{}/simple_table", bucket).as_str()) + let table = deltalake_core::open_table(format!("gs://{}/simple_table", bucket).as_str()) .await .unwrap(); assert_eq!(table.version(), 4); @@ -309,11 +322,17 @@ mod gcs { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 31); - assert!(tombstones.contains(&deltalake::protocol::Remove { + assert!(tombstones.contains(&deltalake_core::kernel::Remove { path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1587968596250), data_change: true, - ..Default::default() + extended_file_metadata: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + partition_values: None, + tags: None, })); } } diff --git a/crates/deltalake-core/tests/read_delta_partitions_test.rs b/crates/deltalake-core/tests/read_delta_partitions_test.rs index c579e242a6..514cdefde8 100644 --- a/crates/deltalake-core/tests/read_delta_partitions_test.rs +++ b/crates/deltalake-core/tests/read_delta_partitions_test.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::convert::TryFrom; -use deltalake_core::schema::SchemaDataType; +use deltalake_core::kernel::{DataType, PrimitiveType}; #[allow(dead_code)] mod fs_common; @@ -50,7 +50,7 @@ fn test_match_partition() { key: "month".to_string(), value: deltalake_core::PartitionValue::Equal("12".to_string()), }; - let string_type = SchemaDataType::primitive(String::from("string")); + let string_type = DataType::Primitive(PrimitiveType::String); assert!(!partition_year_2020_filter.match_partition(&partition_2021, &string_type)); 
assert!(partition_year_2020_filter.match_partition(&partition_2020, &string_type)); @@ -71,11 +71,13 @@ fn test_match_filters() { }, ]; - let string_type = SchemaDataType::primitive(String::from("string")); - let partition_data_types: HashMap<&str, &SchemaDataType> = - vec![("year", &string_type), ("month", &string_type)] - .into_iter() - .collect(); + let string_type = DataType::Primitive(PrimitiveType::String); + let partition_data_types: HashMap<&String, &DataType> = vec![ + (&partitions[0].key, &string_type), + (&partitions[1].key, &string_type), + ] + .into_iter() + .collect(); let valid_filters = deltalake_core::PartitionFilter { key: "year".to_string(), @@ -101,7 +103,7 @@ fn test_match_filters() { #[cfg(all(feature = "arrow", feature = "parquet"))] #[tokio::test] async fn read_null_partitions_from_checkpoint() { - use deltalake_core::protocol::Add; + use deltalake_core::kernel::Add; use maplit::hashmap; use serde_json::json; diff --git a/crates/deltalake-core/tests/serde/checkpoint_schema.json b/crates/deltalake-core/tests/serde/checkpoint_schema.json new file mode 100644 index 0000000000..9e397cd978 --- /dev/null +++ b/crates/deltalake-core/tests/serde/checkpoint_schema.json @@ -0,0 +1,267 @@ +{ + "type": "struct", + "fields": [ + { + "name": "txn", + "type": { + "type": "struct", + "fields": [ + { + "name": "appId", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "version", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "lastUpdated", + "type": "long", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "add", + "type": { + "type": "struct", + "fields": [ + { + "name": "path", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "partitionValues", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "size", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "modificationTime", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "dataChange", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "tags", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "stats", + "type": "string", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "remove", + "type": { + "type": "struct", + "fields": [ + { + "name": "path", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "deletionTimestamp", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "dataChange", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "extendedFileMetadata", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "partitionValues", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "size", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "tags", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "metaData", + "type": { + "type": "struct", + "fields": [ + { 
+ "name": "id", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "description", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "format", + "type": { + "type": "struct", + "fields": [ + { + "name": "provider", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "options", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "schemaString", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "partitionColumns", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "configuration", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "createdTime", + "type": "long", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "protocol", + "type": { + "type": "struct", + "fields": [ + { + "name": "minReaderVersion", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "minWriterVersion", + "type": "integer", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + } + ] +} diff --git a/crates/deltalake-core/tests/serde/schema.json b/crates/deltalake-core/tests/serde/schema.json new file mode 100644 index 0000000000..710a9e5080 --- /dev/null +++ b/crates/deltalake-core/tests/serde/schema.json @@ -0,0 +1,68 @@ +{ + "type": "struct", + "fields": [ + { + "name": "a", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "b", + "type": { + "type": "struct", + "fields": [ + { + "name": "d", + "type": "integer", + "nullable": false, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "c", + "type": { + "type": "array", + "elementType": "integer", + "containsNull": false + }, + "nullable": true, + "metadata": {} + }, + { + "name": "e", + "type": { + "type": "array", + "elementType": { + "type": "struct", + "fields": [ + { + "name": "d", + "type": "integer", + "nullable": false, + "metadata": {} + } + ] + }, + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "f", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + ] +} diff --git a/crates/deltalake/examples/basic_operations.rs b/crates/deltalake/examples/basic_operations.rs index 13a1c60f73..e697e4cf53 100644 --- a/crates/deltalake/examples/basic_operations.rs +++ b/crates/deltalake/examples/basic_operations.rs @@ -1,47 +1,45 @@ use deltalake::arrow::{ array::{Int32Array, StringArray, TimestampMicrosecondArray}, - datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}, + datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema, TimeUnit}, record_batch::RecordBatch, }; +use deltalake::kernel::{DataType, PrimitiveType, StructField}; use deltalake::operations::collect_sendable_stream; use deltalake::parquet::{ basic::{Compression, ZstdLevel}, file::properties::WriterProperties, }; -use deltalake::{protocol::SaveMode, DeltaOps, SchemaDataType, SchemaField}; +use deltalake::{protocol::SaveMode, DeltaOps}; use 
std::sync::Arc; -fn get_table_columns() -> Vec { +fn get_table_columns() -> Vec { vec![ - SchemaField::new( + StructField::new( String::from("int"), - SchemaDataType::primitive(String::from("integer")), + DataType::Primitive(PrimitiveType::Integer), false, - Default::default(), ), - SchemaField::new( + StructField::new( String::from("string"), - SchemaDataType::primitive(String::from("string")), + DataType::Primitive(PrimitiveType::String), true, - Default::default(), ), - SchemaField::new( + StructField::new( String::from("timestamp"), - SchemaDataType::primitive(String::from("timestamp")), + DataType::Primitive(PrimitiveType::Timestamp), true, - Default::default(), ), ] } fn get_table_batches() -> RecordBatch { let schema = Arc::new(ArrowSchema::new(vec![ - Field::new("int", DataType::Int32, false), - Field::new("string", DataType::Utf8, true), + Field::new("int", ArrowDataType::Int32, false), + Field::new("string", ArrowDataType::Utf8, true), Field::new( "timestamp", - DataType::Timestamp(TimeUnit::Microsecond, None), + ArrowDataType::Timestamp(TimeUnit::Microsecond, None), true, ), ])); diff --git a/crates/deltalake/examples/recordbatch-writer.rs b/crates/deltalake/examples/recordbatch-writer.rs index 1347da1baa..e7fd7125cd 100644 --- a/crates/deltalake/examples/recordbatch-writer.rs +++ b/crates/deltalake/examples/recordbatch-writer.rs @@ -6,11 +6,11 @@ * This example was originally posted by @rtyler in: * */ - use chrono::prelude::*; use deltalake::arrow::array::*; use deltalake::arrow::record_batch::RecordBatch; use deltalake::errors::DeltaTableError; +use deltalake::kernel::{DataType, PrimitiveType, StructField, StructType}; use deltalake::parquet::{ basic::{Compression, ZstdLevel}, file::properties::WriterProperties, @@ -19,8 +19,6 @@ use deltalake::writer::{DeltaWriter, RecordBatchWriter}; use deltalake::Path; use deltalake::*; use log::*; - -use std::collections::HashMap; use std::sync::Arc; /* @@ -86,31 +84,27 @@ struct WeatherRecord { } impl WeatherRecord { - fn columns() -> Vec { + fn columns() -> Vec { vec![ - SchemaField::new( + StructField::new( "timestamp".to_string(), - SchemaDataType::primitive("timestamp".to_string()), + DataType::Primitive(PrimitiveType::Timestamp), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "temp".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "lat".to_string(), - SchemaDataType::primitive("double".to_string()), + DataType::Primitive(PrimitiveType::Float), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "long".to_string(), - SchemaDataType::primitive("double".to_string()), + DataType::Primitive(PrimitiveType::Float), true, - HashMap::new(), ), ] } @@ -167,7 +161,7 @@ fn convert_to_batch(table: &DeltaTable, records: &Vec) -> RecordB let metadata = table .get_metadata() .expect("Failed to get metadata for the table"); - let arrow_schema = >::try_from( + let arrow_schema = >::try_from( &metadata.schema.clone(), ) .expect("Failed to convert to arrow schema"); diff --git a/python/src/error.rs b/python/src/error.rs index 1b5a9f6839..f72c6361d2 100644 --- a/python/src/error.rs +++ b/python/src/error.rs @@ -73,6 +73,7 @@ fn checkpoint_to_py(err: ProtocolError) -> PyErr { ProtocolError::ParquetParseError { source } => PyIOError::new_err(source.to_string()), ProtocolError::IO { source } => PyIOError::new_err(source.to_string()), ProtocolError::Generic(msg) => DeltaError::new_err(msg), + 
ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()), } } diff --git a/python/src/lib.rs b/python/src/lib.rs index cc6b2202c3..923a06d159 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -26,6 +26,7 @@ use deltalake::datafusion::datasource::provider::TableProvider; use deltalake::datafusion::prelude::SessionContext; use deltalake::delta_datafusion::DeltaDataChecker; use deltalake::errors::DeltaTableError; +use deltalake::kernel::{Action, Add, Invariant, Metadata, Remove, StructType}; use deltalake::operations::delete::DeleteBuilder; use deltalake::operations::filesystem_check::FileSystemCheckBuilder; use deltalake::operations::merge::MergeBuilder; @@ -36,11 +37,9 @@ use deltalake::operations::update::UpdateBuilder; use deltalake::operations::vacuum::VacuumBuilder; use deltalake::parquet::file::properties::WriterProperties; use deltalake::partitions::PartitionFilter; -use deltalake::protocol::{ - self, Action, ColumnCountStat, ColumnValueStat, DeltaOperation, SaveMode, Stats, -}; +use deltalake::protocol::{ColumnCountStat, ColumnValueStat, DeltaOperation, SaveMode, Stats}; +use deltalake::DeltaOps; use deltalake::DeltaTableBuilder; -use deltalake::{DeltaOps, Invariant, Schema}; use pyo3::exceptions::{PyIOError, PyRuntimeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyFrozenSet, PyType}; @@ -262,7 +261,7 @@ impl RawDeltaTable { #[getter] pub fn schema(&self, py: Python) -> PyResult { - let schema: &Schema = self._table.get_schema().map_err(PythonError::from)?; + let schema: &StructType = self._table.get_schema().map_err(PythonError::from)?; schema_to_pyobject(schema, py) } @@ -688,9 +687,9 @@ impl RawDeltaTable { ._table .schema() .ok_or_else(|| DeltaProtocolError::new_err("table does not yet have a schema"))? - .get_fields() + .fields() .iter() - .map(|field| field.get_name()) + .map(|field| field.name().as_str()) .collect(); let partition_columns: HashSet<&str> = self ._table @@ -760,13 +759,13 @@ impl RawDeltaTable { partitions_filters: Option>, ) -> PyResult<()> { let mode = save_mode_from_str(mode)?; - let schema: Schema = (&schema.0).try_into().map_err(PythonError::from)?; + let schema: StructType = (&schema.0).try_into().map_err(PythonError::from)?; let existing_schema = self._table.get_schema().map_err(PythonError::from)?; - let mut actions: Vec = add_actions + let mut actions: Vec = add_actions .iter() - .map(|add| Action::add(add.into())) + .map(|add| Action::Add(add.into())) .collect(); match mode { @@ -782,7 +781,7 @@ impl RawDeltaTable { .map_err(PythonError::from)?; for old_add in add_actions { - let remove_action = Action::remove(protocol::Remove { + let remove_action = Action::Remove(Remove { path: old_add.path.clone(), deletion_timestamp: Some(current_timestamp()), data_change: true, @@ -791,6 +790,8 @@ impl RawDeltaTable { size: Some(old_add.size), deletion_vector: old_add.deletion_vector.clone(), tags: old_add.tags.clone(), + base_row_id: old_add.base_row_id, + default_row_commit_version: old_add.default_row_commit_version, }); actions.push(remove_action); } @@ -803,9 +804,9 @@ impl RawDeltaTable { .map_err(PythonError::from)? 
.clone(); metadata.schema = schema; - let metadata_action = protocol::MetaData::try_from(metadata) + let metadata_action = Metadata::try_from(metadata) .map_err(|_| PyValueError::new_err("Failed to reparse metadata"))?; - actions.push(Action::metaData(metadata_action)); + actions.push(Action::Metadata(metadata_action)); } } _ => { @@ -1108,9 +1109,9 @@ pub struct PyAddAction { stats: Option, } -impl From<&PyAddAction> for protocol::Add { +impl From<&PyAddAction> for Add { fn from(action: &PyAddAction) -> Self { - protocol::Add { + Add { path: action.path.clone(), size: action.size, partition_values: action.partition_values.clone(), @@ -1121,6 +1122,8 @@ impl From<&PyAddAction> for protocol::Add { stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, } } } @@ -1143,13 +1146,13 @@ fn write_new_deltalake( .build() .map_err(PythonError::from)?; - let schema: Schema = (&schema.0).try_into().map_err(PythonError::from)?; + let schema: StructType = (&schema.0).try_into().map_err(PythonError::from)?; let mut builder = DeltaOps(table) .create() - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_partition_columns(partition_by) - .with_actions(add_actions.iter().map(|add| Action::add(add.into()))); + .with_actions(add_actions.iter().map(|add| Action::Add(add.into()))); if let Some(name) = &name { builder = builder.with_table_name(name); diff --git a/python/src/schema.rs b/python/src/schema.rs index 77e5f0d4da..c56010f131 100644 --- a/python/src/schema.rs +++ b/python/src/schema.rs @@ -6,15 +6,14 @@ use deltalake::arrow::datatypes::{ }; use deltalake::arrow::error::ArrowError; use deltalake::arrow::pyarrow::PyArrowType; -use deltalake::schema::{ - Schema, SchemaDataType, SchemaField, SchemaTypeArray, SchemaTypeMap, SchemaTypeStruct, +use deltalake::kernel::{ + ArrayType as DeltaArrayType, DataType, MapType as DeltaMapType, PrimitiveType as DeltaPrimitve, + StructField, StructType as DeltaStructType, }; -use lazy_static::lazy_static; use pyo3::exceptions::{PyException, PyNotImplementedError, PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::IntoPyDict; use pyo3::{PyRef, PyResult}; -use regex::Regex; use std::collections::HashMap; // PyO3 doesn't yet support converting classes with inheritance with Python @@ -23,55 +22,30 @@ use std::collections::HashMap; // See: https://github.com/PyO3/pyo3/issues/1836 // Decimal is separate special case, since it has parameters -const VALID_PRIMITIVE_TYPES: [&str; 11] = [ - "string", - "long", - "integer", - "short", - "byte", - "float", - "double", - "boolean", - "binary", - "date", - "timestamp", -]; - -fn try_parse_decimal_type(data_type: &str) -> Option<(usize, usize)> { - lazy_static! 
{ - static ref DECIMAL_REGEX: Regex = Regex::new(r"\((\d{1,2}),(\d{1,2})\)").unwrap(); - } - let extract = DECIMAL_REGEX.captures(data_type)?; - let precision = extract - .get(1) - .and_then(|v| v.as_str().parse::().ok())?; - let scale = extract - .get(2) - .and_then(|v| v.as_str().parse::().ok())?; - Some((precision, scale)) -} -fn schema_type_to_python(schema_type: SchemaDataType, py: Python) -> PyResult { +fn schema_type_to_python(schema_type: DataType, py: Python) -> PyResult { match schema_type { - SchemaDataType::primitive(data_type) => Ok((PrimitiveType::new(data_type)?).into_py(py)), - SchemaDataType::array(array_type) => { - let array_type: ArrayType = array_type.into(); + DataType::Primitive(data_type) => { + Ok((PrimitiveType::new(data_type.to_string())?).into_py(py)) + } + DataType::Array(array_type) => { + let array_type: ArrayType = (*array_type).into(); Ok(array_type.into_py(py)) } - SchemaDataType::map(map_type) => { - let map_type: MapType = map_type.into(); + DataType::Map(map_type) => { + let map_type: MapType = (*map_type).into(); Ok(map_type.into_py(py)) } - SchemaDataType::r#struct(struct_type) => { - let struct_type: StructType = struct_type.into(); + DataType::Struct(struct_type) => { + let struct_type: StructType = (*struct_type).into(); Ok(struct_type.into_py(py)) } } } -fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { +fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { if let Ok(data_type) = ob.extract::(py) { - return Ok(SchemaDataType::primitive(data_type.inner_type)); + return Ok(DataType::Primitive(data_type.inner_type)); } if let Ok(array_type) = ob.extract::(py) { return Ok(array_type.into()); @@ -85,7 +59,7 @@ fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { if let Ok(raw_primitive) = ob.extract::(py) { // Pass through PrimitiveType::new() to do validation return PrimitiveType::new(raw_primitive) - .map(|data_type| SchemaDataType::primitive(data_type.inner_type)); + .map(|data_type| DataType::Primitive(data_type.inner_type)); } Err(PyValueError::new_err("Invalid data type")) } @@ -93,14 +67,14 @@ fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct PrimitiveType { - inner_type: String, + inner_type: DeltaPrimitve, } -impl TryFrom for PrimitiveType { +impl TryFrom for PrimitiveType { type Error = PyErr; - fn try_from(value: SchemaDataType) -> PyResult { + fn try_from(value: DataType) -> PyResult { match value { - SchemaDataType::primitive(type_name) => Self::new(type_name), + DataType::Primitive(type_name) => Self::new(type_name.to_string()), _ => Err(PyTypeError::new_err("Type is not primitive")), } } @@ -111,34 +85,41 @@ impl PrimitiveType { #[new] #[pyo3(signature = (data_type))] fn new(data_type: String) -> PyResult { - if data_type.starts_with("decimal") { - if try_parse_decimal_type(&data_type).is_none() { - Err(PyValueError::new_err(format!( - "invalid decimal type: {data_type}" - ))) - } else { - Ok(Self { - inner_type: data_type, - }) - } - } else if !VALID_PRIMITIVE_TYPES - .iter() - .any(|&valid| data_type == valid) - { - Err(PyValueError::new_err(format!( - "data_type must be one of decimal(, ), {}.", - VALID_PRIMITIVE_TYPES.join(", ") - ))) - } else { - Ok(Self { - inner_type: data_type, - }) - } + let data_type: DeltaPrimitve = serde_json::from_str(&format!("\"{data_type}\"")) + .map_err(|_| PyValueError::new_err(format!("invalid type string: {data_type}")))?; + + Ok(Self { + inner_type: data_type, + }) + + // if 
data_type.starts_with("decimal") { + // if try_parse_decimal_type(&data_type).is_none() { + // Err(PyValueError::new_err(format!( + // "invalid decimal type: {data_type}" + // ))) + // } else { + // Ok(Self { + // inner_type: data_type, + // }) + // } + // } else if !VALID_PRIMITIVE_TYPES + // .iter() + // .any(|&valid| data_type == valid) + // { + // Err(PyValueError::new_err(format!( + // "data_type must be one of decimal(, ), {}.", + // VALID_PRIMITIVE_TYPES.join(", ") + // ))) + // } else { + // Ok(Self { + // inner_type: data_type, + // }) + // } } #[getter] fn get_type(&self) -> PyResult { - Ok(self.inner_type.clone()) + Ok(self.inner_type.to_string()) } fn __richcmp__(&self, other: PrimitiveType, cmp: pyo3::basic::CompareOp) -> PyResult { @@ -157,14 +138,14 @@ impl PrimitiveType { #[pyo3(text_signature = "($self)")] fn to_json(&self) -> PyResult { - let inner_type = SchemaDataType::primitive(self.inner_type.clone()); + let inner_type = DataType::Primitive(self.inner_type.clone()); serde_json::to_string(&inner_type).map_err(|err| PyException::new_err(err.to_string())) } #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { - let data_type: SchemaDataType = serde_json::from_str(&type_json) + let data_type: DataType = serde_json::from_str(&type_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; data_type.try_into() @@ -172,7 +153,7 @@ impl PrimitiveType { #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { - let inner_type = SchemaDataType::primitive(self.inner_type.clone()); + let inner_type = DataType::Primitive(self.inner_type.clone()); Ok(PyArrowType((&inner_type).try_into().map_err( |err: ArrowError| PyException::new_err(err.to_string()), )?)) @@ -181,7 +162,7 @@ impl PrimitiveType { #[pyo3(text_signature = "(data_type)")] #[staticmethod] fn from_pyarrow(data_type: PyArrowType) -> PyResult { - let inner_type: SchemaDataType = (&data_type.0) + let inner_type: DataType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -192,26 +173,28 @@ impl PrimitiveType { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct ArrayType { - inner_type: SchemaTypeArray, + inner_type: DeltaArrayType, } -impl From for ArrayType { - fn from(inner_type: SchemaTypeArray) -> Self { +impl From for ArrayType { + fn from(inner_type: DeltaArrayType) -> Self { Self { inner_type } } } -impl From for SchemaDataType { - fn from(arr: ArrayType) -> SchemaDataType { - SchemaDataType::array(arr.inner_type) +impl From for DataType { + fn from(arr: ArrayType) -> DataType { + DataType::Array(Box::new(arr.inner_type)) } } -impl TryFrom for ArrayType { +impl TryFrom for ArrayType { type Error = PyErr; - fn try_from(value: SchemaDataType) -> PyResult { + fn try_from(value: DataType) -> PyResult { match value { - SchemaDataType::array(inner_type) => Ok(Self { inner_type }), + DataType::Array(inner_type) => Ok(Self { + inner_type: *inner_type, + }), _ => Err(PyTypeError::new_err("Type is not an array")), } } @@ -222,18 +205,15 @@ impl ArrayType { #[new] #[pyo3(signature = (element_type, contains_null = true))] fn new(element_type: PyObject, contains_null: bool, py: Python) -> PyResult { - let inner_type = SchemaTypeArray::new( - Box::new(python_type_to_schema(element_type, py)?), - contains_null, - ); + let inner_type = + DeltaArrayType::new(python_type_to_schema(element_type, py)?, contains_null); Ok(Self { inner_type }) } fn __repr__(&self, py: Python) -> PyResult { - let 
type_repr: String = - schema_type_to_python(self.inner_type.get_element_type().clone(), py)? - .call_method0(py, "__repr__")? - .extract(py)?; + let type_repr: String = schema_type_to_python(self.inner_type.element_type().clone(), py)? + .call_method0(py, "__repr__")? + .extract(py)?; Ok(format!( "ArrayType({}, contains_null={})", type_repr, @@ -262,7 +242,7 @@ impl ArrayType { #[getter] fn element_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner_type.get_element_type().to_owned(), py) + schema_type_to_python(self.inner_type.element_type().to_owned(), py) } #[getter] @@ -278,7 +258,7 @@ impl ArrayType { #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { - let data_type: SchemaDataType = serde_json::from_str(&type_json) + let data_type: DataType = serde_json::from_str(&type_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; data_type.try_into() @@ -287,7 +267,7 @@ impl ArrayType { #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( - (&SchemaDataType::array(self.inner_type.clone())) + (&DataType::Array(Box::new(self.inner_type.clone()))) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?, )) @@ -296,7 +276,7 @@ impl ArrayType { #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { - let inner_type: SchemaDataType = (&data_type.0) + let inner_type: DataType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -307,26 +287,28 @@ impl ArrayType { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct MapType { - inner_type: SchemaTypeMap, + inner_type: DeltaMapType, } -impl From for MapType { - fn from(inner_type: SchemaTypeMap) -> Self { +impl From for MapType { + fn from(inner_type: DeltaMapType) -> Self { Self { inner_type } } } -impl From for SchemaDataType { - fn from(map: MapType) -> SchemaDataType { - SchemaDataType::map(map.inner_type) +impl From for DataType { + fn from(map: MapType) -> DataType { + DataType::Map(Box::new(map.inner_type)) } } -impl TryFrom for MapType { +impl TryFrom for MapType { type Error = PyErr; - fn try_from(value: SchemaDataType) -> PyResult { + fn try_from(value: DataType) -> PyResult { match value { - SchemaDataType::map(inner_type) => Ok(Self { inner_type }), + DataType::Map(inner_type) => Ok(Self { + inner_type: *inner_type, + }), _ => Err(PyTypeError::new_err("Type is not a map")), } } @@ -342,27 +324,26 @@ impl MapType { value_contains_null: bool, py: Python, ) -> PyResult { - let inner_type = SchemaTypeMap::new( - Box::new(python_type_to_schema(key_type, py)?), - Box::new(python_type_to_schema(value_type, py)?), + let inner_type = DeltaMapType::new( + python_type_to_schema(key_type, py)?, + python_type_to_schema(value_type, py)?, value_contains_null, ); Ok(Self { inner_type }) } fn __repr__(&self, py: Python) -> PyResult { - let key_repr: String = schema_type_to_python(self.inner_type.get_key_type().clone(), py)? + let key_repr: String = schema_type_to_python(self.inner_type.key_type().clone(), py)? + .call_method0(py, "__repr__")? + .extract(py)?; + let value_repr: String = schema_type_to_python(self.inner_type.value_type().clone(), py)? .call_method0(py, "__repr__")? .extract(py)?; - let value_repr: String = - schema_type_to_python(self.inner_type.get_value_type().clone(), py)? - .call_method0(py, "__repr__")? 
- .extract(py)?; Ok(format!( "MapType({}, {}, value_contains_null={})", key_repr, value_repr, - if self.inner_type.get_value_contains_null() { + if self.inner_type.value_contains_null() { "True" } else { "False" @@ -387,17 +368,17 @@ impl MapType { #[getter] fn key_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner_type.get_key_type().to_owned(), py) + schema_type_to_python(self.inner_type.key_type().to_owned(), py) } #[getter] fn value_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner_type.get_value_type().to_owned(), py) + schema_type_to_python(self.inner_type.value_type().to_owned(), py) } #[getter] fn value_contains_null(&self, py: Python) -> PyResult { - Ok(self.inner_type.get_value_contains_null().into_py(py)) + Ok(self.inner_type.value_contains_null().into_py(py)) } #[pyo3(text_signature = "($self)")] @@ -408,7 +389,7 @@ impl MapType { #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { - let data_type: SchemaDataType = serde_json::from_str(&type_json) + let data_type: DataType = serde_json::from_str(&type_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; data_type.try_into() @@ -417,7 +398,7 @@ impl MapType { #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( - (&SchemaDataType::map(self.inner_type.clone())) + (&DataType::Map(Box::new(self.inner_type.clone()))) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?, )) @@ -426,7 +407,7 @@ impl MapType { #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { - let inner_type: SchemaDataType = (&data_type.0) + let inner_type: DataType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -437,7 +418,7 @@ impl MapType { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct Field { - inner: SchemaField, + inner: StructField, } #[pymethods] @@ -466,19 +447,20 @@ impl Field { HashMap::new() }; - Ok(Self { - inner: SchemaField::new(name, ty, nullable, metadata), - }) + let mut inner = StructField::new(name, ty, nullable); + inner = inner.with_metadata(metadata); + + Ok(Self { inner }) } #[getter] fn name(&self) -> String { - self.inner.get_name().to_string() + self.inner.name().to_string() } #[getter] fn get_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner.get_type().clone(), py) + schema_type_to_python(self.inner.data_type().clone(), py) } #[getter] @@ -489,17 +471,17 @@ impl Field { #[getter] fn metadata(&self, py: Python) -> PyResult { let json_loads = PyModule::import(py, "json")?.getattr("loads")?; - let metadata_json: String = serde_json::to_string(self.inner.get_metadata()) + let metadata_json: String = serde_json::to_string(self.inner.metadata()) .map_err(|err| PyValueError::new_err(err.to_string()))?; Ok(json_loads.call1((metadata_json,))?.to_object(py)) } fn __repr__(&self, py: Python) -> PyResult { - let type_repr: String = schema_type_to_python(self.inner.get_type().clone(), py)? + let type_repr: String = schema_type_to_python(self.inner.data_type().clone(), py)? .call_method0(py, "__repr__")? 
             .extract(py)?;
-        let metadata = self.inner.get_metadata();
+        let metadata = self.inner.metadata();
         let maybe_metadata = if metadata.is_empty() {
             "".to_string()
         } else {
@@ -511,7 +493,7 @@
         };
         Ok(format!(
             "Field({}, {}, nullable={}{})",
-            self.inner.get_name(),
+            self.inner.name(),
             type_repr,
             if self.inner.is_nullable() {
                 "True"
@@ -540,7 +522,7 @@ impl Field {
     #[staticmethod]
     #[pyo3(text_signature = "(field_json)")]
     fn from_json(field_json: String) -> PyResult<Self> {
-        let field: SchemaField = serde_json::from_str(&field_json)
+        let field: StructField = serde_json::from_str(&field_json)
             .map_err(|err| PyValueError::new_err(err.to_string()))?;

         Ok(Self { inner: field })
@@ -557,7 +539,7 @@ impl Field {
     #[pyo3(text_signature = "(field)")]
     fn from_pyarrow(field: PyArrowType<ArrowField>) -> PyResult<Self> {
         Ok(Self {
-            inner: SchemaField::try_from(&field.0)
+            inner: StructField::try_from(&field.0)
                 .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?,
         })
     }
@@ -566,26 +548,28 @@ impl Field {
 #[pyclass(subclass, module = "deltalake._internal")]
 #[derive(Clone)]
 pub struct StructType {
-    inner_type: SchemaTypeStruct,
+    inner_type: DeltaStructType,
 }

-impl From<SchemaTypeStruct> for StructType {
-    fn from(inner_type: SchemaTypeStruct) -> Self {
+impl From<DeltaStructType> for StructType {
+    fn from(inner_type: DeltaStructType) -> Self {
         Self { inner_type }
     }
 }

-impl From<StructType> for SchemaDataType {
-    fn from(str: StructType) -> SchemaDataType {
-        SchemaDataType::r#struct(str.inner_type)
+impl From<StructType> for DataType {
+    fn from(str: StructType) -> DataType {
+        DataType::Struct(Box::new(str.inner_type))
     }
 }

-impl TryFrom<SchemaDataType> for StructType {
+impl TryFrom<DataType> for StructType {
     type Error = PyErr;
-    fn try_from(value: SchemaDataType) -> PyResult<Self> {
+    fn try_from(value: DataType) -> PyResult<Self> {
         match value {
-            SchemaDataType::r#struct(inner_type) => Ok(Self { inner_type }),
+            DataType::Struct(inner_type) => Ok(Self {
+                inner_type: *inner_type,
+            }),
             _ => Err(PyTypeError::new_err("Type is not a struct")),
         }
     }
@@ -594,18 +578,18 @@ impl TryFrom<SchemaDataType> for StructType {
 impl StructType {
     #[new]
     fn new(fields: Vec<PyRef<Field>>) -> Self {
-        let fields: Vec<SchemaField> = fields
+        let fields: Vec<StructField> = fields
             .into_iter()
             .map(|field| field.inner.clone())
             .collect();
-        let inner_type = SchemaTypeStruct::new(fields);
+        let inner_type = DeltaStructType::new(fields);
         Self { inner_type }
     }

     fn __repr__(&self, py: Python) -> PyResult<String> {
         let inner_data: Vec<String> = self
             .inner_type
-            .get_fields()
+            .fields()
             .iter()
             .map(|field| {
                 let field = Field {
@@ -636,7 +620,7 @@ impl StructType {
     #[getter]
     fn fields(&self) -> Vec<Field> {
         self.inner_type
-            .get_fields()
+            .fields()
             .iter()
             .map(|field| Field {
                 inner: field.clone(),
@@ -652,7 +636,7 @@ impl StructType {
     #[staticmethod]
     #[pyo3(text_signature = "(type_json)")]
     fn from_json(type_json: String) -> PyResult<Self> {
-        let data_type: SchemaDataType = serde_json::from_str(&type_json)
+        let data_type: DataType = serde_json::from_str(&type_json)
             .map_err(|err| PyValueError::new_err(err.to_string()))?;

         data_type.try_into()
@@ -661,7 +645,7 @@ impl StructType {
     #[pyo3(text_signature = "($self)")]
     fn to_pyarrow(&self) -> PyResult<PyArrowType<ArrowDataType>> {
         Ok(PyArrowType(
-            (&SchemaDataType::r#struct(self.inner_type.clone()))
+            (&DataType::Struct(Box::new(self.inner_type.clone())))
                 .try_into()
                 .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?,
         ))
@@ -670,7 +654,7 @@ impl StructType {
     #[staticmethod]
     #[pyo3(text_signature = "(data_type)")]
     fn from_pyarrow(data_type: PyArrowType<ArrowDataType>) -> PyResult<Self> {
-        let inner_type: SchemaDataType = (&data_type.0)
+        let inner_type: DataType = (&data_type.0)
             .try_into()
             .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?;

@@ -678,9 +662,9 @@ impl StructType {
     }
 }

-pub fn schema_to_pyobject(schema: &Schema, py: Python) -> PyResult<PyObject> {
+pub fn schema_to_pyobject(schema: &DeltaStructType, py: Python) -> PyResult<PyObject> {
     let fields: Vec<Field> = schema
-        .get_fields()
+        .fields()
         .iter()
         .map(|field| Field {
             inner: field.clone(),
@@ -714,11 +698,11 @@ impl PySchema {
     #[new]
     #[pyo3(signature = (fields))]
    fn new(fields: Vec<PyRef<Field>>) -> PyResult<(Self, StructType)> {
-        let fields: Vec<SchemaField> = fields
+        let fields: Vec<StructField> = fields
             .into_iter()
             .map(|field| field.inner.clone())
             .collect();
-        let inner_type = SchemaTypeStruct::new(fields);
+        let inner_type = DeltaStructType::new(fields);
         Ok((Self {}, StructType { inner_type }))
     }

@@ -726,7 +710,7 @@ impl PySchema {
         let super_ = self_.as_ref();
         let inner_data: Vec<String> = super_
             .inner_type
-            .get_fields()
+            .fields()
             .iter()
             .map(|field| {
                 let field = Field {
@@ -836,7 +820,7 @@ impl PySchema {
     #[staticmethod]
     #[pyo3(text_signature = "(data_type)")]
     fn from_pyarrow(data_type: PyArrowType<ArrowSchema>, py: Python) -> PyResult<PyObject> {
-        let inner_type: SchemaTypeStruct = (&data_type.0)
+        let inner_type: DeltaStructType = (&data_type.0)
             .try_into()
             .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?;

@@ -852,11 +836,19 @@ impl PySchema {
     #[staticmethod]
     #[pyo3(text_signature = "(schema_json)")]
     fn from_json(schema_json: String, py: Python) -> PyResult<Py<Self>> {
-        let data_type: SchemaDataType = serde_json::from_str(&schema_json)
+        let data_type: DataType = serde_json::from_str(&schema_json)
             .map_err(|err| PyValueError::new_err(err.to_string()))?;

-        if let SchemaDataType::r#struct(inner_type) = data_type {
-            Py::new(py, (Self {}, StructType { inner_type }))
+        if let DataType::Struct(inner_type) = data_type {
+            Py::new(
+                py,
+                (
+                    Self {},
+                    StructType {
+                        inner_type: *inner_type,
+                    },
+                ),
+            )
         } else {
             Err(PyTypeError::new_err("Type is not a struct"))
         }
diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py
index e6d45441c0..f63df0e9fb 100644
--- a/python/tests/test_schema.py
+++ b/python/tests/test_schema.py
@@ -175,7 +175,10 @@ def test_delta_field():
     assert field.name == name
     assert field.type == (PrimitiveType(ty) if isinstance(ty, str) else ty)
     assert field.nullable == nullable
-    assert field.metadata == (metadata or {})
+    if metadata:
+        assert json.loads(field.metadata["x"]) == {"y": 3}
+    else:
+        assert field.metadata == {}

     # Field metadata doesn't roundtrip currently
     # See: https://github.com/apache/arrow-rs/issues/478