diff --git a/crates/deltalake-core/Cargo.toml b/crates/deltalake-core/Cargo.toml index e645b6bfd0..ce1c7490ad 100644 --- a/crates/deltalake-core/Cargo.toml +++ b/crates/deltalake-core/Cargo.toml @@ -68,6 +68,8 @@ tokio = { workspace = true, features = [ # other deps (these should be organized and pulled into workspace.dependencies as necessary) cfg-if = "1" errno = "0.3" +either = "1.8" +fix-hidden-lifetime-bug = "0.2" hyper = { version = "0.14", optional = true } itertools = "0.11" lazy_static = "1" @@ -80,8 +82,10 @@ once_cell = "1.16.0" parking_lot = "0.12" parquet2 = { version = "0.17", optional = true } percent-encoding = "2" +roaring = "0.10.1" tracing = { version = "0.1", optional = true } rand = "0.8" +z85 = "3.0.5" # hdfs datafusion-objectstore-hdfs = { version = "0.1.3", default-features = false, features = [ diff --git a/crates/deltalake-core/src/delta_datafusion/expr.rs b/crates/deltalake-core/src/delta_datafusion/expr.rs index 815b01831f..e451484183 100644 --- a/crates/deltalake-core/src/delta_datafusion/expr.rs +++ b/crates/deltalake-core/src/delta_datafusion/expr.rs @@ -338,14 +338,13 @@ impl<'a> fmt::Display for ScalarValueFormat<'a> { #[cfg(test)] mod test { - use std::collections::HashMap; - - use arrow_schema::DataType; + use arrow_schema::DataType as ArrowDataType; use datafusion::prelude::SessionContext; use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::{col, decode, lit, substring, Cast, Expr, ExprSchemable}; - use crate::{DeltaOps, DeltaTable, Schema, SchemaDataType, SchemaField}; + use crate::kernel::{DataType, PrimitiveType, StructField, StructType}; + use crate::{DeltaOps, DeltaTable}; use super::fmt_expr_to_sql; @@ -366,66 +365,57 @@ mod test { } async fn setup_table() -> DeltaTable { - let schema = Schema::new(vec![ - SchemaField::new( + let schema = StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value2".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "active".to_string(), - SchemaDataType::primitive("boolean".to_string()), + DataType::Primitive(PrimitiveType::Boolean), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "money".to_string(), - SchemaDataType::primitive("decimal(12,2)".to_string()), + DataType::Primitive(PrimitiveType::Decimal(12, 2)), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "_date".to_string(), - SchemaDataType::primitive("date".to_string()), + DataType::Primitive(PrimitiveType::Date), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "_timestamp".to_string(), - SchemaDataType::primitive("timestamp".to_string()), + DataType::Primitive(PrimitiveType::Timestamp), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "_binary".to_string(), - SchemaDataType::primitive("binary".to_string()), + DataType::Primitive(PrimitiveType::Binary), true, - HashMap::new(), ), ]); let table = 
DeltaOps::new_in_memory() .create() - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -441,7 +431,7 @@ mod test { simple!( Expr::Cast(Cast { expr: Box::new(lit(1_i64)), - data_type: DataType::Int32 + data_type: ArrowDataType::Int32 }), "arrow_cast(1, 'Int32')".to_string() ), diff --git a/crates/deltalake-core/src/delta_datafusion/mod.rs b/crates/deltalake-core/src/delta_datafusion/mod.rs index 7fbe362afc..19d7a510ef 100644 --- a/crates/deltalake-core/src/delta_datafusion/mod.rs +++ b/crates/deltalake-core/src/delta_datafusion/mod.rs @@ -70,11 +70,12 @@ use serde::{Deserialize, Serialize}; use url::Url; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{self, Add}; +use crate::kernel::{Add, DataType as DeltaDataType, Invariant, PrimitiveType}; +use crate::protocol::{self}; use crate::storage::ObjectStoreRef; use crate::table::builder::ensure_table_uri; use crate::table::state::DeltaTableState; -use crate::{open_table, open_table_with_storage_options, DeltaTable, Invariant, SchemaDataType}; +use crate::{open_table, open_table_with_storage_options, DeltaTable}; const PATH_COLUMN: &str = "__delta_rs_path"; @@ -121,7 +122,7 @@ impl DeltaTableState { min_value: None, distinct_count: None }; - self.schema().unwrap().get_fields().len() + self.schema().unwrap().fields().len() ]), is_exact: true, }, @@ -139,13 +140,13 @@ impl DeltaTableState { column_statistics: acc.column_statistics.map(|col_stats| { self.schema() .unwrap() - .get_fields() + .fields() .iter() .zip(col_stats) .map(|(field, stats)| { let null_count = new_stats .null_count - .get(field.get_name()) + .get(field.name()) .and_then(|x| { let null_count_acc = stats.null_count?; let null_count = x.as_value()? as usize; @@ -155,7 +156,7 @@ impl DeltaTableState { let max_value = new_stats .max_values - .get(field.get_name()) + .get(field.name()) .and_then(|x| { let old_stats = stats.clone(); let max_value = to_scalar_value(x.as_value()?); @@ -179,7 +180,7 @@ impl DeltaTableState { let min_value = new_stats .min_values - .get(field.get_name()) + .get(field.name()) .and_then(|x| { let old_stats = stats.clone(); let min_value = to_scalar_value(x.as_value()?); @@ -222,7 +223,7 @@ impl DeltaTableState { num_rows: stats.num_rows, total_byte_size: stats.total_byte_size, column_statistics: stats.column_statistics.map(|col_stats| { - let fields = self.schema().unwrap().get_fields(); + let fields = self.schema().unwrap().fields(); col_stats .iter() .zip(fields) @@ -230,7 +231,7 @@ impl DeltaTableState { let dt = self .arrow_schema() .unwrap() - .field_with_name(field.get_name()) + .field_with_name(field.name()) .unwrap() .data_type() .clone(); @@ -258,16 +259,14 @@ fn get_prune_stats(table: &DeltaTable, column: &Column, get_max: bool) -> Option let field = table .get_schema() .ok() - .map(|s| s.get_field_with_name(&column.name).ok())??; + .map(|s| s.field_with_name(&column.name).ok())??; // See issue 1214. 
Binary type does not support natural order which is required for Datafusion to prune - if let SchemaDataType::primitive(t) = &field.get_type() { - if t == "binary" { - return None; - } + if let DeltaDataType::Primitive(PrimitiveType::Binary) = &field.data_type() { + return None; } - let data_type = field.get_type().try_into().ok()?; + let data_type = field.data_type().try_into().ok()?; let partition_columns = &table.get_metadata().ok()?.partition_columns; let values = table.get_state().files().iter().map(|add| { @@ -921,7 +920,7 @@ pub(crate) fn get_null_of_arrow_type(t: &ArrowDataType) -> DeltaResult PartitionedFile { @@ -1790,7 +1789,7 @@ mod tests { let mut partition_values = std::collections::HashMap::new(); partition_values.insert("month".to_string(), Some("1".to_string())); partition_values.insert("year".to_string(), Some("2015".to_string())); - let action = protocol::Add { + let action = Add { path: "year=2015/month=1/part-00000-4dcb50d3-d017-450c-9df7-a7257dbd3c5d-c000.snappy.parquet".to_string(), size: 10644, partition_values, @@ -1801,6 +1800,8 @@ mod tests { deletion_vector: None, stats_parsed: None, tags: None, + base_row_id: None, + default_row_commit_version: None, }; let schema = ArrowSchema::new(vec![ Field::new("year", ArrowDataType::Int64, true), @@ -1953,7 +1954,7 @@ mod tests { let table = crate::DeltaOps::new_in_memory() .create() - .with_columns(get_delta_schema().get_fields().clone()) + .with_columns(get_delta_schema().fields().clone()) .with_partition_columns(["modified", "id"]) .await .unwrap(); diff --git a/crates/deltalake-core/src/errors.rs b/crates/deltalake-core/src/errors.rs index 24989b2814..bd088e9a4f 100644 --- a/crates/deltalake-core/src/errors.rs +++ b/crates/deltalake-core/src/errors.rs @@ -205,6 +205,12 @@ pub enum DeltaTableError { /// Source error source: Box, }, + + #[error("Kernel: {source}")] + Kernel { + #[from] + source: crate::kernel::Error, + }, } impl From for DeltaTableError { diff --git a/crates/deltalake-core/src/kernel/actions/arrow.rs b/crates/deltalake-core/src/kernel/actions/arrow.rs new file mode 100644 index 0000000000..d292362604 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/arrow.rs @@ -0,0 +1,1049 @@ +use std::sync::Arc; + +use arrow_schema::{ + ArrowError, DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, + Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, +}; +use lazy_static::lazy_static; + +use super::super::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; + +impl TryFrom<&StructType> for ArrowSchema { + type Error = ArrowError; + + fn try_from(s: &StructType) -> Result { + let fields = s + .fields() + .iter() + .map(>::try_from) + .collect::, ArrowError>>()?; + + Ok(ArrowSchema::new(fields)) + } +} + +impl TryFrom<&StructField> for ArrowField { + type Error = ArrowError; + + fn try_from(f: &StructField) -> Result { + let metadata = f + .metadata() + .iter() + .map(|(key, val)| Ok((key.clone(), serde_json::to_string(val)?))) + .collect::>() + .map_err(|err| ArrowError::JsonError(err.to_string()))?; + + let field = ArrowField::new( + f.name(), + ArrowDataType::try_from(f.data_type())?, + f.is_nullable(), + ) + .with_metadata(metadata); + + Ok(field) + } +} + +impl TryFrom<&ArrayType> for ArrowField { + type Error = ArrowError; + + fn try_from(a: &ArrayType) -> Result { + Ok(ArrowField::new( + "item", + ArrowDataType::try_from(a.element_type())?, + a.contains_null(), + )) + } +} + +impl TryFrom<&MapType> for ArrowField { + type Error = ArrowError; 
+ + fn try_from(a: &MapType) -> Result<Self, ArrowError> { + Ok(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::try_from(a.key_type())?, false), + ArrowField::new( + "value", + ArrowDataType::try_from(a.value_type())?, + a.value_contains_null(), + ), + ] + .into(), + ), + false, // always non-null + )) + } +} + +impl TryFrom<&DataType> for ArrowDataType { + type Error = ArrowError; + + fn try_from(t: &DataType) -> Result<Self, ArrowError> { + match t { + DataType::Primitive(p) => { + match p { + PrimitiveType::String => Ok(ArrowDataType::Utf8), + PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type + PrimitiveType::Integer => Ok(ArrowDataType::Int32), + PrimitiveType::Short => Ok(ArrowDataType::Int16), + PrimitiveType::Byte => Ok(ArrowDataType::Int8), + PrimitiveType::Float => Ok(ArrowDataType::Float32), + PrimitiveType::Double => Ok(ArrowDataType::Float64), + PrimitiveType::Boolean => Ok(ArrowDataType::Boolean), + PrimitiveType::Binary => Ok(ArrowDataType::Binary), + PrimitiveType::Decimal(precision, scale) => { + let precision = u8::try_from(*precision).map_err(|_| { + ArrowError::SchemaError(format!( + "Invalid precision for decimal: {}", + precision + )) + })?; + let scale = i8::try_from(*scale).map_err(|_| { + ArrowError::SchemaError(format!("Invalid scale for decimal: {}", scale)) + })?; + + if precision <= 38 { + Ok(ArrowDataType::Decimal128(precision, scale)) + } else if precision <= 76 { + Ok(ArrowDataType::Decimal256(precision, scale)) + } else { + Err(ArrowError::SchemaError(format!( + "Precision too large to be represented in Arrow: {}", + precision + ))) + } + } + PrimitiveType::Date => { + // A calendar date, represented as a year-month-day triple without a + // timezone. Stored as 4 bytes integer representing days since 1970-01-01 + Ok(ArrowDataType::Date32) + } + PrimitiveType::Timestamp => { + // Issue: https://github.com/delta-io/delta/issues/643 + Ok(ArrowDataType::Timestamp(TimeUnit::Microsecond, None)) + } + } + } + DataType::Struct(s) => Ok(ArrowDataType::Struct( + s.fields() + .iter() + .map(<ArrowField as TryFrom<&StructField>>::try_from) + .collect::<Result<Vec<ArrowField>, ArrowError>>()?
+ .into(), + )), + DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(<ArrowField as TryFrom<&ArrayType>>::try_from(a)?))), + DataType::Map(m) => Ok(ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new( + "keys", + <ArrowDataType as TryFrom<&DataType>>::try_from(m.key_type())?, + false, + ), + ArrowField::new( + "values", + <ArrowDataType as TryFrom<&DataType>>::try_from(m.value_type())?, + m.value_contains_null(), + ), + ] + .into(), + ), + false, + )), + false, + )), + } + } +} + +impl TryFrom<&ArrowSchema> for StructType { + type Error = ArrowError; + + fn try_from(arrow_schema: &ArrowSchema) -> Result<Self, ArrowError> { + let new_fields: Result<Vec<StructField>, _> = arrow_schema + .fields() + .iter() + .map(|field| field.as_ref().try_into()) + .collect(); + Ok(StructType::new(new_fields?)) + } +} + +impl TryFrom<ArrowSchemaRef> for StructType { + type Error = ArrowError; + + fn try_from(arrow_schema: ArrowSchemaRef) -> Result<Self, ArrowError> { + arrow_schema.as_ref().try_into() + } +} + +impl TryFrom<&ArrowField> for StructField { + type Error = ArrowError; + + fn try_from(arrow_field: &ArrowField) -> Result<Self, ArrowError> { + Ok(StructField::new( + arrow_field.name().clone(), + arrow_field.data_type().try_into()?, + arrow_field.is_nullable(), + ) + .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) + } +} + +impl TryFrom<&ArrowDataType> for DataType { + type Error = ArrowError; + + fn try_from(arrow_datatype: &ArrowDataType) -> Result<Self, ArrowError> { + match arrow_datatype { + ArrowDataType::Utf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::LargeUtf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::Int64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::Int32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::Int16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::Int8 => Ok(DataType::Primitive(PrimitiveType::Byte)), + ArrowDataType::UInt64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::UInt32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::UInt16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::UInt8 => Ok(DataType::Primitive(PrimitiveType::Byte)), + ArrowDataType::Float32 => Ok(DataType::Primitive(PrimitiveType::Float)), + ArrowDataType::Float64 => Ok(DataType::Primitive(PrimitiveType::Double)), + ArrowDataType::Boolean => Ok(DataType::Primitive(PrimitiveType::Boolean)), + ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::Decimal128(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, + ))), + ArrowDataType::Decimal256(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, + ))), + ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)), + ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)), + ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { + Ok(DataType::Primitive(PrimitiveType::Timestamp)) + } + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) + if tz.eq_ignore_ascii_case("utc") => + { + Ok(DataType::Primitive(PrimitiveType::Timestamp)) + } + ArrowDataType::Struct(fields) => { + let converted_fields: Result<Vec<StructField>, _> = fields + .iter() + .map(|field| field.as_ref().try_into()) + .collect(); + Ok(DataType::Struct(Box::new(StructType::new( + converted_fields?, + )))) + } + ArrowDataType::List(field) =>
Ok(DataType::Array(Box::new(ArrayType::new( + (*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( + (*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( + ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), + ))), + ArrowDataType::Map(field, _) => { + if let ArrowDataType::Struct(struct_fields) = field.data_type() { + let key_type = struct_fields[0].data_type().try_into()?; + let value_type = struct_fields[1].data_type().try_into()?; + let value_type_nullable = struct_fields[1].is_nullable(); + Ok(DataType::Map(Box::new(MapType::new( + key_type, + value_type, + value_type_nullable, + )))) + } else { + panic!("DataType::Map should contain a struct field child"); + } + } + s => Err(ArrowError::SchemaError(format!( + "Invalid data type for Delta Lake: {s}" + ))), + } + } +} + +macro_rules! arrow_map { + ($fieldname: ident, null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Utf8, false), + ArrowField::new("value", ArrowDataType::Utf8, true), + ] + .into(), + ), + false, + )), + false, + ), + true, + ) + }; + ($fieldname: ident, not_null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Utf8, false), + ArrowField::new("value", ArrowDataType::Utf8, false), + ] + .into(), + ), + false, + )), + false, + ), + false, + ) + }; +} + +macro_rules! arrow_field { + ($fieldname:ident, $type_qual:ident, null) => { + ArrowField::new(stringify!($fieldname), ArrowDataType::$type_qual, true) + }; + ($fieldname:ident, $type_qual:ident, not_null) => { + ArrowField::new(stringify!($fieldname), ArrowDataType::$type_qual, false) + }; +} + +macro_rules! arrow_list { + ($fieldname:ident, $element_name:ident, $type_qual:ident, null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::List(Arc::new(ArrowField::new( + stringify!($element_name), + ArrowDataType::$type_qual, + true, + ))), + true, + ) + }; + ($fieldname:ident, $element_name:ident, $type_qual:ident, not_null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::List(Arc::new(ArrowField::new( + stringify!($element_name), + ArrowDataType::$type_qual, + true, + ))), + false, + ) + }; +} + +macro_rules! arrow_struct { + ($fieldname:ident, [$($inner:tt)+], null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Struct( + arrow_defs! [$($inner)+].into() + ), + true + ) + }; + ($fieldname:ident, [$($inner:tt)+], not_null) => { + ArrowField::new( + stringify!($fieldname), + ArrowDataType::Struct( + arrow_defs! [$($inner)+].into() + ), + false + ) + } +} + +macro_rules! arrow_def { + ($fieldname:ident $(null)?) => { + arrow_map!($fieldname, null) + }; + ($fieldname:ident not_null) => { + arrow_map!($fieldname, not_null) + }; + ($fieldname:ident[$inner_name:ident]{$type_qual:ident} $(null)?) => { + arrow_list!($fieldname, $inner_name, $type_qual, null) + }; + ($fieldname:ident[$inner_name:ident]{$type_qual:ident} not_null) => { + arrow_list!($fieldname, $inner_name, $type_qual, not_null) + }; + ($fieldname:ident:$type_qual:ident $(null)?) 
=> { + arrow_field!($fieldname, $type_qual, null) + }; + ($fieldname:ident:$type_qual:ident not_null) => { + arrow_field!($fieldname, $type_qual, not_null) + }; + ($fieldname:ident[$($inner:tt)+] $(null)?) => { + arrow_struct!($fieldname, [$($inner)+], null) + }; + ($fieldname:ident[$($inner:tt)+] not_null) => { + arrow_struct!($fieldname, [$($inner)+], not_null) + } +} + +/// A helper macro to create more readable Arrow field definitions, delimited by commas +/// +/// The argument patterns are as follows: +/// +/// fieldname (null|not_null)? -- An Arrow field of type map with name "fieldname" consisting of Utf8 key-value pairs, and an +/// optional nullability qualifier (null if not specified). +/// +/// fieldname:type (null|not_null)? -- An Arrow field consisting of an atomic type. For example, +/// id:Utf8 gets mapped to ArrowField::new("id", ArrowDataType::Utf8, true), +/// while customerCount:Int64 not_null gets mapped to +/// ArrowField::new("customerCount", ArrowDataType::Int64, false). +/// +/// fieldname[list_element]{list_element_type} (null|not_null)? -- An Arrow list, with the name of the elements wrapped in square brackets +/// and the type of the list elements wrapped in curly brackets. For example, +/// customers[name]{Utf8} is a nullable Arrow field of type list, consisting +/// of elements called "name" with type Utf8. +/// +/// fieldname[element1, element2, element3, ...] (null|not_null)? -- An Arrow struct with name "fieldname" consisting of elements adhering to any of the patterns +/// documented, including additional structs arbitrarily nested up to the recursion +/// limit for Rust macros. +macro_rules! arrow_defs { + () => { + vec![] as Vec<ArrowField> + }; + ($($fieldname:ident$(:$type_qual:ident)?$([$($inner:tt)+])?$({$list_type_qual:ident})? $($nullable:ident)?),+) => { + vec![ + $(arrow_def!($fieldname$(:$type_qual)?$([$($inner)+])?$({$list_type_qual})? $($nullable)?)),+ + ] + } +} + +/// Returns an arrow schema representing the delta log for use in checkpoints +/// +/// # Arguments +/// +/// * `table_schema` - The arrow schema representing the table backed by the delta log +/// * `partition_columns` - The list of partition columns of the table. +/// * `use_extended_remove_schema` - Whether to include extended file metadata in remove action schema. +/// Required for compatibility with different versions of Databricks runtime. +pub(crate) fn delta_log_schema_for_table( + table_schema: ArrowSchema, + partition_columns: &[String], + use_extended_remove_schema: bool, +) -> ArrowSchemaRef { + lazy_static!
{ + static ref SCHEMA_FIELDS: Vec = arrow_defs![ + metaData[ + id:Utf8, + name:Utf8, + description:Utf8, + schemaString:Utf8, + createdTime:Int64, + partitionColumns[element]{Utf8}, + configuration, + format[provider:Utf8, options] + ], + protocol[ + minReaderVersion:Int32, + minWriterVersion:Int32 + ], + txn[ + appId:Utf8, + version:Int64 + ] + ]; + static ref ADD_FIELDS: Vec = arrow_defs![ + path:Utf8, + size:Int64, + modificationTime:Int64, + dataChange:Boolean, + stats:Utf8, + partitionValues, + tags, + deletionVector[ + storageType:Utf8 not_null, + pathOrInlineDv:Utf8 not_null, + offset:Int32 null, + sizeInBytes:Int32 not_null, + cardinality:Int64 not_null + ] + ]; + static ref REMOVE_FIELDS: Vec = arrow_defs![ + path: Utf8, + deletionTimestamp: Int64, + dataChange: Boolean, + extendedFileMetadata: Boolean + ]; + static ref REMOVE_EXTENDED_FILE_METADATA_FIELDS: Vec = + arrow_defs![size: Int64, partitionValues, tags]; + }; + + // create add fields according to the specific data table schema + let (partition_fields, non_partition_fields): (Vec, Vec) = + table_schema + .fields() + .iter() + .map(|field| field.to_owned()) + .partition(|field| partition_columns.contains(field.name())); + + let mut stats_parsed_fields: Vec = + vec![ArrowField::new("numRecords", ArrowDataType::Int64, true)]; + if !non_partition_fields.is_empty() { + let mut max_min_vec = Vec::new(); + non_partition_fields + .iter() + .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); + + stats_parsed_fields.extend(["minValues", "maxValues"].into_iter().map(|name| { + ArrowField::new( + name, + ArrowDataType::Struct(max_min_vec.clone().into()), + true, + ) + })); + + let mut null_count_vec = Vec::new(); + non_partition_fields + .iter() + .for_each(|f| null_count_schema_for_fields(&mut null_count_vec, f)); + let null_count_struct = ArrowField::new( + "nullCount", + ArrowDataType::Struct(null_count_vec.into()), + true, + ); + + stats_parsed_fields.push(null_count_struct); + } + let mut add_fields = ADD_FIELDS.clone(); + add_fields.push(ArrowField::new( + "stats_parsed", + ArrowDataType::Struct(stats_parsed_fields.into()), + true, + )); + if !partition_fields.is_empty() { + add_fields.push(ArrowField::new( + "partitionValues_parsed", + ArrowDataType::Struct(partition_fields.into()), + true, + )); + } + + // create remove fields with or without extendedFileMetadata + let mut remove_fields = REMOVE_FIELDS.clone(); + if use_extended_remove_schema { + remove_fields.extend(REMOVE_EXTENDED_FILE_METADATA_FIELDS.clone()); + } + + // include add and remove fields in checkpoint schema + let mut schema_fields = SCHEMA_FIELDS.clone(); + schema_fields.push(ArrowField::new( + "add", + ArrowDataType::Struct(add_fields.into()), + true, + )); + schema_fields.push(ArrowField::new( + "remove", + ArrowDataType::Struct(remove_fields.into()), + true, + )); + + let arrow_schema = ArrowSchema::new(schema_fields); + + std::sync::Arc::new(arrow_schema) +} + +fn max_min_schema_for_fields(dest: &mut Vec, f: &ArrowField) { + match f.data_type() { + ArrowDataType::Struct(struct_fields) => { + let mut child_dest = Vec::new(); + + for f in struct_fields { + max_min_schema_for_fields(&mut child_dest, f); + } + + dest.push(ArrowField::new( + f.name(), + ArrowDataType::Struct(child_dest.into()), + true, + )); + } + // don't compute min or max for list, map or binary types + ArrowDataType::List(_) | ArrowDataType::Map(_, _) | ArrowDataType::Binary => { /* noop */ } + _ => { + let f = f.clone(); + dest.push(f); + } + } +} + +fn 
null_count_schema_for_fields(dest: &mut Vec, f: &ArrowField) { + match f.data_type() { + ArrowDataType::Struct(struct_fields) => { + let mut child_dest = Vec::new(); + + for f in struct_fields { + null_count_schema_for_fields(&mut child_dest, f); + } + + dest.push(ArrowField::new( + f.name(), + ArrowDataType::Struct(child_dest.into()), + true, + )); + } + _ => { + let f = ArrowField::new(f.name(), ArrowDataType::Int64, true); + dest.push(f); + } + } +} + +#[cfg(test)] +mod tests { + use arrow::array::ArrayData; + use arrow_array::Array; + use arrow_array::{make_array, ArrayRef, MapArray, StringArray, StructArray}; + use arrow_buffer::{Buffer, ToByteSlice}; + use arrow_schema::Field; + + use super::*; + use std::collections::HashMap; + use std::sync::Arc; + + #[test] + fn delta_log_schema_for_table_test() { + // NOTE: We should future proof the checkpoint schema in case action schema changes. + // See https://github.com/delta-io/delta-rs/issues/287 + + let table_schema = ArrowSchema::new(vec![ + ArrowField::new("pcol", ArrowDataType::Int32, true), + ArrowField::new("col1", ArrowDataType::Int32, true), + ]); + let partition_columns = vec!["pcol".to_string()]; + let log_schema = + delta_log_schema_for_table(table_schema.clone(), partition_columns.as_slice(), false); + + // verify top-level schema contains all expected fields and they are named correctly. + let expected_fields = ["metaData", "protocol", "txn", "remove", "add"]; + for f in log_schema.fields().iter() { + assert!(expected_fields.contains(&f.name().as_str())); + } + assert_eq!(5, log_schema.fields().len()); + + // verify add fields match as expected. a lot of transformation goes into these. + let add_fields: Vec<_> = log_schema + .fields() + .iter() + .filter(|f| f.name() == "add") + .flat_map(|f| { + if let ArrowDataType::Struct(fields) = f.data_type() { + fields.iter().cloned() + } else { + unreachable!(); + } + }) + .collect(); + let field_names: Vec<&String> = add_fields.iter().map(|v| v.name()).collect(); + assert_eq!( + vec![ + "path", + "size", + "modificationTime", + "dataChange", + "stats", + "partitionValues", + "tags", + "deletionVector", + "stats_parsed", + "partitionValues_parsed" + ], + field_names + ); + let add_field_map: HashMap<_, _> = add_fields + .iter() + .map(|f| (f.name().to_owned(), f.clone())) + .collect(); + let partition_values_parsed = add_field_map.get("partitionValues_parsed").unwrap(); + if let ArrowDataType::Struct(fields) = partition_values_parsed.data_type() { + assert_eq!(1, fields.len()); + let field = fields.get(0).unwrap().to_owned(); + assert_eq!( + Arc::new(ArrowField::new("pcol", ArrowDataType::Int32, true)), + field + ); + } else { + unreachable!(); + } + let stats_parsed = add_field_map.get("stats_parsed").unwrap(); + if let ArrowDataType::Struct(fields) = stats_parsed.data_type() { + assert_eq!(4, fields.len()); + + let field_map: HashMap<_, _> = fields + .iter() + .map(|f| (f.name().to_owned(), f.clone())) + .collect(); + + for (k, v) in field_map.iter() { + match k.as_ref() { + "minValues" | "maxValues" | "nullCount" => match v.data_type() { + ArrowDataType::Struct(fields) => { + assert_eq!(1, fields.len()); + let field = fields.get(0).unwrap().to_owned(); + let data_type = if k == "nullCount" { + ArrowDataType::Int64 + } else { + ArrowDataType::Int32 + }; + assert_eq!(Arc::new(ArrowField::new("col1", data_type, true)), field); + } + _ => unreachable!(), + }, + "numRecords" => {} + _ => panic!(), + } + } + } else { + unreachable!(); + } + + // verify extended remove schema fields **ARE 
NOT** included when `use_extended_remove_schema` is false. + let num_remove_fields = log_schema + .fields() + .iter() + .filter(|f| f.name() == "remove") + .flat_map(|f| { + if let ArrowDataType::Struct(fields) = f.data_type() { + fields.iter().cloned() + } else { + unreachable!(); + } + }) + .count(); + assert_eq!(4, num_remove_fields); + + // verify extended remove schema fields **ARE** included when `use_extended_remove_schema` is true. + let log_schema = + delta_log_schema_for_table(table_schema, partition_columns.as_slice(), true); + let remove_fields: Vec<_> = log_schema + .fields() + .iter() + .filter(|f| f.name() == "remove") + .flat_map(|f| { + if let ArrowDataType::Struct(fields) = f.data_type() { + fields.iter().cloned() + } else { + unreachable!(); + } + }) + .collect(); + assert_eq!(7, remove_fields.len()); + let expected_fields = [ + "path", + "deletionTimestamp", + "dataChange", + "extendedFileMetadata", + "partitionValues", + "size", + "tags", + ]; + for f in remove_fields.iter() { + assert!(expected_fields.contains(&f.name().as_str())); + } + } + + #[test] + fn test_arrow_from_delta_decimal_type() { + let precision = 20; + let scale = 2; + let decimal_field = DataType::Primitive(PrimitiveType::Decimal(precision, scale)); + assert_eq!( + <ArrowDataType as TryFrom<&DataType>>::try_from(&decimal_field).unwrap(), + ArrowDataType::Decimal128(precision as u8, scale as i8) + ); + } + + #[test] + fn test_arrow_from_delta_timestamp_type() { + let timestamp_field = DataType::Primitive(PrimitiveType::Timestamp); + assert_eq!( + <ArrowDataType as TryFrom<&DataType>>::try_from(&timestamp_field).unwrap(), + ArrowDataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_delta_from_arrow_timestamp_type() { + let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); + assert_eq!( + <DataType as TryFrom<&ArrowDataType>>::try_from(&timestamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) + ); + } + + #[test] + fn test_delta_from_arrow_timestamp_type_with_tz() { + let timestamp_field = + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())); + assert_eq!( + <DataType as TryFrom<&ArrowDataType>>::try_from(&timestamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) + ); + } + + #[test] + fn test_delta_from_arrow_map_type() { + let arrow_map = ArrowDataType::Map( + Arc::new(ArrowField::new( + "entries", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int8, false), + ArrowField::new("value", ArrowDataType::Binary, true), + ] + .into(), + ), + false, + )), + false, + ); + let converted_map: DataType = (&arrow_map).try_into().unwrap(); + + assert_eq!( + converted_map, + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::Byte), + DataType::Primitive(PrimitiveType::Binary), + true, + ))) + ); + } + + #[test] + fn test_record_batch_from_map_type() { + let keys = vec!["0", "1", "5", "6", "7"]; + let values: Vec<&[u8]> = vec![ + b"test_val_1", + b"test_val_2", + b"long_test_val_3", + b"4", + b"test_val_5", + ]; + let entry_offsets = vec![0u32, 1, 1, 4, 5, 5]; + let num_rows = keys.len(); + + // Copied the function `new_from_strings` with the patched code from https://github.com/apache/arrow-rs/pull/4808 + // This should be reverted back to [`MapArray::new_from_strings`] once arrow is upgraded in this project.
+ fn new_from_strings<'a>( + keys: impl Iterator, + values: &dyn Array, + entry_offsets: &[u32], + ) -> Result { + let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); + let keys_data = StringArray::from_iter_values(keys); + + let keys_field = Arc::new(Field::new("keys", ArrowDataType::Utf8, false)); + let values_field = Arc::new(Field::new( + "values", + values.data_type().clone(), + values.null_count() > 0, + )); + + let entry_struct = StructArray::from(vec![ + (keys_field, Arc::new(keys_data) as ArrayRef), + (values_field, make_array(values.to_data())), + ]); + + let map_data_type = ArrowDataType::Map( + Arc::new(Field::new( + "entries", + entry_struct.data_type().clone(), + false, + )), + false, + ); + + let map_data = ArrayData::builder(map_data_type) + .len(entry_offsets.len() - 1) + .add_buffer(entry_offsets_buffer) + .add_child_data(entry_struct.into_data()) + .build()?; + + Ok(MapArray::from(map_data)) + } + + let map_array = new_from_strings( + keys.into_iter(), + &arrow::array::BinaryArray::from(values), + entry_offsets.as_slice(), + ) + .expect("Could not create a map array"); + + let schema = + >::try_from(&StructType::new(vec![ + StructField::new( + "example".to_string(), + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::String), + DataType::Primitive(PrimitiveType::Binary), + false, + ))), + false, + ), + ])) + .expect("Could not get schema"); + + let record_batch = + arrow::record_batch::RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) + .expect("Failed to create RecordBatch"); + + assert_eq!(record_batch.num_columns(), 1); + assert_eq!(record_batch.num_rows(), num_rows); + } + + #[test] + fn test_max_min_schema_for_fields() { + let mut max_min_vec: Vec = Vec::new(); + let fields = [ + ArrowField::new("simple", ArrowDataType::Int32, true), + ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ArrowField::new("simple", ArrowDataType::Int32, true)].into(), + ), + true, + ), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "simple", + ArrowDataType::Int32, + true, + ))), + true, + ), + ArrowField::new( + "map", + ArrowDataType::Map( + Arc::new(ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int32, true), + ArrowField::new("value", ArrowDataType::Int32, true), + ] + .into(), + ), + true, + )), + true, + ), + true, + ), + ArrowField::new("binary", ArrowDataType::Binary, true), + ]; + + let expected = vec![fields[0].clone(), fields[1].clone()]; + + fields + .iter() + .for_each(|f| max_min_schema_for_fields(&mut max_min_vec, f)); + + assert_eq!(max_min_vec, expected); + } + + #[test] + fn test_null_count_schema_for_fields() { + let mut null_count_vec: Vec = Vec::new(); + let fields = [ + ArrowField::new("int32", ArrowDataType::Int32, true), + ArrowField::new("int64", ArrowDataType::Int64, true), + ArrowField::new("Utf8", ArrowDataType::Utf8, true), + ArrowField::new( + "list", + ArrowDataType::List(Arc::new(ArrowField::new( + "simple", + ArrowDataType::Int32, + true, + ))), + true, + ), + ArrowField::new( + "map", + ArrowDataType::Map( + Arc::new(ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int32, true), + ArrowField::new("value", ArrowDataType::Int32, true), + ] + .into(), + ), + true, + )), + true, + ), + true, + ), + ArrowField::new( + "struct", + ArrowDataType::Struct( + vec![ArrowField::new("int32", ArrowDataType::Int32, true)].into(), + ), + true, + ), + ]; + let 
expected = vec![ + ArrowField::new(fields[0].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[1].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[2].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[3].name(), ArrowDataType::Int64, true), + ArrowField::new(fields[4].name(), ArrowDataType::Int64, true), + ArrowField::new( + fields[5].name(), + ArrowDataType::Struct( + vec![ArrowField::new("int32", ArrowDataType::Int64, true)].into(), + ), + true, + ), + ]; + fields + .iter() + .for_each(|f| null_count_schema_for_fields(&mut null_count_vec, f)); + assert_eq!(null_count_vec, expected); + } + + /* + * This test validates the trait implementation of + * TryFrom<&Arc> for schema::SchemaField which is required with Arrow 37 since + * iterators on Fields will give an &Arc + */ + #[test] + fn tryfrom_arrowfieldref_with_structs() { + let field = Arc::new(ArrowField::new( + "test_struct", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::Int32, true), + ArrowField::new("value", ArrowDataType::Int32, true), + ] + .into(), + ), + true, + )); + let _converted: StructField = field.as_ref().try_into().unwrap(); + } +} diff --git a/crates/deltalake-core/src/kernel/actions/checkpoint.rs b/crates/deltalake-core/src/kernel/actions/checkpoint.rs new file mode 100644 index 0000000000..59960f66b8 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/checkpoint.rs @@ -0,0 +1,589 @@ +use std::collections::HashMap; +use std::str::FromStr; + +use arrow_array::{ + BooleanArray, Int32Array, Int64Array, ListArray, MapArray, RecordBatch, StringArray, + StructArray, +}; +use either::Either; +use fix_hidden_lifetime_bug::fix_hidden_lifetime_bug; +use itertools::izip; +use serde::{Deserialize, Serialize}; + +use super::{error::Error, DeltaResult}; + +#[fix_hidden_lifetime_bug] +#[allow(dead_code)] +pub(crate) fn parse_actions<'a>( + batch: &RecordBatch, + types: impl IntoIterator, +) -> DeltaResult> { + Ok(types + .into_iter() + .filter_map(|action| parse_action(batch, action).ok()) + .flatten()) +} + +#[fix_hidden_lifetime_bug] +pub(crate) fn parse_action( + batch: &RecordBatch, + action_type: &ActionType, +) -> DeltaResult> { + let column_name = match action_type { + ActionType::Metadata => "metaData", + ActionType::Protocol => "protocol", + ActionType::Add => "add", + ActionType::Remove => "remove", + _ => unimplemented!(), + }; + + let arr = batch + .column_by_name(column_name) + .ok_or(Error::MissingColumn(column_name.into()))? 
+ .as_any() + .downcast_ref::<StructArray>() + .ok_or(Error::UnexpectedColumnType( + "Cannot downcast to StructArray".into(), + ))?; + + match action_type { + ActionType::Metadata => parse_action_metadata(arr), + ActionType::Protocol => parse_action_protocol(arr), + ActionType::Add => parse_actions_add(arr), + ActionType::Remove => parse_actions_remove(arr), + _ => todo!(), + } +} + +fn parse_action_metadata(arr: &StructArray) -> DeltaResult<Box<dyn Iterator<Item = Action>>> { + let ids = cast_struct_column::<StringArray>(arr, "id")?; + let schema_strings = cast_struct_column::<StringArray>(arr, "schemaString")?; + let metadata = ids + .into_iter() + .zip(schema_strings) + .filter_map(|(maybe_id, maybe_schema_string)| { + if let (Some(id), Some(schema_string)) = (maybe_id, maybe_schema_string) { + Some(Metadata::new( + id, + Format { + provider: "parquet".into(), + options: Default::default(), + }, + schema_string, + Vec::<String>::new(), + None, + )) + } else { + None + } + }) + .next(); + + if metadata.is_none() { + return Ok(Box::new(std::iter::empty())); + } + let mut metadata = metadata.unwrap(); + + metadata.partition_columns = cast_struct_column::<ListArray>(arr, "partitionColumns") + .ok() + .map(|arr| { + arr.iter() + .filter_map(|it| { + if let Some(features) = it { + let vals = features + .as_any() + .downcast_ref::<StringArray>()? + .iter() + .filter_map(|v| v.map(|inner| inner.to_owned())) + .collect::<Vec<_>>(); + Some(vals) + } else { + None + } + }) + .flatten() + .collect::<Vec<_>>() + }) + .unwrap_or_default(); + + metadata.name = cast_struct_column::<StringArray>(arr, "name") + .ok() + .and_then(|arr| { + arr.iter() + .flat_map(|maybe| maybe.map(|v| v.to_string())) + .next() + }); + metadata.description = cast_struct_column::<StringArray>(arr, "description") + .ok() + .and_then(|arr| { + arr.iter() + .flat_map(|maybe| maybe.map(|v| v.to_string())) + .next() + }); + metadata.created_time = cast_struct_column::<Int64Array>(arr, "createdTime") + .ok() + .and_then(|arr| arr.iter().flatten().next()); + + if let Ok(config) = cast_struct_column::<MapArray>(arr, "configuration") { + let keys = config + .keys() + .as_any() + .downcast_ref::<StringArray>() + .ok_or(Error::MissingData("expected key column in map".into()))?; + let values = config + .values() + .as_any() + .downcast_ref::<StringArray>() + .ok_or(Error::MissingData("expected value column in map".into()))?; + metadata.configuration = keys + .into_iter() + .zip(values) + .filter_map(|(k, v)| k.map(|key| (key.to_string(), v.map(|vv| vv.to_string())))) + .collect::<HashMap<_, _>>(); + }; + + Ok(Box::new(std::iter::once(Action::Metadata(metadata)))) +} + +fn parse_action_protocol(arr: &StructArray) -> DeltaResult<Box<dyn Iterator<Item = Action>>> { + let min_reader = cast_struct_column::<Int32Array>(arr, "minReaderVersion")?; + let min_writer = cast_struct_column::<Int32Array>(arr, "minWriterVersion")?; + let protocol = min_reader + .into_iter() + .zip(min_writer) + .filter_map(|(r, w)| { + if let (Some(min_reader_version), Some(min_writer_version)) = (r, w) { + Some(Protocol::new(min_reader_version, min_writer_version)) + } else { + None + } + }) + .next(); + + if protocol.is_none() { + return Ok(Box::new(std::iter::empty())); + } + let mut protocol = protocol.unwrap(); + + protocol.reader_features = cast_struct_column::<ListArray>(arr, "readerFeatures") + .ok() + .map(|arr| { + arr.iter() + .filter_map(|it| { + if let Some(features) = it { + let vals = features + .as_any() + .downcast_ref::<StringArray>()?
+ .iter() + .filter_map(|v| v.map(|inner| inner.to_owned())) + .collect::>(); + Some(vals) + } else { + None + } + }) + .flatten() + .collect::>() + }); + + protocol.writer_features = cast_struct_column::(arr, "writerFeatures") + .ok() + .map(|arr| { + arr.iter() + .filter_map(|it| { + if let Some(features) = it { + let vals = features + .as_any() + .downcast_ref::()? + .iter() + .filter_map(|v| v.map(|inner| inner.to_string())) + .collect::>(); + Some(vals) + } else { + None + } + }) + .flatten() + .collect::>() + }); + + Ok(Box::new(std::iter::once(Action::Protocol(protocol)))) +} + +fn parse_actions_add(arr: &StructArray) -> DeltaResult + '_>> { + let paths = cast_struct_column::(arr, "path")?; + let sizes = cast_struct_column::(arr, "size")?; + let modification_times = cast_struct_column::(arr, "modificationTime")?; + let data_changes = cast_struct_column::(arr, "dataChange")?; + let partition_values = cast_struct_column::(arr, "partitionValues")? + .iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())); + + let tags = if let Ok(stats) = cast_struct_column::(arr, "tags") { + Either::Left( + stats + .iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())), + ) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let stats = if let Ok(stats) = cast_struct_column::(arr, "stats") { + Either::Left(stats.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let base_row_ids = if let Ok(row_ids) = cast_struct_column::(arr, "baseRowId") { + Either::Left(row_ids.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let commit_versions = + if let Ok(versions) = cast_struct_column::(arr, "defaultRowCommitVersion") { + Either::Left(versions.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let deletion_vectors = if let Ok(dvs) = cast_struct_column::(arr, "deletionVector") + { + Either::Left(parse_dv(dvs)?) 
+ } else { + Either::Right(std::iter::repeat(None).take(sizes.len())) + }; + + let zipped = izip!( + paths, + sizes, + modification_times, + data_changes, + partition_values, + stats, + tags, + base_row_ids, + commit_versions, + deletion_vectors, + ); + let zipped = zipped.map( + |( + maybe_paths, + maybe_size, + maybe_modification_time, + maybe_data_change, + partition_values, + stat, + tags, + base_row_id, + default_row_commit_version, + deletion_vector, + )| { + if let (Some(path), Some(size), Some(modification_time), Some(data_change)) = ( + maybe_paths, + maybe_size, + maybe_modification_time, + maybe_data_change, + ) { + Some(Add { + path: path.into(), + size, + modification_time, + data_change, + partition_values: partition_values.unwrap_or_default(), + stats: stat.map(|v| v.to_string()), + tags, + base_row_id, + default_row_commit_version, + deletion_vector, + stats_parsed: None, + partition_values_parsed: None, + }) + } else { + None + } + }, + ); + + Ok(Box::new(zipped.flatten().map(Action::Add))) +} + +fn parse_actions_remove(arr: &StructArray) -> DeltaResult + '_>> { + let paths = cast_struct_column::(arr, "path")?; + let data_changes = cast_struct_column::(arr, "dataChange")?; + + let deletion_timestamps = + if let Ok(ts) = cast_struct_column::(arr, "deletionTimestamp") { + Either::Left(ts.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let extended_file_metadata = + if let Ok(metas) = cast_struct_column::(arr, "extendedFileMetadata") { + Either::Left(metas.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let partition_values = + if let Ok(values) = cast_struct_column::(arr, "partitionValues") { + Either::Left( + values + .iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())), + ) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let sizes = if let Ok(size) = cast_struct_column::(arr, "size") { + Either::Left(size.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let tags = if let Ok(tags) = cast_struct_column::(arr, "tags") { + Either::Left( + tags.iter() + .map(|data| data.map(|d| struct_array_to_map(&d).unwrap())), + ) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let deletion_vectors = if let Ok(dvs) = cast_struct_column::(arr, "deletionVector") + { + Either::Left(parse_dv(dvs)?) 
+ } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let base_row_ids = if let Ok(row_ids) = cast_struct_column::(arr, "baseRowId") { + Either::Left(row_ids.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let commit_versions = + if let Ok(row_ids) = cast_struct_column::(arr, "defaultRowCommitVersion") { + Either::Left(row_ids.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(data_changes.len())) + }; + + let zipped = izip!( + paths, + data_changes, + deletion_timestamps, + extended_file_metadata, + partition_values, + sizes, + tags, + deletion_vectors, + base_row_ids, + commit_versions, + ); + + let zipped = zipped.map( + |( + maybe_paths, + maybe_data_change, + deletion_timestamp, + extended_file_metadata, + partition_values, + size, + tags, + deletion_vector, + base_row_id, + default_row_commit_version, + )| { + if let (Some(path), Some(data_change)) = (maybe_paths, maybe_data_change) { + Some(Remove { + path: path.into(), + data_change, + deletion_timestamp, + extended_file_metadata, + partition_values, + size, + tags, + deletion_vector, + base_row_id, + default_row_commit_version, + }) + } else { + None + } + }, + ); + + Ok(Box::new(zipped.flatten().map(Action::Remove))) +} + +fn parse_dv( + arr: &StructArray, +) -> DeltaResult> + '_> { + let storage_types = cast_struct_column::(arr, "storageType")?; + let paths_or_inlines = cast_struct_column::(arr, "pathOrInlineDv")?; + let sizes_in_bytes = cast_struct_column::(arr, "sizeInBytes")?; + let cardinalities = cast_struct_column::(arr, "cardinality")?; + + let offsets = if let Ok(offsets) = cast_struct_column::(arr, "offset") { + Either::Left(offsets.into_iter()) + } else { + Either::Right(std::iter::repeat(None).take(cardinalities.len())) + }; + + let zipped = izip!( + storage_types, + paths_or_inlines, + sizes_in_bytes, + cardinalities, + offsets, + ); + + Ok(zipped.map( + |(maybe_type, maybe_path_or_inline_dv, maybe_size_in_bytes, maybe_cardinality, offset)| { + if let ( + Some(storage_type), + Some(path_or_inline_dv), + Some(size_in_bytes), + Some(cardinality), + ) = ( + maybe_type, + maybe_path_or_inline_dv, + maybe_size_in_bytes, + maybe_cardinality, + ) { + Some(DeletionVectorDescriptor { + storage_type: StorageType::from_str(storage_type).unwrap(), + path_or_inline_dv: path_or_inline_dv.into(), + size_in_bytes, + cardinality, + offset, + }) + } else { + None + } + }, + )) +} + +fn cast_struct_column(arr: &StructArray, name: impl AsRef) -> DeltaResult<&T> { + arr.column_by_name(name.as_ref()) + .ok_or(Error::MissingColumn(name.as_ref().into()))? 
+ .as_any() + .downcast_ref::() + .ok_or(Error::UnexpectedColumnType( + "Cannot downcast to expected type".into(), + )) +} + +fn struct_array_to_map(arr: &StructArray) -> DeltaResult>> { + let keys = cast_struct_column::(arr, "key")?; + let values = cast_struct_column::(arr, "value")?; + Ok(keys + .into_iter() + .zip(values) + .filter_map(|(k, v)| k.map(|key| (key.to_string(), v.map(|vv| vv.to_string())))) + .collect()) +} + +#[cfg(all(test, feature = "default-client"))] +mod tests { + use std::sync::Arc; + + use object_store::local::LocalFileSystem; + + use super::*; + use crate::actions::Protocol; + use crate::client::json::DefaultJsonHandler; + use crate::executor::tokio::TokioBackgroundExecutor; + use crate::JsonHandler; + + fn action_batch() -> RecordBatch { + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + + let json_strings: StringArray = vec![ + r#"{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}}"#, + r#"{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}}"#, + r#"{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}}"#, + r#"{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}}"#, + ] + .into(); + let output_schema = Arc::new(get_log_schema()); + handler.parse_json(json_strings, output_schema).unwrap() + } + + #[test] + fn test_parse_protocol() { + let batch = action_batch(); + let action = parse_action(&batch, &ActionType::Protocol) + .unwrap() + .collect::>(); + let expected = Action::Protocol(Protocol { + min_reader_version: 3, + min_writer_version: 7, + reader_features: Some(vec!["deletionVectors".into()]), + writer_features: Some(vec!["deletionVectors".into()]), + }); + assert_eq!(action[0], expected) + } + + #[test] + fn test_parse_metadata() { + let batch = action_batch(); + let action = parse_action(&batch, &ActionType::Metadata) + .unwrap() + .collect::>(); + let configuration = HashMap::from_iter([ + ( + "delta.enableDeletionVectors".to_string(), + Some("true".to_string()), + ), + ( + "delta.columnMapping.mode".to_string(), + Some("none".to_string()), + ), + ]); + let expected = Action::Metadata(Metadata { + id: "testId".into(), + name: None, + description: None, + format: Format { + provider: "parquet".into(), + options: Default::default(), + }, + schema_string: r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#.to_string(), + partition_columns: Vec::new(), + created_time: 
Some(1677811175819), + configuration, + }); + assert_eq!(action[0], expected) + } + + #[test] + fn test_parse_add_partitioned() { + let store = Arc::new(LocalFileSystem::new()); + let handler = DefaultJsonHandler::new(store, Arc::new(TokioBackgroundExecutor::new())); + + let json_strings: StringArray = vec![ + r#"{"commitInfo":{"timestamp":1670892998177,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[\"c1\",\"c2\"]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"3","numOutputRows":"3","numOutputBytes":"1356"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.2.0","txnId":"046a258f-45e3-4657-b0bf-abfb0f76681c"}}"#, + r#"{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}"#, + r#"{"metaData":{"id":"aff5cb91-8cd9-4195-aef9-446908507302","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"c1\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c2\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"c3\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["c1","c2"],"configuration":{},"createdTime":1670892997849}}"#, + r#"{"add":{"path":"c1=4/c2=c/part-00003-f525f459-34f9-46f5-82d6-d42121d883fd.c000.snappy.parquet","partitionValues":{"c1":"4","c2":"c"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":5},\"maxValues\":{\"c3\":5},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"c1=5/c2=b/part-00007-4e73fa3b-2c88-424a-8051-f8b54328ffdb.c000.snappy.parquet","partitionValues":{"c1":"5","c2":"b"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":6},\"maxValues\":{\"c3\":6},\"nullCount\":{\"c3\":0}}"}}"#, + r#"{"add":{"path":"c1=6/c2=a/part-00011-10619b10-b691-4fd0-acc4-2a9608499d7c.c000.snappy.parquet","partitionValues":{"c1":"6","c2":"a"},"size":452,"modificationTime":1670892998135,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"c3\":4},\"maxValues\":{\"c3\":4},\"nullCount\":{\"c3\":0}}"}}"#, + ] + .into(); + let output_schema = Arc::new(get_log_schema()); + let batch = handler.parse_json(json_strings, output_schema).unwrap(); + + let actions = parse_action(&batch, &ActionType::Add) + .unwrap() + .collect::>(); + println!("{:?}", actions) + } +} diff --git a/crates/deltalake-core/src/kernel/actions/mod.rs b/crates/deltalake-core/src/kernel/actions/mod.rs new file mode 100644 index 0000000000..865c9d3cd9 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/mod.rs @@ -0,0 +1,64 @@ +//! Actions are the fundamental unit of work in Delta Lake. Each action performs a single atomic +//! operation on the state of a Delta table. Actions are stored in the `_delta_log` directory of a +//! Delta table in JSON format. The log is a time series of actions that represent all the changes +//! made to a table. 
+ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +#[cfg(all(feature = "arrow", feature = "parquet"))] +pub(crate) mod arrow; +// pub(crate) mod schemas; +mod serde_path; +pub(crate) mod types; + +pub use types::*; + +#[derive(Debug)] +/// The type of action that was performed on the table +pub enum ActionType { + /// modify the data in a table by adding individual logical files + Add, + /// add a file containing only the data that was changed as part of the transaction + Cdc, + /// additional provenance information about what higher-level operation was being performed + CommitInfo, + /// contains a configuration (string-string map) for a named metadata domain + DomainMetadata, + /// changes the current metadata of the table + Metadata, + /// increase the version of the Delta protocol that is required to read or write a given table + Protocol, + /// modify the data in a table by removing individual logical files + Remove, + /// The Row ID high-water mark tracks the largest ID that has been assigned to a row in the table. + RowIdHighWaterMark, + /// Transactional information + Txn, +} + +#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +#[allow(missing_docs)] +pub enum Action { + #[serde(rename = "metaData")] + Metadata(Metadata), + Protocol(Protocol), + Add(Add), + Remove(Remove), + Cdc(AddCDCFile), + Txn(Txn), + CommitInfo(CommitInfo), + DomainMetadata(DomainMetadata), +} + +impl Action { + /// Create a commit info from a map + pub fn commit_info(info: HashMap) -> Self { + Self::CommitInfo(CommitInfo { + info, + ..Default::default() + }) + } +} diff --git a/crates/deltalake-core/src/kernel/actions/schemas.rs b/crates/deltalake-core/src/kernel/actions/schemas.rs new file mode 100644 index 0000000000..0cc870318f --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/schemas.rs @@ -0,0 +1,255 @@ +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Fields, Schema}; + +use super::ActionType; + +impl ActionType { + /// Returns the root field for the action type + pub fn field(&self) -> Field { + match self { + Self::Add => get_root("add", self.fields()), + Self::Cdc => get_root("cdc", self.fields()), + Self::CommitInfo => get_root("commitInfo", self.fields()), + Self::DomainMetadata => get_root("domainMetadata", self.fields()), + Self::Metadata => get_root("metaData", self.fields()), + Self::Protocol => get_root("protocol", self.fields()), + Self::Remove => get_root("remove", self.fields()), + Self::RowIdHighWaterMark => get_root("rowIdHighWaterMark", self.fields()), + Self::Txn => get_root("txn", self.fields()), + } + } + + /// Returns the child fields for the action type + pub fn fields(&self) -> Vec { + match self { + Self::Add => add_fields(), + Self::Cdc => cdc_fields(), + Self::CommitInfo => commit_info_fields(), + Self::DomainMetadata => domain_metadata_fields(), + Self::Metadata => metadata_fields(), + Self::Protocol => protocol_fields(), + Self::Remove => remove_fields(), + Self::RowIdHighWaterMark => watermark_fields(), + Self::Txn => txn_fields(), + } + } +} + +/// Returns the schema for the delta log +pub fn get_log_schema() -> Schema { + Schema { + fields: Fields::from_iter([ + ActionType::Add.field(), + ActionType::Cdc.field(), + ActionType::CommitInfo.field(), + ActionType::DomainMetadata.field(), + ActionType::Metadata.field(), + ActionType::Protocol.field(), + ActionType::Remove.field(), + ActionType::RowIdHighWaterMark.field(), + ActionType::Txn.field(), + ]), + metadata: 
Default::default(), + } +} + +fn get_root(name: &str, fields: Vec) -> Field { + Field::new(name, DataType::Struct(Fields::from_iter(fields)), true) +} + +fn add_fields() -> Vec { + Vec::from_iter([ + Field::new("path", DataType::Utf8, false), + Field::new("size", DataType::Int64, false), + Field::new("modificationTime", DataType::Int64, false), + Field::new("dataChange", DataType::Boolean, false), + Field::new("stats", DataType::Utf8, true), + Field::new( + "partitionValues", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "tags", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "deletionVector", + DataType::Struct(Fields::from(vec![ + Field::new("storageType", DataType::Utf8, false), + Field::new("pathOrInlineDv", DataType::Utf8, false), + Field::new("offset", DataType::Int32, true), + Field::new("sizeInBytes", DataType::Int32, false), + Field::new("cardinality", DataType::Int64, false), + ])), + true, + ), + Field::new("baseRowId", DataType::Int64, true), + Field::new("defaultRowCommitVersion", DataType::Int64, true), + ]) +} + +fn cdc_fields() -> Vec { + Vec::from_iter([ + Field::new("path", DataType::Utf8, true), + Field::new( + "partitionValues", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new("size", DataType::Int64, true), + Field::new("dataChange", DataType::Boolean, true), + Field::new( + "tags", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + ]) +} + +fn remove_fields() -> Vec { + Vec::from_iter([ + Field::new("path", DataType::Utf8, true), + Field::new("deletionTimestamp", DataType::Int64, true), + Field::new("dataChange", DataType::Boolean, true), + Field::new("extendedFileMetadata", DataType::Boolean, true), + Field::new("size", DataType::Int64, true), + Field::new( + "partitionValues", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "tags", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + ]) +} + +fn metadata_fields() -> Vec { + Vec::from_iter([ + Field::new("id", DataType::Utf8, false), + Field::new("name", DataType::Utf8, true), + Field::new("description", DataType::Utf8, true), + Field::new( + "format", + DataType::Struct(Fields::from_iter([ + Field::new("provider", DataType::Utf8, true), + Field::new( + "options", + DataType::Map( + Arc::new(Field::new( + "key_value", + DataType::Struct(Fields::from_iter([ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ])), + false, + )), + false, + ), + false, + ), + ])), + false, + ), + Field::new("schemaString", DataType::Utf8, false), + Field::new("createdTime", DataType::Int64, true), + Field::new( + "partitionColumns", + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + false, + ), + Field::new( + "configuration", + DataType::Map( + Arc::new(Field::new( + "key_value", + DataType::Struct(Fields::from_iter([ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ])), + false, + )), + false, + ), + true, + ), + ]) +} + +fn protocol_fields() -> Vec { + Vec::from_iter([ + Field::new("minReaderVersion", DataType::Int32, false), + Field::new("minWriterVersion", DataType::Int32, false), + Field::new( + "readerFeatures", + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + true, + ), + Field::new( + "writerFeatures", + DataType::List(Arc::new(Field::new("element", DataType::Utf8, false))), + true, + ), + ]) +} + +fn txn_fields() -> Vec { + Vec::from_iter([ + 
Field::new("appId", DataType::Utf8, true), + Field::new("version", DataType::Int64, true), + Field::new("lastUpdated", DataType::Int64, true), + ]) +} + +fn watermark_fields() -> Vec { + Vec::from_iter([Field::new("highWaterMark", DataType::Int64, true)]) +} + +fn commit_info_fields() -> Vec { + Vec::from_iter([ + Field::new("timestamp", DataType::Int64, true), + Field::new("operation", DataType::Utf8, true), + Field::new("isolationLevel", DataType::Utf8, true), + Field::new("isBlindAppend", DataType::Boolean, true), + Field::new("txnId", DataType::Utf8, true), + Field::new("readVersion", DataType::Int32, true), + Field::new( + "operationParameters", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new( + "operationMetrics", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + ]) +} + +fn domain_metadata_fields() -> Vec { + Vec::from_iter([ + Field::new("domain", DataType::Utf8, true), + Field::new( + "configuration", + DataType::Map(Arc::new(get_map_field()), false), + true, + ), + Field::new("removed", DataType::Boolean, true), + ]) +} + +fn get_map_field() -> Field { + Field::new( + "key_value", + DataType::Struct(Fields::from_iter([ + Field::new("key", DataType::Utf8, false), + Field::new("value", DataType::Utf8, true), + ])), + false, + ) +} diff --git a/crates/deltalake-core/src/kernel/actions/serde_path.rs b/crates/deltalake-core/src/kernel/actions/serde_path.rs new file mode 100644 index 0000000000..9868523e81 --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/serde_path.rs @@ -0,0 +1,89 @@ +use std::str::Utf8Error; + +use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS}; +use serde::{self, Deserialize, Deserializer, Serialize, Serializer}; + +pub fn deserialize<'de, D>(deserializer: D) -> Result +where + D: Deserializer<'de>, +{ + let s = String::deserialize(deserializer)?; + decode_path(&s).map_err(serde::de::Error::custom) +} + +pub fn serialize(value: &str, serializer: S) -> Result +where + S: Serializer, +{ + let encoded = encode_path(value); + String::serialize(&encoded, serializer) +} + +pub const _DELIMITER: &str = "/"; +/// The path delimiter as a single byte +pub const _DELIMITER_BYTE: u8 = _DELIMITER.as_bytes()[0]; + +/// Characters we want to encode. 
+const INVALID: &AsciiSet = &CONTROLS + // The delimiter we are reserving for internal hierarchy + // .add(DELIMITER_BYTE) + // Characters AWS recommends avoiding for object keys + // https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingMetadata.html + .add(b'\\') + .add(b'{') + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') + .add(b'>') + .add(b'[') + // .add(b'~') + .add(b'<') + .add(b'#') + .add(b'|') + // Characters Google Cloud Storage recommends avoiding for object names + // https://cloud.google.com/storage/docs/naming-objects + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +fn encode_path(path: &str) -> String { + percent_encode(path.as_bytes(), INVALID).to_string() +} + +fn decode_path(path: &str) -> Result { + Ok(percent_decode_str(path).decode_utf8()?.to_string()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_encode_path() { + let cases = [ + ( + "string=$%25&%2F()%3D%5E%22%5B%5D%23%2A%3F.%3A/part-00023-4b06bc90-0678-4a63-94a2-f09af1adb945.c000.snappy.parquet", + "string=$%2525&%252F()%253D%255E%2522%255B%255D%2523%252A%253F.%253A/part-00023-4b06bc90-0678-4a63-94a2-f09af1adb945.c000.snappy.parquet", + ), + ( + "string=$%25&%2F()%3D%5E%22<>~%5B%5D%7B}`%23|%2A%3F%2F%5Cr%5Cn.%3A/part-00023-e0a68495-8098-40a6-be5f-b502b111b789.c000.snappy.parquet", + "string=$%2525&%252F()%253D%255E%2522%3C%3E~%255B%255D%257B%7D%60%2523%7C%252A%253F%252F%255Cr%255Cn.%253A/part-00023-e0a68495-8098-40a6-be5f-b502b111b789.c000.snappy.parquet" + ), + ( + "string=$%25&%2F()%3D%5E%22<>~%5B%5D%7B}`%23|%2A%3F%2F%5Cr%5Cn.%3A_-/part-00023-346b6795-dafa-4948-bda5-ecdf4baa4445.c000.snappy.parquet", + "string=$%2525&%252F()%253D%255E%2522%3C%3E~%255B%255D%257B%7D%60%2523%7C%252A%253F%252F%255Cr%255Cn.%253A_-/part-00023-346b6795-dafa-4948-bda5-ecdf4baa4445.c000.snappy.parquet" + ) + ]; + + for (raw, expected) in cases { + let encoded = encode_path(raw); + assert_eq!(encoded, expected); + let decoded = decode_path(expected).unwrap(); + assert_eq!(decoded, raw); + } + } +} diff --git a/crates/deltalake-core/src/kernel/actions/types.rs b/crates/deltalake-core/src/kernel/actions/types.rs new file mode 100644 index 0000000000..166dbc98ef --- /dev/null +++ b/crates/deltalake-core/src/kernel/actions/types.rs @@ -0,0 +1,900 @@ +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +// use std::io::{Cursor, Read}; +// use std::sync::Arc; + +// use roaring::RoaringTreemap; +use log::warn; +use serde::{Deserialize, Serialize}; +use url::Url; + +use super::super::schema::StructType; +use super::super::{error::Error, DeltaResult}; +use super::serde_path; + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +/// Defines a file format used in table +pub struct Format { + /// Name of the encoding for files in this table + pub provider: String, + /// A map containing configuration options for the format + pub options: HashMap>, +} + +impl Format { + /// Allows creation of a new action::Format + pub fn new(provider: String, options: Option>>) -> Self { + let options = options.unwrap_or_default(); + Self { provider, options } + } + + /// Return the Format provider + pub fn get_provider(self) -> String { + self.provider + } +} + +impl Default for Format { + fn default() -> Self { + Self { + provider: String::from("parquet"), + options: HashMap::new(), + } + } +} + +/// Return a default empty schema to be used for edge-cases when a schema is missing +fn default_schema() -> String { + warn!("A `metaData` action was missing a `schemaString` and has 
been given an empty schema"); + r#"{"type":"struct", "fields": []}"#.into() +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// Defines a metadata action +pub struct Metadata { + /// Unique identifier for this table + pub id: String, + /// User-provided identifier for this table + pub name: Option, + /// User-provided description for this table + pub description: Option, + /// Specification of the encoding for the files stored in the table + pub format: Format, + /// Schema of the table + #[serde(default = "default_schema")] + pub schema_string: String, + /// Column names by which the data should be partitioned + pub partition_columns: Vec, + /// The time when this metadata action is created, in milliseconds since the Unix epoch + pub created_time: Option, + /// Configuration options for the metadata action + pub configuration: HashMap>, +} + +impl Metadata { + /// Create a new metadata action + pub fn new( + id: impl Into, + format: Format, + schema_string: impl Into, + partition_columns: impl IntoIterator>, + configuration: Option>>, + ) -> Self { + Self { + id: id.into(), + format, + schema_string: schema_string.into(), + partition_columns: partition_columns.into_iter().map(|c| c.into()).collect(), + configuration: configuration.unwrap_or_default(), + name: None, + description: None, + created_time: None, + } + } + + /// set the table name in the metadata action + pub fn with_name(mut self, name: impl Into) -> Self { + self.name = Some(name.into()); + self + } + + /// set the table description in the metadata action + pub fn with_description(mut self, description: impl Into) -> Self { + self.description = Some(description.into()); + self + } + + /// set the table creation time in the metadata action + pub fn with_created_time(mut self, created_time: i64) -> Self { + self.created_time = Some(created_time); + self + } + + /// get the table schema + pub fn schema(&self) -> DeltaResult { + Ok(serde_json::from_str(&self.schema_string)?) 
+ } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// Defines a protocol action +pub struct Protocol { + /// The minimum version of the Delta read protocol that a client must implement + /// in order to correctly read this table + pub min_reader_version: i32, + /// The minimum version of the Delta write protocol that a client must implement + /// in order to correctly write this table + pub min_writer_version: i32, + /// A collection of features that a client must implement in order to correctly + /// read this table (exist only when minReaderVersion is set to 3) + pub reader_features: Option>, + /// A collection of features that a client must implement in order to correctly + /// write this table (exist only when minWriterVersion is set to 7) + pub writer_features: Option>, +} + +impl Protocol { + /// Create a new protocol action + pub fn new(min_reader_version: i32, min_wrriter_version: i32) -> Self { + Self { + min_reader_version, + min_writer_version: min_wrriter_version, + reader_features: None, + writer_features: None, + } + } + + /// set the reader features in the protocol action + pub fn with_reader_features( + mut self, + reader_features: impl IntoIterator>, + ) -> Self { + self.reader_features = Some(reader_features.into_iter().map(|c| c.into()).collect()); + self + } + + /// set the writer features in the protocol action + pub fn with_writer_features( + mut self, + writer_features: impl IntoIterator>, + ) -> Self { + self.writer_features = Some(writer_features.into_iter().map(|c| c.into()).collect()); + self + } +} + +/// Features table readers can support as well as let users know +/// what is supported +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] +#[serde(rename_all = "camelCase")] +pub enum ReaderFeatures { + /// Mapping of one column to another + ColumnMapping, + /// Deletion vectors for merge, update, delete + DeleteionVecotrs, + /// timestamps without timezone support + #[serde(alias = "timestampNtz")] + TimestampWithoutTimezone, + /// version 2 of checkpointing + V2Checkpoint, + /// If we do not match any other reader features + #[serde(untagged)] + Other(String), +} + +#[allow(clippy::from_over_into)] +impl Into for ReaderFeatures { + fn into(self) -> usize { + match self { + ReaderFeatures::Other(_) => 0, + ReaderFeatures::ColumnMapping => 2, + ReaderFeatures::DeleteionVecotrs + | ReaderFeatures::TimestampWithoutTimezone + | ReaderFeatures::V2Checkpoint => 3, + } + } +} + +#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] +impl From<&parquet::record::Field> for ReaderFeatures { + fn from(value: &parquet::record::Field) -> Self { + match value { + parquet::record::Field::Str(feature) => match feature.as_str() { + "columnMapping" => ReaderFeatures::ColumnMapping, + "deletionVectors" => ReaderFeatures::DeleteionVecotrs, + "timestampNtz" => ReaderFeatures::TimestampWithoutTimezone, + "v2Checkpoint" => ReaderFeatures::V2Checkpoint, + f => ReaderFeatures::Other(f.to_string()), + }, + f => ReaderFeatures::Other(f.to_string()), + } + } +} + +impl From for ReaderFeatures { + fn from(value: String) -> Self { + match value.as_str() { + "columnMapping" => ReaderFeatures::ColumnMapping, + "deletionVectors" => ReaderFeatures::DeleteionVecotrs, + "timestampNtz" => ReaderFeatures::TimestampWithoutTimezone, + "v2Checkpoint" => ReaderFeatures::V2Checkpoint, + f => ReaderFeatures::Other(f.to_string()), + } + } +} + +/// Features table writers can support as well as let users know +/// 
what is supported +#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] +#[serde(rename_all = "camelCase")] +pub enum WriterFeatures { + /// Append Only Tables + AppendOnly, + /// Table invariants + Invariants, + /// Check constraints on columns + CheckConstraints, + /// CDF on a table + ChangeDataFeed, + /// Columns with generated values + GeneratedColumns, + /// Mapping of one column to another + ColumnMapping, + /// ID Columns + IdentityColumns, + /// Deletion vectors for merge, update, delete + DeleteionVecotrs, + /// Row tracking on tables + RowTracking, + /// timestamps without timezone support + #[serde(alias = "timestampNtz")] + TimestampWithoutTimezone, + /// domain specific metadata + DomainMetadata, + /// version 2 of checkpointing + V2Checkpoint, + /// Iceberg compatability support + IcebergCompatV1, + /// If we do not match any other reader features + #[serde(untagged)] + Other(String), +} + +#[allow(clippy::from_over_into)] +impl Into for WriterFeatures { + fn into(self) -> usize { + match self { + WriterFeatures::Other(_) => 0, + WriterFeatures::AppendOnly | WriterFeatures::Invariants => 2, + WriterFeatures::CheckConstraints => 3, + WriterFeatures::ChangeDataFeed | WriterFeatures::GeneratedColumns => 4, + WriterFeatures::ColumnMapping => 5, + WriterFeatures::IdentityColumns + | WriterFeatures::DeleteionVecotrs + | WriterFeatures::RowTracking + | WriterFeatures::TimestampWithoutTimezone + | WriterFeatures::DomainMetadata + | WriterFeatures::V2Checkpoint + | WriterFeatures::IcebergCompatV1 => 7, + } + } +} + +impl From for WriterFeatures { + fn from(value: String) -> Self { + match value.as_str() { + "appendOnly" => WriterFeatures::AppendOnly, + "invariants" => WriterFeatures::Invariants, + "checkConstraints" => WriterFeatures::CheckConstraints, + "changeDataFeed" => WriterFeatures::ChangeDataFeed, + "generatedColumns" => WriterFeatures::GeneratedColumns, + "columnMapping" => WriterFeatures::ColumnMapping, + "identityColumns" => WriterFeatures::IdentityColumns, + "deletionVectors" => WriterFeatures::DeleteionVecotrs, + "rowTracking" => WriterFeatures::RowTracking, + "timestampNtz" => WriterFeatures::TimestampWithoutTimezone, + "domainMetadata" => WriterFeatures::DomainMetadata, + "v2Checkpoint" => WriterFeatures::V2Checkpoint, + "icebergCompatV1" => WriterFeatures::IcebergCompatV1, + f => WriterFeatures::Other(f.to_string()), + } + } +} + +#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] +impl From<&parquet::record::Field> for WriterFeatures { + fn from(value: &parquet::record::Field) -> Self { + match value { + parquet::record::Field::Str(feature) => match feature.as_str() { + "appendOnly" => WriterFeatures::AppendOnly, + "invariants" => WriterFeatures::Invariants, + "checkConstraints" => WriterFeatures::CheckConstraints, + "changeDataFeed" => WriterFeatures::ChangeDataFeed, + "generatedColumns" => WriterFeatures::GeneratedColumns, + "columnMapping" => WriterFeatures::ColumnMapping, + "identityColumns" => WriterFeatures::IdentityColumns, + "deletionVectors" => WriterFeatures::DeleteionVecotrs, + "rowTracking" => WriterFeatures::RowTracking, + "timestampNtz" => WriterFeatures::TimestampWithoutTimezone, + "domainMetadata" => WriterFeatures::DomainMetadata, + "v2Checkpoint" => WriterFeatures::V2Checkpoint, + "icebergCompatV1" => WriterFeatures::IcebergCompatV1, + f => WriterFeatures::Other(f.to_string()), + }, + f => WriterFeatures::Other(f.to_string()), + } + } +} + +///Storage type of deletion vector +#[derive(Serialize, Deserialize, Clone, Debug, 
PartialEq, Eq)] +pub enum StorageType { + /// Stored at relative path derived from a UUID. + #[serde(rename = "u")] + UuidRelativePath, + /// Stored as inline string. + #[serde(rename = "i")] + Inline, + /// Stored at an absolute path. + #[serde(rename = "p")] + AbsolutePath, +} + +impl Default for StorageType { + fn default() -> Self { + Self::UuidRelativePath // seems to be used by Databricks and therefore most common + } +} + +impl FromStr for StorageType { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s { + "u" => Ok(Self::UuidRelativePath), + "i" => Ok(Self::Inline), + "p" => Ok(Self::AbsolutePath), + _ => Err(Error::DeletionVector(format!( + "Unknown storage format: '{s}'." + ))), + } + } +} + +impl AsRef for StorageType { + fn as_ref(&self) -> &str { + match self { + Self::UuidRelativePath => "u", + Self::Inline => "i", + Self::AbsolutePath => "p", + } + } +} + +impl ToString for StorageType { + fn to_string(&self) -> String { + self.as_ref().to_string() + } +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +/// Defines a deletion vector +pub struct DeletionVectorDescriptor { + /// A single character to indicate how to access the DV. Legal options are: ['u', 'i', 'p']. + pub storage_type: StorageType, + + /// Three format options are currently proposed: + /// - If `storageType = 'u'` then ``: + /// The deletion vector is stored in a file with a path relative to the data + /// directory of this Delta table, and the file name can be reconstructed from + /// the UUID. See Derived Fields for how to reconstruct the file name. The random + /// prefix is recovered as the extra characters before the (20 characters fixed length) uuid. + /// - If `storageType = 'i'` then ``: The deletion vector + /// is stored inline in the log. The format used is the `RoaringBitmapArray` + /// format also used when the DV is stored on disk and described in [Deletion Vector Format]. + /// - If `storageType = 'p'` then ``: The DV is stored in a file with an + /// absolute path given by this path, which has the same format as the `path` field + /// in the `add`/`remove` actions. + /// + /// [Deletion Vector Format]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Deletion-Vector-Format + pub path_or_inline_dv: String, + + /// Start of the data for this DV in number of bytes from the beginning of the file it is stored in. + /// Always None (absent in JSON) when `storageType = 'i'`. + pub offset: Option, + + /// Size of the serialized DV in bytes (raw data size, i.e. before base85 encoding, if inline). + pub size_in_bytes: i32, + + /// Number of rows the given DV logically removes from the file. 
+ pub cardinality: i64, +} + +impl DeletionVectorDescriptor { + /// get a unique idenitfier for the deletion vector + pub fn unique_id(&self) -> String { + if let Some(offset) = self.offset { + format!( + "{}{}@{offset}", + self.storage_type.as_ref(), + self.path_or_inline_dv + ) + } else { + format!("{}{}", self.storage_type.as_ref(), self.path_or_inline_dv) + } + } + + /// get the absolute path of the deletion vector + pub fn absolute_path(&self, parent: &Url) -> DeltaResult> { + match &self.storage_type { + StorageType::UuidRelativePath => { + let prefix_len = self.path_or_inline_dv.len() as i32 - 20; + if prefix_len < 0 { + return Err(Error::DeletionVector("Invalid length".to_string())); + } + let decoded = z85::decode(&self.path_or_inline_dv[(prefix_len as usize)..]) + .map_err(|_| Error::DeletionVector("Failed to decode DV uuid".to_string()))?; + let uuid = uuid::Uuid::from_slice(&decoded) + .map_err(|err| Error::DeletionVector(err.to_string()))?; + let mut dv_suffix = format!("deletion_vector_{uuid}.bin"); + if prefix_len > 0 { + dv_suffix = format!( + "{}/{}", + &self.path_or_inline_dv[..(prefix_len as usize)], + dv_suffix + ); + } + let dv_path = parent + .join(&dv_suffix) + .map_err(|_| Error::DeletionVector(format!("invalid path: {}", dv_suffix)))?; + Ok(Some(dv_path)) + } + StorageType::AbsolutePath => { + Ok(Some(Url::parse(&self.path_or_inline_dv).map_err(|_| { + Error::DeletionVector(format!("invalid path: {}", self.path_or_inline_dv)) + })?)) + } + StorageType::Inline => Ok(None), + } + } + + // TODO read only required byte ranges + // pub fn read( + // &self, + // fs_client: Arc, + // parent: Url, + // ) -> DeltaResult { + // match self.absolute_path(&parent)? { + // None => { + // let bytes = z85::decode(&self.path_or_inline_dv) + // .map_err(|_| Error::DeletionVector("Failed to decode DV".to_string()))?; + // RoaringTreemap::deserialize_from(&bytes[12..]) + // .map_err(|err| Error::DeletionVector(err.to_string())) + // } + // Some(path) => { + // let offset = self.offset; + // let size_in_bytes = self.size_in_bytes; + // + // let dv_data = fs_client + // .read_files(vec![(path, None)])? + // .next() + // .ok_or(Error::MissingData("No deletion Vector data".to_string()))??; + // + // let mut cursor = Cursor::new(dv_data); + // if let Some(offset) = offset { + // // TODO should we read the datasize from the DV file? + // // offset plus datasize bytes + // cursor.set_position((offset + 4) as u64); + // } + // + // let mut buf = vec![0; 4]; + // cursor + // .read(&mut buf) + // .map_err(|err| Error::DeletionVector(err.to_string()))?; + // let magic = + // i32::from_le_bytes(buf.try_into().map_err(|_| { + // Error::DeletionVector("filed to read magic bytes".to_string()) + // })?); + // println!("magic --> : {}", magic); + // // assert!(magic == 1681511377); + // + // let mut buf = vec![0; size_in_bytes as usize]; + // cursor + // .read(&mut buf) + // .map_err(|err| Error::DeletionVector(err.to_string()))?; + // + // RoaringTreemap::deserialize_from(Cursor::new(buf)) + // .map_err(|err| Error::DeletionVector(err.to_string())) + // } + // } + // } +} + +#[derive(Serialize, Deserialize, Debug, Clone, Default)] +#[serde(rename_all = "camelCase")] +/// Defines an add action +pub struct Add { + /// A relative path to a data file from the root of the table or an absolute path to a file + /// that should be added to the table. The path is a URI as specified by + /// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path. 
+ /// + /// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt + #[serde(with = "serde_path")] + pub path: String, + + /// A map from partition column to value for this logical file. + pub partition_values: HashMap>, + + /// The size of this data file in bytes + pub size: i64, + + /// The time this logical file was created, as milliseconds since the epoch. + pub modification_time: i64, + + /// When `false` the logical file must already be present in the table or the records + /// in the added file must be contained in one or more remove actions in the same version. + pub data_change: bool, + + /// Contains [statistics] (e.g., count, min/max values for columns) about the data in this logical file. + /// + /// [statistics]: https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Per-file-Statistics + pub stats: Option, + + /// Map containing metadata about this logical file. + pub tags: Option>>, + + /// Information about deletion vector (DV) associated with this add action + pub deletion_vector: Option, + + /// Default generated Row ID of the first row in the file. The default generated Row IDs + /// of the other rows in the file can be reconstructed by adding the physical index of the + /// row within the file to the base Row ID + pub base_row_id: Option, + + /// First commit version in which an add action with the same path was committed to the table. + pub default_row_commit_version: Option, + + // TODO remove migration filds added to not do too many business logic changes in one PR + /// Partition values stored in raw parquet struct format. In this struct, the column names + /// correspond to the partition columns and the values are stored in their corresponding data + /// type. This is a required field when the table is partitioned and the table property + /// delta.checkpoint.writeStatsAsStruct is set to true. If the table is not partitioned, this + /// column can be omitted. + /// + /// This field is only available in add action records read from checkpoints + #[cfg(feature = "parquet")] + #[serde(skip_serializing, skip_deserializing)] + pub partition_values_parsed: Option, + /// Partition values parsed for parquet2 + #[cfg(feature = "parquet2")] + #[serde(skip_serializing, skip_deserializing)] + pub partition_values_parsed: Option, + + /// Contains statistics (e.g., count, min/max values for columns) about the data in this file in + /// raw parquet format. This field needs to be written when statistics are available and the + /// table property: delta.checkpoint.writeStatsAsStruct is set to true. + /// + /// This field is only available in add action records read from checkpoints + #[cfg(feature = "parquet")] + #[serde(skip_serializing, skip_deserializing)] + pub stats_parsed: Option, + /// Stats parsed for parquet2 + #[cfg(feature = "parquet2")] + #[serde(skip_serializing, skip_deserializing)] + pub stats_parsed: Option, +} + +impl Add { + /// get the unique id of the deletion vector, if any + pub fn dv_unique_id(&self) -> Option { + self.deletion_vector.clone().map(|dv| dv.unique_id()) + } + + /// set the base row id of the add action + pub fn with_base_row_id(mut self, base_row_id: i64) -> Self { + self.base_row_id = Some(base_row_id); + self + } +} + +/// Represents a tombstone (deleted file) in the Delta log. +#[derive(Serialize, Deserialize, Debug, Clone, Eq, Default)] +#[serde(rename_all = "camelCase")] +pub struct Remove { + /// A relative path to a data file from the root of the table or an absolute path to a file + /// that should be added to the table. 
The path is a URI as specified by + /// [RFC 2396 URI Generic Syntax], which needs to be decoded to get the data file path. + /// + /// [RFC 2396 URI Generic Syntax]: https://www.ietf.org/rfc/rfc2396.txt + pub path: String, + + /// When `false` the logical file must already be present in the table or the records + /// in the added file must be contained in one or more remove actions in the same version. + pub data_change: bool, + + /// The time this logical file was created, as milliseconds since the epoch. + pub deletion_timestamp: Option, + + /// When true the fields `partition_values`, `size`, and `tags` are present + pub extended_file_metadata: Option, + + /// A map from partition column to value for this logical file. + #[serde(skip_serializing_if = "Option::is_none")] + pub partition_values: Option>>, + + /// The size of this data file in bytes + #[serde(skip_serializing_if = "Option::is_none")] + pub size: Option, + + /// Map containing metadata about this logical file. + #[serde(skip_serializing_if = "Option::is_none")] + pub tags: Option>>, + + /// Information about deletion vector (DV) associated with this add action + #[serde(skip_serializing_if = "Option::is_none")] + pub deletion_vector: Option, + + /// Default generated Row ID of the first row in the file. The default generated Row IDs + /// of the other rows in the file can be reconstructed by adding the physical index of the + /// row within the file to the base Row ID + pub base_row_id: Option, + + /// First commit version in which an add action with the same path was committed to the table. + pub default_row_commit_version: Option, +} + +impl Remove { + /// get the unique id of the deletion vector, if any + pub fn dv_unique_id(&self) -> Option { + self.deletion_vector.clone().map(|dv| dv.unique_id()) + } +} + +/// Delta AddCDCFile action that describes a parquet CDC data file. +#[derive(Serialize, Deserialize, Clone, Debug, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct AddCDCFile { + /// A relative path, from the root of the table, or an + /// absolute path to a CDC file + #[serde(with = "serde_path")] + pub path: String, + /// The size of this file in bytes + pub size: i64, + /// A map from partition column to value for this file + pub partition_values: HashMap>, + /// Should always be set to false because they do not change the underlying data of the table + pub data_change: bool, + /// Map containing metadata about this file + pub tags: Option>>, +} + +/// Action used by streaming systems to track progress using application-specific versions to +/// enable idempotency. +#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct Txn { + /// A unique identifier for the application performing the transaction. + pub app_id: String, + /// An application-specific numeric identifier for this transaction. + pub version: i64, + /// The time when this transaction action was created in milliseconds since the Unix epoch. + pub last_updated: Option, +} + +/// The commitInfo is a fairly flexible action within the delta specification, where arbitrary data can be stored. +/// However the reference implementation as well as delta-rs store useful information that may for instance +/// allow us to be more permissive in commit conflict resolution. 
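To make that flexibility concrete, the sketch below (payload invented) deserializes a commit-info entry and shows that keys without a dedicated field on the `CommitInfo` struct that follows are collected into the flattened `info` map:

```rust
use deltalake_core::kernel::CommitInfo;

fn main() -> Result<(), serde_json::Error> {
    // "engineInfo" maps onto the engine_info field; "clusterId" has no named
    // field and is captured by the flattened `info` map instead.
    let line = r#"{"timestamp":1670892998177,"operation":"WRITE","engineInfo":"delta-rs","clusterId":"cluster-1"}"#;
    let ci: CommitInfo = serde_json::from_str(line)?;
    assert_eq!(ci.operation.as_deref(), Some("WRITE"));
    assert!(ci.info.contains_key("clusterId"));
    Ok(())
}
```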
+#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct CommitInfo { + /// Timestamp in millis when the commit was created + #[serde(skip_serializing_if = "Option::is_none")] + pub timestamp: Option, + /// Id of the user invoking the commit + #[serde(skip_serializing_if = "Option::is_none")] + pub user_id: Option, + /// Name of the user invoking the commit + #[serde(skip_serializing_if = "Option::is_none")] + pub user_name: Option, + /// The operation performed during the + #[serde(skip_serializing_if = "Option::is_none")] + pub operation: Option, + /// Parameters used for table operation + #[serde(skip_serializing_if = "Option::is_none")] + pub operation_parameters: Option>, + /// Version of the table when the operation was started + #[serde(skip_serializing_if = "Option::is_none")] + pub read_version: Option, + /// The isolation level of the commit + #[serde(skip_serializing_if = "Option::is_none")] + pub isolation_level: Option, + /// TODO + #[serde(skip_serializing_if = "Option::is_none")] + pub is_blind_append: Option, + /// Delta engine which created the commit. + #[serde(skip_serializing_if = "Option::is_none")] + pub engine_info: Option, + /// Additional provenance information for the commit + #[serde(flatten, default)] + pub info: HashMap, +} + +/// The domain metadata action contains a configuration (string) for a named metadata domain +#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct DomainMetadata { + /// Identifier for this domain (system or user-provided) + pub domain: String, + /// String containing configuration for the metadata domain + pub configuration: String, + /// When `true` the action serves as a tombstone + pub removed: bool, +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +/// The isolation level applied during transaction +pub enum IsolationLevel { + /// The strongest isolation level. It ensures that committed write operations + /// and all reads are Serializable. Operations are allowed as long as there + /// exists a serial sequence of executing them one-at-a-time that generates + /// the same outcome as that seen in the table. For the write operations, + /// the serial sequence is exactly the same as that seen in the table’s history. + Serializable, + + /// A weaker isolation level than Serializable. It ensures only that the write + /// operations (that is, not reads) are serializable. However, this is still stronger + /// than Snapshot isolation. WriteSerializable is the default isolation level because + /// it provides great balance of data consistency and availability for most common operations. + WriteSerializable, + + /// SnapshotIsolation is a guarantee that all reads made in a transaction will see a consistent + /// snapshot of the database (in practice it reads the last committed values that existed at the + /// time it started), and the transaction itself will successfully commit only if no updates + /// it has made conflict with any concurrent updates made since that snapshot. 
+ SnapshotIsolation, +} + +// Spark assumes Serializable as default isolation level +// https://github.com/delta-io/delta/blob/abb171c8401200e7772b27e3be6ea8682528ac72/core/src/main/scala/org/apache/spark/sql/delta/OptimisticTransaction.scala#L1023 +impl Default for IsolationLevel { + fn default() -> Self { + Self::Serializable + } +} + +impl AsRef for IsolationLevel { + fn as_ref(&self) -> &str { + match self { + Self::Serializable => "Serializable", + Self::WriteSerializable => "WriteSerializable", + Self::SnapshotIsolation => "SnapshotIsolation", + } + } +} + +impl FromStr for IsolationLevel { + type Err = Error; + + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "serializable" => Ok(Self::Serializable), + "writeserializable" | "write_serializable" => Ok(Self::WriteSerializable), + "snapshotisolation" | "snapshot_isolation" => Ok(Self::SnapshotIsolation), + _ => Err(Error::Generic("Invalid string for IsolationLevel".into())), + } + } +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + // use std::sync::Arc; + + // use object_store::local::LocalFileSystem; + + use crate::kernel::PrimitiveType; + + use super::*; + // use crate::client::filesystem::ObjectStoreFileSystemClient; + // use crate::executor::tokio::TokioBackgroundExecutor; + + fn dv_relateive() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "u".parse().unwrap(), + path_or_inline_dv: "ab^-aqEH.-t@S}K{vb[*k^".to_string(), + offset: Some(4), + size_in_bytes: 40, + cardinality: 6, + } + } + + fn dv_absolute() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "p".parse().unwrap(), + path_or_inline_dv: + "s3://mytable/deletion_vector_d2c639aa-8816-431a-aaf6-d3fe2512ff61.bin".to_string(), + offset: Some(4), + size_in_bytes: 40, + cardinality: 6, + } + } + + fn dv_inline() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "i".parse().unwrap(), + path_or_inline_dv: "wi5b=000010000siXQKl0rr91000f55c8Xg0@@D72lkbi5=-{L".to_string(), + offset: None, + size_in_bytes: 40, + cardinality: 6, + } + } + + fn dv_example() -> DeletionVectorDescriptor { + DeletionVectorDescriptor { + storage_type: "u".parse().unwrap(), + path_or_inline_dv: "vBn[lx{q8@P<9BNH/isA".to_string(), + offset: Some(1), + size_in_bytes: 36, + cardinality: 2, + } + } + + #[test] + fn test_deletion_vector_absolute_path() { + let parent = Url::parse("s3://mytable/").unwrap(); + + let relative = dv_relateive(); + let expected = + Url::parse("s3://mytable/ab/deletion_vector_d2c639aa-8816-431a-aaf6-d3fe2512ff61.bin") + .unwrap(); + assert_eq!(expected, relative.absolute_path(&parent).unwrap().unwrap()); + + let absolute = dv_absolute(); + let expected = + Url::parse("s3://mytable/deletion_vector_d2c639aa-8816-431a-aaf6-d3fe2512ff61.bin") + .unwrap(); + assert_eq!(expected, absolute.absolute_path(&parent).unwrap().unwrap()); + + let inline = dv_inline(); + assert_eq!(None, inline.absolute_path(&parent).unwrap()); + + let path = + std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + let parent = url::Url::from_directory_path(path).unwrap(); + let dv_url = parent + .join("deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin") + .unwrap(); + let example = dv_example(); + assert_eq!(dv_url, example.absolute_path(&parent).unwrap().unwrap()); + } + + #[test] + fn test_primitive() { + let types: PrimitiveType = serde_json::from_str("\"string\"").unwrap(); + println!("{:?}", types); + } + + // #[test] + // fn test_deletion_vector_read() { + 
// let store = Arc::new(LocalFileSystem::new()); + // let path = + // std::fs::canonicalize(PathBuf::from("./tests/data/table-with-dv-small/")).unwrap(); + // let parent = url::Url::from_directory_path(path).unwrap(); + // let root = object_store::path::Path::from(parent.path()); + // let fs_client = Arc::new(ObjectStoreFileSystemClient::new( + // store, + // root, + // Arc::new(TokioBackgroundExecutor::new()), + // )); + // + // let example = dv_example(); + // let tree_map = example.read(fs_client, parent).unwrap(); + // + // let expected: Vec = vec![0, 9]; + // let found = tree_map.iter().collect::>(); + // assert_eq!(found, expected) + // } +} diff --git a/crates/deltalake-core/src/kernel/error.rs b/crates/deltalake-core/src/kernel/error.rs new file mode 100644 index 0000000000..8ec799ca96 --- /dev/null +++ b/crates/deltalake-core/src/kernel/error.rs @@ -0,0 +1,78 @@ +//! Error types for Delta Lake operations. + +/// A specialized [`Result`] type for Delta Lake operations. +pub type DeltaResult = std::result::Result; + +#[derive(thiserror::Error, Debug)] +#[allow(missing_docs)] +pub enum Error { + #[cfg(feature = "arrow")] + #[error("Arrow error: {0}")] + Arrow(#[from] arrow_schema::ArrowError), + + #[error("Generic delta kernel error: {0}")] + Generic(String), + + #[error("Generic error: {source}")] + GenericError { + /// Source error + source: Box, + }, + + #[cfg(feature = "parquet")] + #[error("Arrow error: {0}")] + Parquet(#[from] parquet::errors::ParquetError), + + #[cfg(feature = "object_store")] + #[error("Error interacting with object store: {0}")] + ObjectStore(object_store::Error), + + #[error("File not found: {0}")] + FileNotFound(String), + + #[error("{0}")] + MissingColumn(String), + + #[error("Expected column type: {0}")] + UnexpectedColumnType(String), + + #[error("Expected is missing: {0}")] + MissingData(String), + + #[error("No table version found.")] + MissingVersion, + + #[error("Deletion Vector error: {0}")] + DeletionVector(String), + + #[error("Schema error: {0}")] + Schema(String), + + #[error("Invalid url: {0}")] + InvalidUrl(#[from] url::ParseError), + + #[error("Invalid url: {0}")] + MalformedJson(#[from] serde_json::Error), + + #[error("No table metadata found in delta log.")] + MissingMetadata, + + /// Error returned when the log contains invalid stats JSON. + #[error("Invalid JSON in invariant expression, line=`{line}`, err=`{json_err}`")] + InvalidInvariantJson { + /// JSON error details returned when parsing the invariant expression JSON. + json_err: serde_json::error::Error, + /// Invariant expression. + line: String, + }, +} + +#[cfg(feature = "object_store")] +impl From for Error { + fn from(value: object_store::Error) -> Self { + match value { + object_store::Error::NotFound { path, .. } => Self::FileNotFound(path), + err => Self::ObjectStore(err), + } + } +} diff --git a/crates/deltalake-core/src/kernel/mod.rs b/crates/deltalake-core/src/kernel/mod.rs new file mode 100644 index 0000000000..7785c273f9 --- /dev/null +++ b/crates/deltalake-core/src/kernel/mod.rs @@ -0,0 +1,9 @@ +//! Kernel module + +pub mod actions; +pub mod error; +pub mod schema; + +pub use actions::*; +pub use error::*; +pub use schema::*; diff --git a/crates/deltalake-core/src/kernel/schema.rs b/crates/deltalake-core/src/kernel/schema.rs new file mode 100644 index 0000000000..12391ca6e8 --- /dev/null +++ b/crates/deltalake-core/src/kernel/schema.rs @@ -0,0 +1,788 @@ +//! 
Delta table schema + +use std::fmt::Formatter; +use std::sync::Arc; +use std::{collections::HashMap, fmt::Display}; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use super::error::Error; + +/// Type alias for a top level schema +pub type Schema = StructType; +/// Schema reference type +pub type SchemaRef = Arc; + +/// A value that can be stored in the metadata of a Delta table schema entity. +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(untagged)] +pub enum MetadataValue { + /// A number value + Number(i32), + /// A string value + String(String), +} + +impl From for MetadataValue { + fn from(value: String) -> Self { + Self::String(value) + } +} + +impl From<&String> for MetadataValue { + fn from(value: &String) -> Self { + Self::String(value.clone()) + } +} + +impl From for MetadataValue { + fn from(value: i32) -> Self { + Self::Number(value) + } +} + +impl From for MetadataValue { + fn from(value: Value) -> Self { + Self::String(value.to_string()) + } +} + +#[derive(Debug)] +#[allow(missing_docs)] +pub enum ColumnMetadataKey { + ColumnMappingId, + ColumnMappingPhysicalName, + GenerationExpression, + IdentityStart, + IdentityStep, + IdentityHighWaterMark, + IdentityAllowExplicitInsert, + Invariants, +} + +impl AsRef for ColumnMetadataKey { + fn as_ref(&self) -> &str { + match self { + Self::ColumnMappingId => "delta.columnMapping.id", + Self::ColumnMappingPhysicalName => "delta.columnMapping.physicalName", + Self::GenerationExpression => "delta.generationExpression", + Self::IdentityAllowExplicitInsert => "delta.identity.allowExplicitInsert", + Self::IdentityHighWaterMark => "delta.identity.highWaterMark", + Self::IdentityStart => "delta.identity.start", + Self::IdentityStep => "delta.identity.step", + Self::Invariants => "delta.invariants", + } + } +} + +/// An invariant for a column that is enforced on all writes to a Delta table. +#[derive(Eq, PartialEq, Debug, Default, Clone)] +pub struct Invariant { + /// The full path to the field. + pub field_name: String, + /// The SQL string that must always evaluate to true. + pub invariant_sql: String, +} + +impl Invariant { + /// Create a new invariant + pub fn new(field_name: &str, invariant_sql: &str) -> Self { + Self { + field_name: field_name.to_string(), + invariant_sql: invariant_sql.to_string(), + } + } +} + +/// Represents a struct field defined in the Delta table schema. 
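Because `MetadataValue` is `#[serde(untagged)]`, numeric and string entries in a field's metadata map deserialize into different variants, which matters for keys such as the column-mapping id versus the physical name. A small sketch with invented values:

```rust
use std::collections::HashMap;

use deltalake_core::kernel::MetadataValue;

fn main() -> Result<(), serde_json::Error> {
    let raw = r#"{"delta.columnMapping.id": 5, "delta.columnMapping.physicalName": "col-abc"}"#;
    let meta: HashMap<String, MetadataValue> = serde_json::from_str(raw)?;
    // The id arrives as a JSON number, the physical name as a JSON string.
    assert!(matches!(meta["delta.columnMapping.id"], MetadataValue::Number(5)));
    assert!(matches!(meta["delta.columnMapping.physicalName"], MetadataValue::String(_)));
    Ok(())
}
```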
+// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +pub struct StructField { + /// Name of this (possibly nested) column + pub name: String, + /// The data type of this field + #[serde(rename = "type")] + pub data_type: DataType, + /// Denotes whether this Field can be null + pub nullable: bool, + /// A JSON map containing information about this column + pub metadata: HashMap, +} + +impl StructField { + /// Creates a new field + pub fn new(name: impl Into, data_type: DataType, nullable: bool) -> Self { + Self { + name: name.into(), + data_type, + nullable, + metadata: HashMap::default(), + } + } + + /// Creates a new field with metadata + pub fn with_metadata( + mut self, + metadata: impl IntoIterator, impl Into)>, + ) -> Self { + self.metadata = metadata + .into_iter() + .map(|(k, v)| (k.into(), v.into())) + .collect(); + self + } + + /// Get the value of a specific metadata key + pub fn get_config_value(&self, key: &ColumnMetadataKey) -> Option<&MetadataValue> { + self.metadata.get(key.as_ref()) + } + + #[inline] + /// Returns the name of the column + pub fn name(&self) -> &String { + &self.name + } + + #[inline] + /// Returns whether the column is nullable + pub fn is_nullable(&self) -> bool { + self.nullable + } + + #[inline] + /// Returns the data type of the column + pub const fn data_type(&self) -> &DataType { + &self.data_type + } + + #[inline] + /// Returns the metadata of the column + pub const fn metadata(&self) -> &HashMap { + &self.metadata + } +} + +/// A struct is used to represent both the top-level schema of the table +/// as well as struct columns that contain nested columns. +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +pub struct StructType { + #[serde(rename = "type")] + /// The type of this struct + pub type_name: String, + /// The type of element stored in this array + pub fields: Vec, +} + +impl StructType { + /// Creates a new struct type + pub fn new(fields: Vec) -> Self { + Self { + type_name: "struct".into(), + fields, + } + } + + /// Returns an immutable reference of the fields in the struct + pub fn fields(&self) -> &Vec { + &self.fields + } + + /// Find the index of the column with the given name. + pub fn index_of(&self, name: &str) -> Result { + let (idx, _) = self + .fields() + .iter() + .enumerate() + .find(|(_, b)| b.name() == name) + .ok_or_else(|| { + let valid_fields: Vec<_> = self.fields.iter().map(|f| f.name()).collect(); + Error::Schema(format!( + "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" + )) + })?; + Ok(idx) + } + + /// Returns a reference of a specific [`StructField`] instance selected by name. 
+ pub fn field_with_name(&self, name: &str) -> Result<&StructField, Error> { + Ok(&self.fields[self.index_of(name)?]) + } + + /// Get all invariants in the schemas + pub fn get_invariants(&self) -> Result, Error> { + let mut remaining_fields: Vec<(String, StructField)> = self + .fields() + .iter() + .map(|field| (field.name.clone(), field.clone())) + .collect(); + let mut invariants: Vec = Vec::new(); + + let add_segment = |prefix: &str, segment: &str| -> String { + if prefix.is_empty() { + segment.to_owned() + } else { + format!("{prefix}.{segment}") + } + }; + + while let Some((field_path, field)) = remaining_fields.pop() { + match field.data_type() { + DataType::Struct(inner) => { + remaining_fields.extend( + inner + .fields() + .iter() + .map(|field| { + let new_prefix = add_segment(&field_path, &field.name); + (new_prefix, field.clone()) + }) + .collect::>(), + ); + } + DataType::Array(inner) => { + let element_field_name = add_segment(&field_path, "element"); + remaining_fields.push(( + element_field_name, + StructField::new("".to_string(), inner.element_type.clone(), false), + )); + } + DataType::Map(inner) => { + let key_field_name = add_segment(&field_path, "key"); + remaining_fields.push(( + key_field_name, + StructField::new("".to_string(), inner.key_type.clone(), false), + )); + let value_field_name = add_segment(&field_path, "value"); + remaining_fields.push(( + value_field_name, + StructField::new("".to_string(), inner.value_type.clone(), false), + )); + } + _ => {} + } + // JSON format: {"expression": {"expression": ""} } + if let Some(MetadataValue::String(invariant_json)) = + field.metadata.get(ColumnMetadataKey::Invariants.as_ref()) + { + let json: Value = serde_json::from_str(invariant_json).map_err(|e| { + Error::InvalidInvariantJson { + json_err: e, + line: invariant_json.to_string(), + } + })?; + if let Value::Object(json) = json { + if let Some(Value::Object(expr1)) = json.get("expression") { + if let Some(Value::String(sql)) = expr1.get("expression") { + invariants.push(Invariant::new(&field_path, sql)); + } + } + } + } + } + Ok(invariants) + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(rename_all = "camelCase")] +/// An array stores a variable length collection of items of some type. 
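A sketch of how the nested `{"expression": {"expression": "<sql>"}}` format handled above surfaces through `get_invariants`; the column name and SQL are invented, while the builder methods are the ones defined in this file:

```rust
use deltalake_core::kernel::{ColumnMetadataKey, DataType, Invariant, StructField, StructType};

fn main() {
    // Attach a delta.invariants entry to a column using the nested JSON format.
    let field = StructField::new("x", DataType::integer(), true).with_metadata([(
        ColumnMetadataKey::Invariants.as_ref(),
        r#"{"expression": {"expression": "x > 2"}}"#.to_string(),
    )]);
    let schema = StructType::new(vec![field]);
    let invariants = schema.get_invariants().unwrap();
    assert_eq!(invariants, vec![Invariant::new("x", "x > 2")]);
}
```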
+pub struct ArrayType { + #[serde(rename = "type")] + /// The type of this struct + pub type_name: String, + /// The type of element stored in this array + pub element_type: DataType, + /// Denoting whether this array can contain one or more null values + pub contains_null: bool, +} + +impl ArrayType { + /// Creates a new array type + pub fn new(element_type: DataType, contains_null: bool) -> Self { + Self { + type_name: "array".into(), + element_type, + contains_null, + } + } + + #[inline] + /// Returns the element type of the array + pub const fn element_type(&self) -> &DataType { + &self.element_type + } + + #[inline] + /// Returns whether the array can contain null values + pub const fn contains_null(&self) -> bool { + self.contains_null + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(rename_all = "camelCase")] +/// A map stores an arbitrary length collection of key-value pairs +pub struct MapType { + #[serde(rename = "type")] + /// The type of this struct + pub type_name: String, + /// The type of element used for the key of this map + pub key_type: DataType, + /// The type of element used for the value of this map + pub value_type: DataType, + /// Denoting whether this array can contain one or more null values + #[serde(default = "default_true")] + pub value_contains_null: bool, +} + +impl MapType { + /// Creates a new map type + pub fn new(key_type: DataType, value_type: DataType, value_contains_null: bool) -> Self { + Self { + type_name: "map".into(), + key_type, + value_type, + value_contains_null, + } + } + + #[inline] + /// Returns the key type of the map + pub const fn key_type(&self) -> &DataType { + &self.key_type + } + + #[inline] + /// Returns the value type of the map + pub const fn value_type(&self) -> &DataType { + &self.value_type + } + + #[inline] + /// Returns whether the map can contain null values + pub const fn value_contains_null(&self) -> bool { + self.value_contains_null + } +} + +fn default_true() -> bool { + true +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(rename_all = "camelCase")] +/// Primitive types supported by Delta +pub enum PrimitiveType { + /// UTF-8 encoded string of characters + String, + /// i64: 8-byte signed integer. Range: -9223372036854775808 to 9223372036854775807 + Long, + /// i32: 4-byte signed integer. Range: -2147483648 to 2147483647 + Integer, + /// i16: 2-byte signed integer numbers. Range: -32768 to 32767 + Short, + /// i8: 1-byte signed integer number. Range: -128 to 127 + Byte, + /// f32: 4-byte single-precision floating-point numbers + Float, + /// f64: 8-byte double-precision floating-point numbers + Double, + /// bool: boolean values + Boolean, + /// Binary: uninterpreted binary data + Binary, + /// Date: Calendar date (year, month, day) + Date, + /// Microsecond precision timestamp, adjusted to UTC. 
+ Timestamp, + // TODO: timestamp without timezone + #[serde( + serialize_with = "serialize_decimal", + deserialize_with = "deserialize_decimal", + untagged + )] + /// Decimal: arbitrary precision decimal numbers + Decimal(i32, i32), +} + +fn serialize_decimal( + precision: &i32, + scale: &i32, + serializer: S, +) -> Result { + serializer.serialize_str(&format!("decimal({},{})", precision, scale)) +} + +fn deserialize_decimal<'de, D>(deserializer: D) -> Result<(i32, i32), D::Error> +where + D: serde::Deserializer<'de>, +{ + let str_value = String::deserialize(deserializer)?; + if !str_value.starts_with("decimal(") || !str_value.ends_with(')') { + return Err(serde::de::Error::custom(format!( + "Invalid decimal: {}", + str_value + ))); + } + + let mut parts = str_value[8..str_value.len() - 1].split(','); + let precision = parts + .next() + .and_then(|part| part.trim().parse::().ok()) + .ok_or_else(|| { + serde::de::Error::custom(format!("Invalid precision in decimal: {}", str_value)) + })?; + let scale = parts + .next() + .and_then(|part| part.trim().parse::().ok()) + .ok_or_else(|| { + serde::de::Error::custom(format!("Invalid scale in decimal: {}", str_value)) + })?; + + Ok((precision, scale)) +} + +impl Display for PrimitiveType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + PrimitiveType::String => write!(f, "string"), + PrimitiveType::Long => write!(f, "long"), + PrimitiveType::Integer => write!(f, "integer"), + PrimitiveType::Short => write!(f, "short"), + PrimitiveType::Byte => write!(f, "byte"), + PrimitiveType::Float => write!(f, "float"), + PrimitiveType::Double => write!(f, "double"), + PrimitiveType::Boolean => write!(f, "boolean"), + PrimitiveType::Binary => write!(f, "binary"), + PrimitiveType::Date => write!(f, "date"), + PrimitiveType::Timestamp => write!(f, "timestamp"), + PrimitiveType::Decimal(precision, scale) => { + write!(f, "decimal({},{})", precision, scale) + } + } + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone)] +#[serde(untagged, rename_all = "camelCase")] +/// The data type of a column +pub enum DataType { + /// UTF-8 encoded string of characters + Primitive(PrimitiveType), + /// An array stores a variable length collection of items of some type. + Array(Box), + /// A struct is used to represent both the top-level schema of the table as well + /// as struct columns that contain nested columns. 
+ Struct(Box), + /// A map stores an arbitrary length collection of key-value pairs + /// with a single keyType and a single valueType + Map(Box), +} + +impl DataType { + /// create a new string type + pub fn string() -> Self { + DataType::Primitive(PrimitiveType::String) + } + + /// create a new long type + pub fn long() -> Self { + DataType::Primitive(PrimitiveType::Long) + } + + /// create a new integer type + pub fn integer() -> Self { + DataType::Primitive(PrimitiveType::Integer) + } + + /// create a new short type + pub fn short() -> Self { + DataType::Primitive(PrimitiveType::Short) + } + + /// create a new byte type + pub fn byte() -> Self { + DataType::Primitive(PrimitiveType::Byte) + } + + /// create a new float type + pub fn float() -> Self { + DataType::Primitive(PrimitiveType::Float) + } + + /// create a new double type + pub fn double() -> Self { + DataType::Primitive(PrimitiveType::Double) + } + + /// create a new boolean type + pub fn boolean() -> Self { + DataType::Primitive(PrimitiveType::Boolean) + } + + /// create a new binary type + pub fn binary() -> Self { + DataType::Primitive(PrimitiveType::Binary) + } + + /// create a new date type + pub fn date() -> Self { + DataType::Primitive(PrimitiveType::Date) + } + + /// create a new timestamp type + pub fn timestamp() -> Self { + DataType::Primitive(PrimitiveType::Timestamp) + } + + /// create a new decimal type + pub fn decimal(precision: usize, scale: usize) -> Self { + DataType::Primitive(PrimitiveType::Decimal(precision as i32, scale as i32)) + } +} + +impl Display for DataType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + DataType::Primitive(p) => write!(f, "{}", p), + DataType::Array(a) => write!(f, "array<{}>", a.element_type), + DataType::Struct(s) => { + write!(f, "struct<")?; + for (i, field) in s.fields.iter().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}: {}", field.name, field.data_type)?; + } + write!(f, ">") + } + DataType::Map(m) => write!(f, "map<{}, {}>", m.key_type, m.value_type), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json; + use serde_json::json; + + #[test] + fn test_serde_data_types() { + let data = r#" + { + "name": "a", + "type": "integer", + "nullable": false, + "metadata": {} + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(matches!( + field.data_type, + DataType::Primitive(PrimitiveType::Integer) + )); + + let data = r#" + { + "name": "c", + "type": { + "type": "array", + "elementType": "integer", + "containsNull": false + }, + "nullable": true, + "metadata": {} + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(matches!(field.data_type, DataType::Array(_))); + + let data = r#" + { + "name": "e", + "type": { + "type": "array", + "elementType": { + "type": "struct", + "fields": [ + { + "name": "d", + "type": "integer", + "nullable": false, + "metadata": {} + } + ] + }, + "containsNull": true + }, + "nullable": true, + "metadata": {} + } + "#; + let field: StructField = serde_json::from_str(data).unwrap(); + assert!(matches!(field.data_type, DataType::Array(_))); + match field.data_type { + DataType::Array(array) => assert!(matches!(array.element_type, DataType::Struct(_))), + _ => unreachable!(), + } + + let data = r#" + { + "name": "f", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + "#; + let field: StructField = 
+
+    #[test]
+    fn test_roundtrip_decimal() {
+        let data = r#"
+        {
+            "name": "a",
+            "type": "decimal(10, 2)",
+            "nullable": false,
+            "metadata": {}
+        }
+        "#;
+        let field: StructField = serde_json::from_str(data).unwrap();
+        assert!(matches!(
+            field.data_type,
+            DataType::Primitive(PrimitiveType::Decimal(10, 2))
+        ));
+
+        let json_str = serde_json::to_string(&field).unwrap();
+        assert_eq!(
+            json_str,
+            r#"{"name":"a","type":"decimal(10,2)","nullable":false,"metadata":{}}"#
+        );
+    }
+
+    #[test]
+    fn test_field_metadata() {
+        let data = r#"
+        {
+            "name": "e",
+            "type": {
+                "type": "array",
+                "elementType": {
+                    "type": "struct",
+                    "fields": [
+                        {
+                            "name": "d",
+                            "type": "integer",
+                            "nullable": false,
+                            "metadata": {
+                                "delta.columnMapping.id": 5,
+                                "delta.columnMapping.physicalName": "col-a7f4159c-53be-4cb0-b81a-f7e5240cfc49"
+                            }
+                        }
+                    ]
+                },
+                "containsNull": true
+            },
+            "nullable": true,
+            "metadata": {
+                "delta.columnMapping.id": 4,
+                "delta.columnMapping.physicalName": "col-5f422f40-de70-45b2-88ab-1d5c90e94db1"
+            }
+        }
+        "#;
+        let field: StructField = serde_json::from_str(data).unwrap();
+
+        let col_id = field
+            .get_config_value(&ColumnMetadataKey::ColumnMappingId)
+            .unwrap();
+        assert!(matches!(col_id, MetadataValue::Number(num) if *num == 4));
+        let physical_name = field
+            .get_config_value(&ColumnMetadataKey::ColumnMappingPhysicalName)
+            .unwrap();
+        assert!(
+            matches!(physical_name, MetadataValue::String(name) if *name == "col-5f422f40-de70-45b2-88ab-1d5c90e94db1")
+        );
+    }
+
+    #[test]
+    fn test_read_schemas() {
+        let file = std::fs::File::open("./tests/serde/schema.json").unwrap();
+        let schema: Result<StructType, serde_json::Error> = serde_json::from_reader(file);
+        assert!(schema.is_ok());
+
+        let file = std::fs::File::open("./tests/serde/checkpoint_schema.json").unwrap();
+        let schema: Result<StructType, serde_json::Error> = serde_json::from_reader(file);
+        assert!(schema.is_ok())
+    }
+
+    #[test]
+    fn test_get_invariants() {
+        let schema: StructType = serde_json::from_value(json!({
+            "type": "struct",
+            "fields": [{"name": "x", "type": "string", "nullable": true, "metadata": {}}]
+        }))
+        .unwrap();
+        let invariants = schema.get_invariants().unwrap();
+        assert_eq!(invariants.len(), 0);
+
+        let schema: StructType = serde_json::from_value(json!({
+            "type": "struct",
+            "fields": [
+                {"name": "x", "type": "integer", "nullable": true, "metadata": {
+                    "delta.invariants": "{\"expression\": { \"expression\": \"x > 2\"} }"
+                }},
+                {"name": "y", "type": "integer", "nullable": true, "metadata": {
+                    "delta.invariants": "{\"expression\": { \"expression\": \"y < 4\"} }"
+                }}
+            ]
+        }))
+        .unwrap();
+        let invariants = schema.get_invariants().unwrap();
+        assert_eq!(invariants.len(), 2);
+        assert!(invariants.contains(&Invariant::new("x", "x > 2")));
+        assert!(invariants.contains(&Invariant::new("y", "y < 4")));
+
+        let schema: StructType = serde_json::from_value(json!({
+            "type": "struct",
+            "fields": [{
+                "name": "a_map",
+                "type": {
+                    "type": "map",
+                    "keyType": "string",
+                    "valueType": {
+                        "type": "array",
+                        "elementType": {
+                            "type": "struct",
+                            "fields": [{
+                                "name": "d",
+                                "type": "integer",
+                                "metadata": {
+                                    "delta.invariants": "{\"expression\": { \"expression\": \"a_map.value.element.d < 4\"} }"
+                                },
+                                "nullable": false
+                            }]
+                        },
+                        "containsNull": false
+                    },
+                    "valueContainsNull": false
+                },
+                "nullable": false,
+                "metadata": {}
+            }]
+        })).unwrap();
+        let invariants = schema.get_invariants().unwrap();
+        assert_eq!(invariants.len(), 1);
+        assert_eq!(
invariants[0], + Invariant::new("a_map.value.element.d", "a_map.value.element.d < 4") + ); + } +} diff --git a/crates/deltalake-core/src/lib.rs b/crates/deltalake-core/src/lib.rs index fa7f65963f..d683b906dd 100644 --- a/crates/deltalake-core/src/lib.rs +++ b/crates/deltalake-core/src/lib.rs @@ -84,6 +84,7 @@ compile_error!( pub mod data_catalog; pub mod errors; +pub mod kernel; pub mod operations; pub mod protocol; pub mod schema; @@ -200,12 +201,17 @@ mod tests { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 4); - assert!(tombstones.contains(&crate::protocol::Remove { + assert!(tombstones.contains(&crate::kernel::Remove { path: "part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1564524298213), data_change: false, extended_file_metadata: Some(false), - ..Default::default() + deletion_vector: None, + partition_values: None, + tags: None, + base_row_id: None, + default_row_commit_version: None, + size: None, })); } @@ -302,14 +308,17 @@ mod tests { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 1); - assert!(tombstones.contains(&crate::protocol::Remove { + assert!(tombstones.contains(&crate::kernel::Remove { path: "part-00001-911a94a2-43f6-4acb-8620-5e68c2654989-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1615043776198), data_change: true, extended_file_metadata: Some(true), partition_values: Some(HashMap::new()), size: Some(445), - ..Default::default() + base_row_id: None, + default_row_commit_version: None, + deletion_vector: None, + tags: None, })); } diff --git a/crates/deltalake-core/src/operations/create.rs b/crates/deltalake-core/src/operations/create.rs index 8a78f2266b..1dc9fdf8b2 100644 --- a/crates/deltalake-core/src/operations/create.rs +++ b/crates/deltalake-core/src/operations/create.rs @@ -10,8 +10,8 @@ use serde_json::{Map, Value}; use super::transaction::commit; use super::{MAX_SUPPORTED_READER_VERSION, MAX_SUPPORTED_WRITER_VERSION}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, DeltaOperation, MetaData, Protocol, SaveMode}; -use crate::schema::{SchemaDataType, SchemaField, SchemaTypeStruct}; +use crate::kernel::{Action, DataType, Metadata, Protocol, StructField, StructType}; +use crate::protocol::{DeltaOperation, SaveMode}; use crate::storage::DeltaObjectStore; use crate::table::builder::ensure_table_uri; use crate::table::config::DeltaConfigKey; @@ -51,7 +51,7 @@ pub struct CreateBuilder { location: Option, mode: SaveMode, comment: Option, - columns: Vec, + columns: Vec, partition_columns: Option>, storage_options: Option>, actions: Vec, @@ -114,23 +114,22 @@ impl CreateBuilder { pub fn with_column( mut self, name: impl Into, - data_type: SchemaDataType, + data_type: DataType, nullable: bool, metadata: Option>, ) -> Self { - self.columns.push(SchemaField::new( - name.into(), - data_type, - nullable, - metadata.unwrap_or_default(), - )); + let mut field = StructField::new(name.into(), data_type, nullable); + if let Some(meta) = metadata { + field = field.with_metadata(meta); + }; + self.columns.push(field); self } /// Specify columns to append to schema pub fn with_columns( mut self, - columns: impl IntoIterator>, + columns: impl IntoIterator>, ) -> Self { self.columns.extend(columns.into_iter().map(|c| c.into())); self @@ -212,7 +211,7 @@ impl CreateBuilder { if self .actions .iter() - .any(|a| matches!(a, Action::metaData(_))) + .any(|a| matches!(a, Action::Metadata(_))) { return 
Err(CreateError::MetadataSpecified.into()); } @@ -242,9 +241,9 @@ impl CreateBuilder { let protocol = self .actions .iter() - .find(|a| matches!(a, Action::protocol(_))) + .find(|a| matches!(a, Action::Protocol(_))) .map(|a| match a { - Action::protocol(p) => p.clone(), + Action::Protocol(p) => p.clone(), _ => unreachable!(), }) .unwrap_or_else(|| Protocol { @@ -258,7 +257,7 @@ impl CreateBuilder { self.name, self.comment, None, - SchemaTypeStruct::new(self.columns), + StructType::new(self.columns), self.partition_columns.unwrap_or_default(), self.configuration, ); @@ -271,13 +270,13 @@ impl CreateBuilder { }; let mut actions = vec![ - Action::protocol(protocol), - Action::metaData(MetaData::try_from(metadata)?), + Action::Protocol(protocol), + Action::Metadata(Metadata::try_from(metadata)?), ]; actions.extend( self.actions .into_iter() - .filter(|a| !matches!(a, Action::protocol(_))), + .filter(|a| !matches!(a, Action::Protocol(_))), ); Ok((table, actions, operation)) @@ -340,7 +339,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -360,7 +359,7 @@ mod tests { .await .unwrap() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -378,7 +377,7 @@ mod tests { ); let table = CreateBuilder::new() .with_location(format!("./{relative_path}")) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -389,7 +388,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -406,8 +405,8 @@ mod tests { }; let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.get_fields().clone()) - .with_actions(vec![Action::protocol(protocol)]) + .with_columns(schema.fields().clone()) + .with_actions(vec![Action::Protocol(protocol)]) .await .unwrap(); assert_eq!(table.get_min_reader_version(), 0); @@ -415,7 +414,7 @@ mod tests { let table = CreateBuilder::new() .with_location("memory://") - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_configuration_property(DeltaConfigKey::AppendOnly, Some("true")) .await .unwrap(); @@ -438,7 +437,7 @@ mod tests { let schema = get_delta_schema(); let table = CreateBuilder::new() .with_location(tmp_dir.path().to_str().unwrap()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -449,7 +448,7 @@ mod tests { // Check an error is raised when a table exists at location let table = CreateBuilder::new() .with_object_store(object_store.clone()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_save_mode(SaveMode::ErrorIfExists) .await; assert!(table.is_err()); @@ -457,7 +456,7 @@ mod tests { // Check current table is returned when ignore option is chosen. 
let table = CreateBuilder::new() .with_object_store(object_store.clone()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_save_mode(SaveMode::Ignore) .await .unwrap(); @@ -466,7 +465,7 @@ mod tests { // Check table is overwritten let table = CreateBuilder::new() .with_object_store(object_store.clone()) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().iter().cloned()) .with_save_mode(SaveMode::Overwrite) .await .unwrap(); diff --git a/crates/deltalake-core/src/operations/delete.rs b/crates/deltalake-core/src/operations/delete.rs index d387024673..7f8be1f293 100644 --- a/crates/deltalake-core/src/operations/delete.rs +++ b/crates/deltalake-core/src/operations/delete.rs @@ -17,11 +17,10 @@ //! .await?; //! ```` +use std::collections::HashMap; use std::sync::Arc; use std::time::{Instant, SystemTime, UNIX_EPOCH}; -use crate::delta_datafusion::expr::fmt_expr_to_sql; -use crate::protocol::{Action, Add, Remove}; use datafusion::execution::context::{SessionContext, SessionState}; use datafusion::physical_expr::create_physical_expr; use datafusion::physical_plan::filter::FilterExec; @@ -32,11 +31,12 @@ use datafusion_common::DFSchema; use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::Map; use serde_json::Value; +use crate::delta_datafusion::expr::fmt_expr_to_sql; use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; use crate::errors::{DeltaResult, DeltaTableError}; +use crate::kernel::{Action, Add, Remove}; use crate::operations::transaction::commit; use crate::operations::write::write_execution_plan; use crate::protocol::DeltaOperation; @@ -60,7 +60,7 @@ pub struct DeleteBuilder { /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, } #[derive(Default, Debug, Serialize)] @@ -112,7 +112,7 @@ impl DeleteBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -191,7 +191,7 @@ async fn execute( snapshot: &DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + app_metadata: Option>, ) -> DeltaResult<((Vec, i64), DeleteMetrics)> { let exec_start = Instant::now(); let mut metrics = DeleteMetrics::default(); @@ -226,21 +226,23 @@ async fn execute( .unwrap() .as_millis() as i64; - let mut actions: Vec = add.into_iter().map(Action::add).collect(); + let mut actions: Vec = add.into_iter().map(Action::Add).collect(); let mut version = snapshot.version(); metrics.num_removed_files = remove.len(); metrics.num_added_files = actions.len(); for action in remove { - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: action.path, deletion_timestamp: Some(deletion_timestamp), data_change: true, extended_file_metadata: Some(true), partition_values: Some(action.partition_values), size: Some(action.size), - deletion_vector: None, + deletion_vector: action.deletion_vector, tags: None, + base_row_id: action.base_row_id, + default_row_commit_version: action.default_row_commit_version, })) } @@ -334,7 +336,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions.unwrap_or_default()) 
.await .unwrap(); diff --git a/crates/deltalake-core/src/operations/filesystem_check.rs b/crates/deltalake-core/src/operations/filesystem_check.rs index 83af12b57c..65716bbfe1 100644 --- a/crates/deltalake-core/src/operations/filesystem_check.rs +++ b/crates/deltalake-core/src/operations/filesystem_check.rs @@ -26,8 +26,9 @@ use serde::Serialize; use url::{ParseError, Url}; use crate::errors::{DeltaResult, DeltaTableError}; +use crate::kernel::{Action, Add, Remove}; use crate::operations::transaction::commit; -use crate::protocol::{Action, Add, DeltaOperation, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::DeltaObjectStore; use crate::table::state::DeltaTableState; use crate::DeltaTable; @@ -140,7 +141,7 @@ impl FileSystemCheckPlan { let deletion_time = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); let deletion_time = deletion_time.as_millis() as i64; removed_file_paths.push(file.path.clone()); - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: file.path, deletion_timestamp: Some(deletion_time), data_change: true, @@ -149,6 +150,8 @@ impl FileSystemCheckPlan { size: Some(file.size), deletion_vector: None, tags: file.tags, + base_row_id: file.base_row_id, + default_row_commit_version: file.default_row_commit_version, })); } diff --git a/crates/deltalake-core/src/operations/merge.rs b/crates/deltalake-core/src/operations/merge.rs index a51e7649fc..57621cb316 100644 --- a/crates/deltalake-core/src/operations/merge.rs +++ b/crates/deltalake-core/src/operations/merge.rs @@ -61,21 +61,19 @@ use datafusion_physical_expr::{create_physical_expr, expressions, PhysicalExpr}; use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::{Map, Value}; +use serde_json::Value; use super::datafusion_utils::{into_expr, maybe_into_expr, Expression}; use super::transaction::commit; use crate::delta_datafusion::expr::{fmt_expr_to_sql, parse_predicate_expression}; use crate::delta_datafusion::{register_store, DeltaScanBuilder}; +use crate::kernel::{Action, Remove}; use crate::operations::datafusion_utils::MetricObserverExec; -use crate::{ - operations::write::write_execution_plan, - storage::{DeltaObjectStore, ObjectStoreRef}, - DeltaResult, DeltaTable, DeltaTableError, -}; - -use crate::protocol::{Action, DeltaOperation, MergePredicate, Remove}; +use crate::operations::write::write_execution_plan; +use crate::protocol::{DeltaOperation, MergePredicate}; +use crate::storage::{DeltaObjectStore, ObjectStoreRef}; use crate::table::state::DeltaTableState; +use crate::{DeltaResult, DeltaTable, DeltaTableError}; const OPERATION_COLUMN: &str = "__delta_rs_operation"; const DELETE_COLUMN: &str = "__delta_rs_delete"; @@ -115,7 +113,7 @@ pub struct MergeBuilder { /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, /// safe_cast determines how data types that do not match the underlying table are handled /// By default an error is returned safe_cast: bool, @@ -343,7 +341,7 @@ impl MergeBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -567,7 +565,7 @@ async fn execute( snapshot: &DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + app_metadata: Option>, safe_cast: bool, source_alias: 
Option, target_alias: Option, @@ -843,7 +841,7 @@ async fn execute( let mut projection_map = HashMap::new(); let mut f = project_schema_df.fields().clone(); - for delta_field in snapshot.schema().unwrap().get_fields() { + for delta_field in snapshot.schema().unwrap().fields() { let mut when_expr = Vec::with_capacity(operations_size); let mut then_expr = Vec::with_capacity(operations_size); @@ -853,7 +851,7 @@ async fn execute( }), None => TableReference::none(), }; - let name = delta_field.get_name(); + let name = delta_field.name(); let column = Column::new(qualifier.clone(), name); let field = project_schema_df.field_with_name(qualifier.as_ref(), name)?; @@ -882,8 +880,8 @@ async fn execute( state.execution_props(), )?; - projection_map.insert(delta_field.get_name(), expressions.len()); - let name = "__delta_rs_c_".to_owned() + delta_field.get_name(); + projection_map.insert(delta_field.name(), expressions.len()); + let name = "__delta_rs_c_".to_owned() + delta_field.name(); f.push(DFField::new_unqualified( &name, @@ -1143,12 +1141,12 @@ async fn execute( .unwrap() .as_millis() as i64; - let mut actions: Vec = add_actions.into_iter().map(Action::add).collect(); + let mut actions: Vec = add_actions.into_iter().map(Action::Add).collect(); metrics.num_target_files_added = actions.len(); for action in snapshot.files() { metrics.num_target_files_removed += 1; - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: action.path.clone(), deletion_timestamp: Some(deletion_timestamp), data_change: true, @@ -1157,6 +1155,8 @@ async fn execute( deletion_vector: action.deletion_vector.clone(), size: Some(action.size), tags: None, + base_row_id: action.base_row_id, + default_row_commit_version: action.default_row_commit_version, })) } @@ -1270,7 +1270,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); diff --git a/crates/deltalake-core/src/operations/optimize.rs b/crates/deltalake-core/src/operations/optimize.rs index ae9ab6cd65..7feecd1e56 100644 --- a/crates/deltalake-core/src/operations/optimize.rs +++ b/crates/deltalake-core/src/operations/optimize.rs @@ -37,12 +37,12 @@ use parquet::basic::{Compression, ZstdLevel}; use parquet::errors::ParquetError; use parquet::file::properties::WriterProperties; use serde::{Deserialize, Serialize}; -use serde_json::Map; use super::transaction::commit; use super::writer::{PartitionWriter, PartitionWriterConfig}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{self, Action, DeltaOperation}; +use crate::kernel::{Action, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::ObjectStoreRef; use crate::table::state::DeltaTableState; use crate::writer::utils::arrow_schema_without_partitions; @@ -311,7 +311,7 @@ fn create_remove( let deletion_time = SystemTime::now().duration_since(UNIX_EPOCH).unwrap(); let deletion_time = deletion_time.as_millis() as i64; - Ok(Action::remove(protocol::Remove { + Ok(Action::Remove(Remove { path: path.to_string(), deletion_timestamp: Some(deletion_time), data_change: false, @@ -320,6 +320,8 @@ fn create_remove( size: Some(size), deletion_vector: None, tags: None, + base_row_id: None, + default_row_commit_version: None, })) } @@ -450,7 +452,7 @@ impl MergePlan { partial_metrics.files_added.max = std::cmp::max(partial_metrics.files_added.max, size); partial_metrics.files_added.min = 
std::cmp::min(partial_metrics.files_added.min, size); - Action::add(add) + Action::Add(add) }); partial_actions.extend(add_actions); @@ -703,7 +705,7 @@ impl MergePlan { last_commit = now; buffered_metrics.preserve_insertion_order = true; - let mut metadata = Map::new(); + let mut metadata = HashMap::new(); metadata.insert("readVersion".to_owned(), self.read_table_version.into()); let maybe_map_metrics = serde_json::to_value(std::mem::replace( &mut buffered_metrics, @@ -791,12 +793,14 @@ pub fn create_merge_plan( let input_parameters = OptimizeInput { target_size }; let file_schema = arrow_schema_without_partitions( - &Arc::new(>::try_from( - &snapshot - .current_metadata() - .ok_or(DeltaTableError::NoMetadata)? - .schema, - )?), + &Arc::new( + >::try_from( + &snapshot + .current_metadata() + .ok_or(DeltaTableError::NoMetadata)? + .schema, + )?, + ), partitions_keys, ); @@ -943,9 +947,9 @@ fn build_zorder_plan( .current_metadata() .unwrap() .schema - .get_fields() + .fields() .iter() - .map(|field| field.get_name().to_string()) + .map(|field| field.name().to_string()) .collect_vec(); let unknown_columns = zorder_columns .iter() diff --git a/crates/deltalake-core/src/operations/restore.rs b/crates/deltalake-core/src/operations/restore.rs index 1f4de3a06c..a356b5b312 100644 --- a/crates/deltalake-core/src/operations/restore.rs +++ b/crates/deltalake-core/src/operations/restore.rs @@ -30,8 +30,9 @@ use object_store::path::Path; use object_store::ObjectStore; use serde::Serialize; +use crate::kernel::{Action, Add, Protocol, Remove}; use crate::operations::transaction::{prepare_commit, try_commit_transaction, TransactionError}; -use crate::protocol::{Action, Add, DeltaOperation, Protocol, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::ObjectStoreRef; use crate::table::state::DeltaTableState; use crate::{DeltaResult, DeltaTable, DeltaTableConfig, DeltaTableError, ObjectStoreError}; @@ -187,6 +188,8 @@ async fn execute( size: Some(a.size), tags: a.tags, deletion_vector: a.deletion_vector, + base_row_id: a.base_row_id, + default_row_commit_version: a.default_row_commit_version, } }) .collect(); @@ -230,9 +233,9 @@ async fn execute( reader_features: snapshot.reader_features().cloned(), } }; - actions.push(Action::protocol(protocol)); - actions.extend(files_to_add.into_iter().map(Action::add)); - actions.extend(files_to_remove.into_iter().map(Action::remove)); + actions.push(Action::Protocol(protocol)); + actions.extend(files_to_add.into_iter().map(Action::Add)); + actions.extend(files_to_remove.into_iter().map(Action::Remove)); let commit = prepare_commit( object_store.as_ref(), diff --git a/crates/deltalake-core/src/operations/transaction/conflict_checker.rs b/crates/deltalake-core/src/operations/transaction/conflict_checker.rs index 6bbc2a9d45..3a0bf0526d 100644 --- a/crates/deltalake-core/src/operations/transaction/conflict_checker.rs +++ b/crates/deltalake-core/src/operations/transaction/conflict_checker.rs @@ -6,7 +6,8 @@ use object_store::ObjectStore; use super::CommitInfo; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, Add, DeltaOperation, MetaData, Protocol, Remove}; +use crate::kernel::{Action, Add, Metadata, Protocol, Remove}; +use crate::protocol::DeltaOperation; use crate::storage::commit_uri_from_version; use crate::table::config::IsolationLevel; use crate::table::state::DeltaTableState; @@ -169,7 +170,7 @@ impl<'a> TransactionInfo<'a> { pub fn metadata_changed(&self) -> bool { self.actions .iter() - .any(|a| matches!(a, 
Action::metaData(_))) + .any(|a| matches!(a, Action::Metadata(_))) } #[cfg(feature = "datafusion")] @@ -236,9 +237,9 @@ impl WinningCommitSummary { let commit_info = actions .iter() - .find(|action| matches!(action, Action::commitInfo(_))) + .find(|action| matches!(action, Action::CommitInfo(_))) .map(|action| match action { - Action::commitInfo(info) => info.clone(), + Action::CommitInfo(info) => info.clone(), _ => unreachable!(), }); @@ -248,12 +249,12 @@ impl WinningCommitSummary { }) } - pub fn metadata_updates(&self) -> Vec { + pub fn metadata_updates(&self) -> Vec { self.actions .iter() .cloned() .filter_map(|action| match action { - Action::metaData(metadata) => Some(metadata), + Action::Metadata(metadata) => Some(metadata), _ => None, }) .collect() @@ -264,7 +265,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::txn(txn) => Some(txn.app_id), + Action::Txn(txn) => Some(txn.app_id), _ => None, }) .collect() @@ -275,7 +276,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::protocol(protocol) => Some(protocol), + Action::Protocol(protocol) => Some(protocol), _ => None, }) .collect() @@ -286,7 +287,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::remove(remove) => Some(remove), + Action::Remove(remove) => Some(remove), _ => None, }) .collect() @@ -297,7 +298,7 @@ impl WinningCommitSummary { .iter() .cloned() .filter_map(|action| match action { - Action::add(add) => Some(add), + Action::Add(add) => Some(add), _ => None, }) .collect() @@ -414,7 +415,7 @@ impl<'a> ConflictChecker<'a> { .txn_info .actions .iter() - .any(|a| matches!(a, Action::protocol(_))) + .any(|a| matches!(a, Action::Protocol(_))) { return Err(CommitConflictError::ProtocolChanged( "protocol changed".into(), @@ -546,7 +547,7 @@ impl<'a> ConflictChecker<'a> { .iter() .cloned() .filter_map(|action| match action { - Action::remove(remove) => Some(remove.path), + Action::Remove(remove) => Some(remove.path), _ => None, }) .collect(); @@ -620,8 +621,8 @@ pub(super) fn can_downgrade_to_snapshot_isolation<'a>( let mut has_non_file_actions = false; for action in actions { match action { - Action::add(act) if act.data_change => data_changed = true, - Action::remove(rem) if rem.data_change => data_changed = true, + Action::Add(act) if act.data_change => data_changed = true, + Action::Remove(rem) if rem.data_change => data_changed = true, _ => has_non_file_actions = true, } } @@ -644,7 +645,7 @@ mod tests { use super::super::test_utils as tu; use super::super::test_utils::init_table_actions; use super::*; - use crate::protocol::Action; + use crate::kernel::Action; #[cfg(feature = "datafusion")] use datafusion_expr::{col, lit}; use serde_json::json; diff --git a/crates/deltalake-core/src/operations/transaction/mod.rs b/crates/deltalake-core/src/operations/transaction/mod.rs index 738ae404ec..c31c349fd7 100644 --- a/crates/deltalake-core/src/operations/transaction/mod.rs +++ b/crates/deltalake-core/src/operations/transaction/mod.rs @@ -1,13 +1,16 @@ //! 
Delta transactions +use std::collections::HashMap; + use chrono::Utc; use conflict_checker::ConflictChecker; use object_store::path::Path; use object_store::{Error as ObjectStoreError, ObjectStore}; -use serde_json::{Map, Value}; +use serde_json::Value; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, CommitInfo, DeltaOperation}; +use crate::kernel::{Action, CommitInfo}; +use crate::protocol::DeltaOperation; use crate::storage::commit_uri_from_version; use crate::table::state::DeltaTableState; @@ -79,7 +82,7 @@ fn log_entry_from_actions<'a>( let mut jsons = Vec::::new(); for action in actions { if append_only { - if let Action::remove(remove) = action { + if let Action::Remove(remove) = action { if remove.data_change { return Err(TransactionError::DeltaTableAppendOnly); } @@ -96,24 +99,24 @@ pub(crate) fn get_commit_bytes( operation: &DeltaOperation, actions: &Vec, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, ) -> Result { - if !actions.iter().any(|a| matches!(a, Action::commitInfo(..))) { - let mut extra_info = Map::::new(); + if !actions.iter().any(|a| matches!(a, Action::CommitInfo(..))) { + let mut extra_info = HashMap::::new(); let mut commit_info = operation.get_commit_info(); commit_info.timestamp = Some(Utc::now().timestamp_millis()); extra_info.insert( "clientVersion".to_string(), Value::String(format!("delta-rs.{}", crate_version())), ); - if let Some(mut meta) = app_metadata { - extra_info.append(&mut meta) + if let Some(meta) = app_metadata { + extra_info.extend(meta) } commit_info.info = extra_info; Ok(bytes::Bytes::from(log_entry_from_actions( actions .iter() - .chain(std::iter::once(&Action::commitInfo(commit_info))), + .chain(std::iter::once(&Action::CommitInfo(commit_info))), read_snapshot, )?)) } else { @@ -132,7 +135,7 @@ pub(crate) async fn prepare_commit<'a>( operation: &DeltaOperation, actions: &Vec, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, ) -> Result { // Serialize all actions that are part of this log entry. 
let log_entry = get_commit_bytes(operation, actions, read_snapshot, app_metadata)?; @@ -180,7 +183,7 @@ pub async fn commit( actions: &Vec, operation: DeltaOperation, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, ) -> DeltaResult { commit_with_retries(storage, actions, operation, read_snapshot, app_metadata, 15).await } @@ -194,7 +197,7 @@ pub async fn commit_with_retries( actions: &Vec, operation: DeltaOperation, read_snapshot: &DeltaTableState, - app_metadata: Option>, + app_metadata: Option>, max_retries: usize, ) -> DeltaResult { let tmp_commit = diff --git a/crates/deltalake-core/src/operations/transaction/state.rs b/crates/deltalake-core/src/operations/transaction/state.rs index bb9c3ff35e..a209b7369d 100644 --- a/crates/deltalake-core/src/operations/transaction/state.rs +++ b/crates/deltalake-core/src/operations/transaction/state.rs @@ -20,7 +20,7 @@ use crate::delta_datafusion::{ get_null_of_arrow_type, logical_expr_to_physical_expr, to_correct_scalar_value, }; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::Add; +use crate::kernel::Add; use crate::table::state::DeltaTableState; impl DeltaTableState { @@ -33,15 +33,15 @@ impl DeltaTableState { let meta = self.current_metadata().ok_or(DeltaTableError::NoMetadata)?; let fields = meta .schema - .get_fields() + .fields() .iter() - .filter(|f| !meta.partition_columns.contains(&f.get_name().to_string())) + .filter(|f| !meta.partition_columns.contains(&f.name().to_string())) .map(|f| f.try_into()) .chain( meta.schema - .get_fields() + .fields() .iter() - .filter(|f| meta.partition_columns.contains(&f.get_name().to_string())) + .filter(|f| meta.partition_columns.contains(&f.name().to_string())) .map(|f| { let field = ArrowField::try_from(f)?; let corrected = if wrap_partitions { diff --git a/crates/deltalake-core/src/operations/transaction/test_utils.rs b/crates/deltalake-core/src/operations/transaction/test_utils.rs index e4ae14b2ed..b52b1a1c7b 100644 --- a/crates/deltalake-core/src/operations/transaction/test_utils.rs +++ b/crates/deltalake-core/src/operations/transaction/test_utils.rs @@ -1,31 +1,49 @@ #![allow(unused)] use std::collections::HashMap; -use super::{prepare_commit, try_commit_transaction, CommitInfo}; -use crate::protocol::{Action, Add, DeltaOperation, MetaData, Protocol, Remove, SaveMode}; +use super::{prepare_commit, try_commit_transaction}; +use crate::kernel::{ + Action, Add, CommitInfo, DataType, Metadata, PrimitiveType, Protocol, Remove, StructField, + StructType, +}; +use crate::protocol::{DeltaOperation, SaveMode}; use crate::table::state::DeltaTableState; use crate::table::DeltaTableMetaData; -use crate::{DeltaTable, DeltaTableBuilder, Schema, SchemaDataType, SchemaField}; +use crate::{DeltaTable, DeltaTableBuilder}; pub fn create_add_action( path: impl Into, data_change: bool, stats: Option, ) -> Action { - Action::add(Add { + Action::Add(Add { path: path.into(), size: 100, data_change, stats, - ..Default::default() + modification_time: -1, + partition_values: Default::default(), + partition_values_parsed: None, + stats_parsed: None, + base_row_id: None, + default_row_commit_version: None, + tags: None, + deletion_vector: None, }) } pub fn create_remove_action(path: impl Into, data_change: bool) -> Action { - Action::remove(Remove { + Action::Remove(Remove { path: path.into(), data_change, - ..Default::default() + size: None, + deletion_timestamp: None, + deletion_vector: None, + partition_values: Default::default(), + extended_file_metadata: None, + 
base_row_id: None, + default_row_commit_version: None, + tags: None, }) } @@ -36,31 +54,28 @@ pub fn create_protocol_action(max_reader: Option, max_writer: Option) writer_features: None, reader_features: None, }; - Action::protocol(protocol) + Action::Protocol(protocol) } pub fn create_metadata_action( parttiton_columns: Option>, configuration: Option>>, ) -> Action { - let table_schema = Schema::new(vec![ - SchemaField::new( + let table_schema = StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]); let metadata = DeltaTableMetaData::new( @@ -71,7 +86,7 @@ pub fn create_metadata_action( parttiton_columns.unwrap_or_default(), configuration.unwrap_or_default(), ); - Action::metaData(MetaData::try_from(metadata).unwrap()) + Action::Metadata(Metadata::try_from(metadata).unwrap()) } pub fn init_table_actions(configuration: Option>>) -> Vec { @@ -96,7 +111,7 @@ pub fn init_table_actions(configuration: Option>> let commit_info = serde_json::from_str::(raw).unwrap(); vec![ - Action::commitInfo(commit_info), + Action::CommitInfo(commit_info), create_protocol_action(None, None), create_metadata_action(None, configuration), ] @@ -109,24 +124,21 @@ pub async fn create_initialized_table( let storage = DeltaTableBuilder::from_uri("memory://") .build_storage() .unwrap(); - let table_schema = Schema::new(vec![ - SchemaField::new( + let table_schema = StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]); let state = DeltaTableState::from_actions(init_table_actions(None), 0).unwrap(); diff --git a/crates/deltalake-core/src/operations/update.rs b/crates/deltalake-core/src/operations/update.rs index 8b7ec915f3..1723d287a2 100644 --- a/crates/deltalake-core/src/operations/update.rs +++ b/crates/deltalake-core/src/operations/update.rs @@ -41,21 +41,18 @@ use datafusion_physical_expr::{ use futures::future::BoxFuture; use parquet::file::properties::WriterProperties; use serde::Serialize; -use serde_json::{Map, Value}; - -use crate::{ - delta_datafusion::{expr::fmt_expr_to_sql, find_files, register_store, DeltaScanBuilder}, - protocol::{Action, DeltaOperation, Remove}, - storage::{DeltaObjectStore, ObjectStoreRef}, - table::state::DeltaTableState, - DeltaResult, DeltaTable, DeltaTableError, -}; - -use super::{ - datafusion_utils::{Expression, MetricObserverExec}, - transaction::commit, - write::write_execution_plan, -}; +use serde_json::Value; + +use super::datafusion_utils::{Expression, MetricObserverExec}; +use super::transaction::commit; +use super::write::write_execution_plan; +use 
crate::delta_datafusion::expr::fmt_expr_to_sql; +use crate::delta_datafusion::{find_files, register_store, DeltaScanBuilder}; +use crate::kernel::{Action, Remove}; +use crate::protocol::DeltaOperation; +use crate::storage::{DeltaObjectStore, ObjectStoreRef}; +use crate::table::state::DeltaTableState; +use crate::{DeltaResult, DeltaTable, DeltaTableError}; /// Updates records in the Delta Table. /// See this module's documentation for more information @@ -73,7 +70,7 @@ pub struct UpdateBuilder { /// Properties passed to underlying parquet writer for when files are rewritten writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, /// safe_cast determines how data types that do not match the underlying table are handled /// By default an error is returned safe_cast: bool, @@ -138,7 +135,7 @@ impl UpdateBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -171,7 +168,7 @@ async fn execute( snapshot: &DeltaTableState, state: SessionState, writer_properties: Option, - app_metadata: Option>, + app_metadata: Option>, safe_cast: bool, ) -> DeltaResult<((Vec, i64), UpdateMetrics)> { // Validate the predicate and update expressions. @@ -384,13 +381,13 @@ async fn execute( .duration_since(UNIX_EPOCH) .unwrap() .as_millis() as i64; - let mut actions: Vec = add_actions.into_iter().map(Action::add).collect(); + let mut actions: Vec = add_actions.into_iter().map(Action::Add).collect(); metrics.num_added_files = actions.len(); metrics.num_removed_files = candidates.candidates.len(); for action in candidates.candidates { - actions.push(Action::remove(Remove { + actions.push(Action::Remove(Remove { path: action.path, deletion_timestamp: Some(deletion_timestamp), data_change: true, @@ -399,6 +396,8 @@ async fn execute( size: Some(action.size), deletion_vector: action.deletion_vector, tags: None, + base_row_id: None, + default_row_commit_version: None, })) } @@ -480,7 +479,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions.unwrap_or_default()) .await .unwrap(); diff --git a/crates/deltalake-core/src/operations/vacuum.rs b/crates/deltalake-core/src/operations/vacuum.rs index 684e6f6d0a..47f7c1d5c9 100644 --- a/crates/deltalake-core/src/operations/vacuum.rs +++ b/crates/deltalake-core/src/operations/vacuum.rs @@ -21,7 +21,7 @@ //! let (table, metrics) = VacuumBuilder::new(table.object_store(). table.state).await?; //! 
```` -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::sync::Arc; @@ -31,12 +31,13 @@ use futures::{StreamExt, TryStreamExt}; use object_store::Error; use object_store::{path::Path, ObjectStore}; use serde::Serialize; -use serde_json::{Map, Value}; +use serde_json::Value; use super::transaction::commit; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, DeltaOperation}; // Txn CommitInfo +use crate::kernel::Action; +use crate::protocol::DeltaOperation; use crate::storage::DeltaObjectStore; use crate::table::state::DeltaTableState; use crate::DeltaTable; @@ -288,7 +289,7 @@ impl VacuumPlan { // Begin VACUUM START COMMIT let mut commit_info = start_operation.get_commit_info(); - let mut extra_info = Map::::new(); + let mut extra_info = HashMap::::new(); commit_info.timestamp = Some(Utc::now().timestamp_millis()); extra_info.insert( @@ -300,7 +301,7 @@ impl VacuumPlan { } commit_info.info = extra_info; - let start_actions = vec![Action::commitInfo(commit_info)]; + let start_actions = vec![Action::CommitInfo(commit_info)]; commit(store, &start_actions, start_operation, snapshot, None).await?; // Finish VACUUM START COMMIT @@ -327,7 +328,7 @@ impl VacuumPlan { // Begin VACUUM END COMMIT let mut commit_info = end_operation.get_commit_info(); - let mut extra_info = Map::::new(); + let mut extra_info = HashMap::::new(); commit_info.timestamp = Some(Utc::now().timestamp_millis()); extra_info.insert( @@ -339,7 +340,7 @@ impl VacuumPlan { } commit_info.info = extra_info; - let end_actions = vec![Action::commitInfo(commit_info)]; + let end_actions = vec![Action::CommitInfo(commit_info)]; commit(store, &end_actions, end_operation, snapshot, None).await?; // Finish VACUUM END COMMIT diff --git a/crates/deltalake-core/src/operations/write.rs b/crates/deltalake-core/src/operations/write.rs index 31723cc235..45bdaaeff5 100644 --- a/crates/deltalake-core/src/operations/write.rs +++ b/crates/deltalake-core/src/operations/write.rs @@ -37,15 +37,14 @@ use datafusion::physical_plan::{memory::MemoryExec, ExecutionPlan}; use futures::future::BoxFuture; use futures::StreamExt; use parquet::file::properties::WriterProperties; -use serde_json::Map; use super::writer::{DeltaWriter, WriterConfig}; use super::MAX_SUPPORTED_WRITER_VERSION; use super::{transaction::commit, CreateBuilder}; use crate::delta_datafusion::DeltaDataChecker; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::{Action, Add, DeltaOperation, Remove, SaveMode}; -use crate::schema::Schema; +use crate::kernel::{Action, Add, Remove, StructType}; +use crate::protocol::{DeltaOperation, SaveMode}; use crate::storage::{DeltaObjectStore, ObjectStoreRef}; use crate::table::state::DeltaTableState; use crate::writer::record_batch::divide_by_partition_values; @@ -113,7 +112,7 @@ pub struct WriteBuilder { /// Parquet writer properties writer_properties: Option, /// Additional metadata to be added to commit - app_metadata: Option>, + app_metadata: Option>, } impl WriteBuilder { @@ -206,7 +205,7 @@ impl WriteBuilder { mut self, metadata: impl IntoIterator, ) -> Self { - self.app_metadata = Some(Map::from_iter(metadata)); + self.app_metadata = Some(HashMap::from_iter(metadata)); self } @@ -226,7 +225,7 @@ impl WriteBuilder { } } false => { - let schema: Schema = if let Some(plan) = &self.input { + let schema: StructType = if let Some(plan) = &self.input { Ok(plan.schema().try_into()?) 
} else if let Some(batches) = &self.batches { if batches.is_empty() { @@ -238,7 +237,7 @@ impl WriteBuilder { }?; let mut builder = CreateBuilder::new() .with_object_store(self.store.clone()) - .with_columns(schema.get_fields().clone()); + .with_columns(schema.fields().clone()); if let Some(partition_columns) = self.partition_columns.as_ref() { builder = builder.with_partition_columns(partition_columns.clone()) } @@ -426,7 +425,7 @@ impl std::future::IntoFuture for WriteBuilder { this.safe_cast, ) .await?; - actions.extend(add_actions.into_iter().map(Action::add)); + actions.extend(add_actions.into_iter().map(Action::Add)); // Collect remove actions if we are overwriting the table if matches!(this.mode, SaveMode::Overwrite) { @@ -437,7 +436,7 @@ impl std::future::IntoFuture for WriteBuilder { .as_millis() as i64; let to_remove_action = |add: &Add| { - Action::remove(Remove { + Action::Remove(Remove { path: add.path.clone(), deletion_timestamp: Some(deletion_timestamp), data_change: true, @@ -447,6 +446,8 @@ impl std::future::IntoFuture for WriteBuilder { // TODO add file metadata to remove action (tags missing) tags: None, deletion_vector: add.deletion_vector.clone(), + base_row_id: add.base_row_id, + default_row_commit_version: add.default_row_commit_version, }) }; @@ -599,14 +600,14 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); assert_eq!(table.state.commit_infos().len(), 1); // write some data - let metadata = Map::from_iter(vec![("k1".to_string(), json!("v1.1"))]); + let metadata = HashMap::from_iter(vec![("k1".to_string(), json!("v1.1"))]); let mut table = DeltaOps(table) .write(vec![batch.clone()]) .with_save_mode(SaveMode::Append) @@ -623,12 +624,13 @@ mod tests { .clone() .into_iter() .filter(|(k, _)| k != "clientVersion") - .collect::>(), + .collect::>(), metadata ); // append some data - let metadata: Map = Map::from_iter(vec![("k1".to_string(), json!("v1.2"))]); + let metadata: HashMap = + HashMap::from_iter(vec![("k1".to_string(), json!("v1.2"))]); let mut table = DeltaOps(table) .write(vec![batch.clone()]) .with_save_mode(SaveMode::Append) @@ -645,12 +647,13 @@ mod tests { .clone() .into_iter() .filter(|(k, _)| k != "clientVersion") - .collect::>(), + .collect::>(), metadata ); // overwrite table - let metadata: Map = Map::from_iter(vec![("k2".to_string(), json!("v2.1"))]); + let metadata: HashMap = + HashMap::from_iter(vec![("k2".to_string(), json!("v2.1"))]); let mut table = DeltaOps(table) .write(vec![batch]) .with_save_mode(SaveMode::Overwrite) @@ -667,7 +670,7 @@ mod tests { .clone() .into_iter() .filter(|(k, _)| k != "clientVersion") - .collect::>(), + .collect::>(), metadata ); } @@ -807,7 +810,7 @@ mod tests { #[tokio::test] async fn test_check_invariants() { let batch = get_record_batch(None, false); - let schema: Schema = serde_json::from_value(json!({ + let schema: StructType = serde_json::from_value(json!({ "type": "struct", "fields": [ {"name": "id", "type": "string", "nullable": true, "metadata": {}}, @@ -821,7 +824,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -829,7 +832,7 @@ mod tests { let table = DeltaOps(table).write(vec![batch.clone()]).await.unwrap(); assert_eq!(table.version(), 1); - let schema: 
Schema = serde_json::from_value(json!({ + let schema: StructType = serde_json::from_value(json!({ "type": "struct", "fields": [ {"name": "id", "type": "string", "nullable": true, "metadata": {}}, @@ -843,7 +846,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() .with_save_mode(SaveMode::ErrorIfExists) - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); @@ -859,7 +862,7 @@ mod tests { let table = DeltaOps::new_in_memory() .create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .await .unwrap(); assert_eq!(table.version(), 0); diff --git a/crates/deltalake-core/src/operations/writer.rs b/crates/deltalake-core/src/operations/writer.rs index 05bda44ae6..0bba167e33 100644 --- a/crates/deltalake-core/src/operations/writer.rs +++ b/crates/deltalake-core/src/operations/writer.rs @@ -13,7 +13,7 @@ use parquet::file::properties::WriterProperties; use crate::crate_version; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::Add; +use crate::kernel::Add; use crate::storage::ObjectStoreRef; use crate::writer::record_batch::{divide_by_partition_values, PartitionResult}; use crate::writer::stats::create_add; diff --git a/crates/deltalake-core/src/protocol/checkpoints.rs b/crates/deltalake-core/src/protocol/checkpoints.rs index 5667b4e1b2..fc23c1d28b 100644 --- a/crates/deltalake-core/src/protocol/checkpoints.rs +++ b/crates/deltalake-core/src/protocol/checkpoints.rs @@ -4,9 +4,8 @@ use std::collections::HashMap; use std::convert::TryFrom; use std::iter::Iterator; -use arrow::datatypes::Schema as ArrowSchema; -use arrow::error::ArrowError; use arrow::json::ReaderBuilder; +use arrow_schema::{ArrowError, Schema as ArrowSchema}; use chrono::{Datelike, Utc}; use futures::{StreamExt, TryStreamExt}; @@ -18,9 +17,12 @@ use parquet::errors::ParquetError; use regex::Regex; use serde_json::Value; -use super::{time_utils, Action, Add as AddAction, MetaData, Protocol, ProtocolError, Txn}; -use crate::arrow_convert::delta_log_schema_for_table; -use crate::schema::*; +use super::{time_utils, ProtocolError}; +use crate::kernel::actions::arrow::delta_log_schema_for_table; +use crate::kernel::{ + Action, Add as AddAction, DataType, Metadata, PrimitiveType, Protocol, StructField, StructType, + Txn, +}; use crate::storage::DeltaObjectStore; use crate::table::state::DeltaTableState; use crate::table::{CheckPoint, CheckPointBuilder}; @@ -201,8 +203,11 @@ fn parquet_bytes_from_state( let partition_col_data_types = current_metadata.get_partition_col_data_types(); // Collect a map of paths that require special stats conversion. 
- let mut stats_conversions: Vec<(SchemaPath, SchemaDataType)> = Vec::new(); - collect_stats_conversions(&mut stats_conversions, current_metadata.schema.get_fields()); + let mut stats_conversions: Vec<(SchemaPath, DataType)> = Vec::new(); + collect_stats_conversions( + &mut stats_conversions, + current_metadata.schema.fields().as_slice(), + ); let mut tombstones = state.unexpired_tombstones().cloned().collect::>(); @@ -226,14 +231,14 @@ fn parquet_bytes_from_state( } // protocol - let jsons = std::iter::once(Action::protocol(Protocol { + let jsons = std::iter::once(Action::Protocol(Protocol { min_reader_version: state.min_reader_version(), min_writer_version: state.min_writer_version(), writer_features: None, reader_features: None, })) // metaData - .chain(std::iter::once(Action::metaData(MetaData::try_from( + .chain(std::iter::once(Action::Metadata(Metadata::try_from( current_metadata.clone(), )?))) // txns @@ -242,7 +247,7 @@ fn parquet_bytes_from_state( .app_transaction_version() .iter() .map(|(app_id, version)| { - Action::txn(Txn { + Action::Txn(Txn { app_id: app_id.clone(), version: *version, last_updated: None, @@ -259,7 +264,7 @@ fn parquet_bytes_from_state( r.extended_file_metadata = Some(false); } - Action::remove(r) + Action::Remove(r) })) .map(|a| serde_json::to_value(a).map_err(ProtocolError::from)) // adds @@ -269,7 +274,7 @@ fn parquet_bytes_from_state( // Create the arrow schema that represents the Checkpoint parquet file. let arrow_schema = delta_log_schema_for_table( - >::try_from(¤t_metadata.schema)?, + >::try_from(¤t_metadata.schema)?, current_metadata.partition_columns.as_slice(), use_extended_remove_schema, ); @@ -299,10 +304,10 @@ fn parquet_bytes_from_state( fn checkpoint_add_from_state( add: &AddAction, - partition_col_data_types: &[(&str, &SchemaDataType)], - stats_conversions: &[(SchemaPath, SchemaDataType)], + partition_col_data_types: &[(&String, &DataType)], + stats_conversions: &[(SchemaPath, DataType)], ) -> Result { - let mut v = serde_json::to_value(Action::add(add.clone())) + let mut v = serde_json::to_value(Action::Add(add.clone())) .map_err(|err| ArrowError::JsonError(err.to_string()))?; v["add"]["dataChange"] = Value::Bool(false); @@ -348,24 +353,27 @@ fn checkpoint_add_from_state( fn typed_partition_value_from_string( string_value: &str, - data_type: &SchemaDataType, + data_type: &DataType, ) -> Result { match data_type { - SchemaDataType::primitive(primitive_type) => match primitive_type.as_str() { - "string" | "binary" => Ok(string_value.to_owned().into()), - "long" | "integer" | "short" | "byte" => Ok(string_value + DataType::Primitive(primitive_type) => match primitive_type { + PrimitiveType::String | PrimitiveType::Binary => Ok(string_value.to_owned().into()), + PrimitiveType::Long + | PrimitiveType::Integer + | PrimitiveType::Short + | PrimitiveType::Byte => Ok(string_value .parse::() .map_err(|_| CheckpointError::PartitionValueNotParseable(string_value.to_owned()))? .into()), - "boolean" => Ok(string_value + PrimitiveType::Boolean => Ok(string_value .parse::() .map_err(|_| CheckpointError::PartitionValueNotParseable(string_value.to_owned()))? .into()), - "float" | "double" => Ok(string_value + PrimitiveType::Float | PrimitiveType::Double => Ok(string_value .parse::() .map_err(|_| CheckpointError::PartitionValueNotParseable(string_value.to_owned()))? 
.into()), - "date" => { + PrimitiveType::Date => { let d = chrono::naive::NaiveDate::parse_from_str(string_value, "%Y-%m-%d") .map_err(|_| { CheckpointError::PartitionValueNotParseable(string_value.to_owned()) @@ -373,7 +381,7 @@ fn typed_partition_value_from_string( // day 0 is 1970-01-01 (719163 days from ce) Ok((d.num_days_from_ce() - 719_163).into()) } - "timestamp" => { + PrimitiveType::Timestamp => { let ts = chrono::naive::NaiveDateTime::parse_from_str(string_value, "%Y-%m-%d %H:%M:%S") .map_err(|_| { @@ -395,7 +403,7 @@ fn typed_partition_value_from_string( fn typed_partition_value_from_option_string( string_value: &Option, - data_type: &SchemaDataType, + data_type: &DataType, ) -> Result { match string_value { Some(s) => { @@ -409,10 +417,7 @@ fn typed_partition_value_from_option_string( } } -fn collect_stats_conversions( - paths: &mut Vec<(SchemaPath, SchemaDataType)>, - fields: &[SchemaField], -) { +fn collect_stats_conversions(paths: &mut Vec<(SchemaPath, DataType)>, fields: &[StructField]) { let mut _path = SchemaPath::new(); fields .iter() @@ -421,20 +426,18 @@ fn collect_stats_conversions( fn collect_field_conversion( current_path: &mut SchemaPath, - all_paths: &mut Vec<(SchemaPath, SchemaDataType)>, - field: &SchemaField, + all_paths: &mut Vec<(SchemaPath, DataType)>, + field: &StructField, ) { - match field.get_type() { - SchemaDataType::primitive(type_name) => { - if let "timestamp" = type_name.as_str() { - let mut key_path = current_path.clone(); - key_path.push(field.get_name().to_owned()); - all_paths.push((key_path, field.get_type().to_owned())); - } + match field.data_type() { + DataType::Primitive(PrimitiveType::Timestamp) => { + let mut key_path = current_path.clone(); + key_path.push(field.name().to_owned()); + all_paths.push((key_path, field.data_type().to_owned())); } - SchemaDataType::r#struct(struct_field) => { - let struct_fields = struct_field.get_fields(); - current_path.push(field.get_name().to_owned()); + DataType::Struct(struct_field) => { + let struct_fields = struct_field.fields(); + current_path.push(field.name().to_owned()); struct_fields .iter() .for_each(|f| collect_field_conversion(current_path, all_paths, f)); @@ -447,11 +450,11 @@ fn collect_field_conversion( fn apply_stats_conversion( context: &mut serde_json::Map, path: &[String], - data_type: &SchemaDataType, + data_type: &DataType, ) { if path.len() == 1 { match data_type { - SchemaDataType::primitive(type_name) if type_name == "timestamp" => { + DataType::Primitive(PrimitiveType::Timestamp) => { let v = context.get_mut(&path[0]); if let Some(v) = v { @@ -488,7 +491,7 @@ mod tests { string_value, typed_partition_value_from_option_string( &Some("Hello World!".to_string()), - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ) .unwrap() ); @@ -498,7 +501,7 @@ mod tests { bool_value, typed_partition_value_from_option_string( &Some("true".to_string()), - &SchemaDataType::primitive("boolean".to_string()), + &DataType::Primitive(PrimitiveType::Boolean), ) .unwrap() ); @@ -508,7 +511,7 @@ mod tests { number_value, typed_partition_value_from_option_string( &Some("42".to_string()), - &SchemaDataType::primitive("integer".to_string()), + &DataType::Primitive(PrimitiveType::Integer), ) .unwrap() ); @@ -525,7 +528,7 @@ mod tests { date_value, typed_partition_value_from_option_string( &Some(s.to_string()), - &SchemaDataType::primitive("date".to_string()), + &DataType::Primitive(PrimitiveType::Date), ) .unwrap() ); @@ -543,7 +546,7 @@ mod tests { 
timestamp_value, typed_partition_value_from_option_string( &Some(s.to_string()), - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ) .unwrap() ); @@ -554,7 +557,7 @@ mod tests { binary_value, typed_partition_value_from_option_string( &Some("₁₂₃₄".to_string()), - &SchemaDataType::primitive("binary".to_string()), + &DataType::Primitive(PrimitiveType::Binary), ) .unwrap() ); @@ -566,7 +569,7 @@ mod tests { Value::Null, typed_partition_value_from_option_string( &None, - &SchemaDataType::primitive("integer".to_string()), + &DataType::Primitive(PrimitiveType::Integer), ) .unwrap() ); @@ -576,7 +579,7 @@ mod tests { Value::Null, typed_partition_value_from_option_string( &Some("".to_string()), - &SchemaDataType::primitive("integer".to_string()), + &DataType::Primitive(PrimitiveType::Integer), ) .unwrap() ); @@ -584,8 +587,8 @@ mod tests { #[test] fn collect_stats_conversions_test() { - let delta_schema: Schema = serde_json::from_value(SCHEMA.clone()).unwrap(); - let fields = delta_schema.get_fields(); + let delta_schema: StructType = serde_json::from_value(SCHEMA.clone()).unwrap(); + let fields = delta_schema.fields(); let mut paths = Vec::new(); collect_stats_conversions(&mut paths, fields.as_slice()); @@ -594,14 +597,14 @@ mod tests { assert_eq!( ( vec!["some_struct".to_string(), "struct_timestamp".to_string()], - SchemaDataType::primitive("timestamp".to_string()) + DataType::Primitive(PrimitiveType::Timestamp) ), paths[0] ); assert_eq!( ( vec!["some_timestamp".to_string()], - SchemaDataType::primitive("timestamp".to_string()) + DataType::Primitive(PrimitiveType::Timestamp) ), paths[1] ); @@ -616,22 +619,22 @@ mod tests { apply_stats_conversion( min_values, &["some_struct".to_string(), "struct_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( min_values, &["some_struct".to_string(), "struct_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); apply_stats_conversion( min_values, &["some_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( min_values, &["some_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); let max_values = stats.get_mut("maxValues").unwrap().as_object_mut().unwrap(); @@ -639,22 +642,22 @@ mod tests { apply_stats_conversion( max_values, &["some_struct".to_string(), "struct_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( max_values, &["some_struct".to_string(), "struct_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); apply_stats_conversion( max_values, &["some_string".to_string()], - &SchemaDataType::primitive("string".to_string()), + &DataType::Primitive(PrimitiveType::String), ); apply_stats_conversion( max_values, &["some_timestamp".to_string()], - &SchemaDataType::primitive("timestamp".to_string()), + &DataType::Primitive(PrimitiveType::Timestamp), ); // minValues diff --git a/crates/deltalake-core/src/protocol/mod.rs b/crates/deltalake-core/src/protocol/mod.rs index 66f06b13a1..47e24cd959 100644 --- a/crates/deltalake-core/src/protocol/mod.rs +++ b/crates/deltalake-core/src/protocol/mod.rs @@ 
-8,29 +8,27 @@ pub mod checkpoints; pub mod parquet2_read; #[cfg(feature = "parquet")] mod parquet_read; -mod serde_path; mod time_utils; #[cfg(feature = "arrow")] use arrow_schema::ArrowError; use futures::StreamExt; use lazy_static::lazy_static; -use log::*; +use log::debug; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; use regex::Regex; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::Value; use std::borrow::Borrow; -use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::hash::{Hash, Hasher}; use std::mem::take; -use std::str::FromStr; use crate::errors::DeltaResult; +use crate::kernel::{Add, CommitInfo, Metadata, Protocol, Remove}; use crate::storage::ObjectStoreRef; -use crate::table::config::IsolationLevel; +use crate::table::CheckPoint; use crate::table::DeltaTableMetaData; -use crate::{schema::*, table::CheckPoint}; /// Error returned when an invalid Delta log action is encountered. #[allow(missing_docs)] @@ -105,6 +103,12 @@ pub enum ProtocolError { #[from] source: std::io::Error, }, + + #[error("Kernel: {source}")] + Kernel { + #[from] + source: crate::kernel::Error, + }, } /// Struct used to represent minValues and maxValues in add action statistics. @@ -244,170 +248,6 @@ pub struct StatsParsed { pub null_count: HashMap, } -/// Delta AddCDCFile action that describes a parquet CDC data file. -#[derive(Serialize, Deserialize, Clone, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct AddCDCFile { - /// A relative path, from the root of the table, or an - /// absolute path to a CDC file - #[serde(with = "serde_path")] - pub path: String, - /// The size of this file in bytes - pub size: i64, - /// A map from partition column to value for this file - pub partition_values: HashMap>, - /// Should always be set to false because they do not change the underlying data of the table - pub data_change: bool, - /// Map containing metadata about this file - pub tags: Option>>, -} - -///Storage type of deletion vector -#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] -#[serde()] -pub enum StorageType { - /// Stored at relative path derived from a UUID. - #[serde(rename = "u")] - UuidRelativePath, - /// Stored as inline string. - #[serde(rename = "i")] - Inline, - /// Stored at an absolute path. - #[serde(rename = "p")] - AbsolutePath, -} - -impl Default for StorageType { - fn default() -> Self { - Self::UuidRelativePath // seems to be used by Databricks and therefore most common - } -} - -impl FromStr for StorageType { - type Err = ProtocolError; - - fn from_str(s: &str) -> Result { - match s { - "u" => Ok(Self::UuidRelativePath), - "i" => Ok(Self::Inline), - "p" => Ok(Self::AbsolutePath), - _ => Err(ProtocolError::InvalidDeletionVectorStorageType( - s.to_string(), - )), - } - } -} - -impl ToString for StorageType { - fn to_string(&self) -> String { - match self { - Self::UuidRelativePath => "u".to_string(), - Self::Inline => "i".to_string(), - Self::AbsolutePath => "p".to_string(), - } - } -} - -/// Describes deleted rows of a parquet file as part of an add or remove action -#[derive(Serialize, Deserialize, Clone, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct DeletionVector { - ///storageType of the deletion vector. 
p = Absolute Path, i = Inline, u = UUid Relative Path - pub storage_type: StorageType, - - ///If storageType = 'u' then - ///If storageType = 'i' then of the deletion vector data - ///If storageType = 'p' then - pub path_or_inline_dv: String, - - ///Start of the data for this DV in number of bytes from the beginning of the file it is stored in. Always None (absent in JSON) when storageType = 'i'. - pub offset: Option, - - ///Size of the serialized DV in bytes (raw data size, i.e. before base85 encoding, if inline). - pub size_in_bytes: i32, - - ///Number of rows the given DV logically removes from the file. - pub cardinality: i64, -} - -impl PartialEq for DeletionVector { - fn eq(&self, other: &Self) -> bool { - self.storage_type == other.storage_type - && self.path_or_inline_dv == other.path_or_inline_dv - && self.offset == other.offset - && self.size_in_bytes == other.size_in_bytes - && self.cardinality == other.cardinality - } -} - -impl Eq for DeletionVector {} - -/// Delta log action that describes a parquet data file that is part of the table. -#[derive(Serialize, Deserialize, Clone, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct Add { - /// A relative path, from the root of the table, to a file that should be added to the table - #[serde(with = "serde_path")] - pub path: String, - /// The size of this file in bytes - pub size: i64, - /// A map from partition column to value for this file - pub partition_values: HashMap>, - /// Partition values stored in raw parquet struct format. In this struct, the column names - /// correspond to the partition columns and the values are stored in their corresponding data - /// type. This is a required field when the table is partitioned and the table property - /// delta.checkpoint.writeStatsAsStruct is set to true. If the table is not partitioned, this - /// column can be omitted. - /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet")] - #[serde(skip_serializing, skip_deserializing)] - pub partition_values_parsed: Option, - /// Partition values stored in raw parquet struct format. In this struct, the column names - /// correspond to the partition columns and the values are stored in their corresponding data - /// type. This is a required field when the table is partitioned and the table property - /// delta.checkpoint.writeStatsAsStruct is set to true. If the table is not partitioned, this - /// column can be omitted. - /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet2")] - #[serde(skip_serializing, skip_deserializing)] - pub partition_values_parsed: Option, - /// The time this file was created, as milliseconds since the epoch - pub modification_time: i64, - /// When false the file must already be present in the table or the records in the added file - /// must be contained in one or more remove actions in the same version - /// - /// streaming queries that are tailing the transaction log can use this flag to skip actions - /// that would not affect the final results. - pub data_change: bool, - /// Contains statistics (e.g., count, min/max values for columns) about the data in this file - pub stats: Option, - /// Contains statistics (e.g., count, min/max values for columns) about the data in this file in - /// raw parquet format. This field needs to be written when statistics are available and the - /// table property: delta.checkpoint.writeStatsAsStruct is set to true. 
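// Illustrative aside (outside the diff): the `stats` string carried by an add
// action is plain JSON, so the "minValues"/"maxValues" objects exercised by the
// stats-conversion tests above can be inspected directly with serde_json. The
// literal below is a made-up example, not data from this repository.
use serde_json::Value;

fn main() {
    let stats = r#"{"minValues":{"value":1},"maxValues":{"value":10}}"#;
    let parsed: Value = serde_json::from_str(stats).expect("stats must be valid JSON");
    assert_eq!(parsed["minValues"]["value"].as_i64(), Some(1));
    assert_eq!(parsed["maxValues"]["value"].as_i64(), Some(10));
}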
- /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet")] - #[serde(skip_serializing, skip_deserializing)] - pub stats_parsed: Option, - /// Contains statistics (e.g., count, min/max values for columns) about the data in this file in - /// raw parquet format. This field needs to be written when statistics are available and the - /// table property: delta.checkpoint.writeStatsAsStruct is set to true. - /// - /// This field is only available in add action records read from checkpoints - #[cfg(feature = "parquet2")] - #[serde(skip_serializing, skip_deserializing)] - pub stats_parsed: Option, - /// Map containing metadata about this file - #[serde(skip_serializing_if = "Option::is_none")] - pub tags: Option>>, - - /// Metadata about deletion vector - #[serde(skip_serializing_if = "Option::is_none")] - pub deletion_vector: Option, -} - impl Hash for Add { fn hash(&self, state: &mut H) { self.path.hash(state); @@ -468,127 +308,6 @@ impl Add { } } -/// Describes the data format of files in the table. -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] -pub struct Format { - /// Name of the encoding for files in this table. - provider: String, - /// A map containing configuration options for the format. - options: HashMap>, -} - -impl Format { - /// Allows creation of a new action::Format - pub fn new(provider: String, options: Option>>) -> Self { - let options = options.unwrap_or_default(); - Self { provider, options } - } - - /// Return the Format provider - pub fn get_provider(self) -> String { - self.provider - } -} - -// Assuming this is a more appropriate default than derived Default -impl Default for Format { - fn default() -> Self { - Self { - provider: "parquet".to_string(), - options: Default::default(), - } - } -} - -/// Return a default empty schema to be used for edge-cases when a schema is missing -fn default_schema() -> String { - warn!("A `metaData` action was missing a `schemaString` and has been given an empty schema"); - r#"{"type":"struct", "fields": []}"#.into() -} - -/// Action that describes the metadata of the table. -/// This is a top-level action in Delta log entries. -#[derive(Serialize, Deserialize, Debug, Default, Clone)] -#[serde(rename_all = "camelCase")] -pub struct MetaData { - /// Unique identifier for this table - pub id: Guid, - /// User-provided identifier for this table - pub name: Option, - /// User-provided description for this table - pub description: Option, - /// Specification of the encoding for the files stored in the table - pub format: Format, - /// Schema of the table - #[serde(default = "default_schema")] - pub schema_string: String, - /// An array containing the names of columns by which the data should be partitioned - pub partition_columns: Vec, - /// The time when this metadata action is created, in milliseconds since the Unix epoch - pub created_time: Option, - /// A map containing configuration options for the table - pub configuration: HashMap>, -} - -impl MetaData { - /// Returns the table schema from the embedded schema string contained within the metadata - /// action. 
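// Illustrative aside (outside the diff): the `schemaString` embedded in a
// metaData action is itself JSON, in the same shape asserted by the checkpoint
// tests further down. A standalone sketch that pulls the field names out of such
// a string; the struct names here are made up for the example.
use serde::Deserialize;

#[derive(Deserialize)]
struct FieldJson {
    name: String,
}

#[derive(Deserialize)]
struct SchemaJson {
    fields: Vec<FieldJson>,
}

fn main() {
    let schema_string =
        r#"{"type":"struct","fields":[{"name":"value","type":"integer","nullable":true,"metadata":{}}]}"#;
    let schema: SchemaJson = serde_json::from_str(schema_string).expect("valid schema JSON");
    let names: Vec<&str> = schema.fields.iter().map(|f| f.name.as_str()).collect();
    assert_eq!(names, vec!["value"]);
}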
- pub fn get_schema(&self) -> Result { - serde_json::from_str(&self.schema_string) - } -} - -impl TryFrom for MetaData { - type Error = ProtocolError; - - fn try_from(metadata: DeltaTableMetaData) -> Result { - let schema_string = serde_json::to_string(&metadata.schema) - .map_err(|source| ProtocolError::SerializeOperation { source })?; - Ok(Self { - id: metadata.id, - name: metadata.name, - description: metadata.description, - format: metadata.format, - schema_string, - partition_columns: metadata.partition_columns, - created_time: metadata.created_time, - configuration: metadata.configuration, - }) - } -} - -/// Represents a tombstone (deleted file) in the Delta log. -/// This is a top-level action in Delta log entries. -#[derive(Serialize, Deserialize, Clone, Eq, Debug, Default)] -#[serde(rename_all = "camelCase")] -pub struct Remove { - /// The path of the file that is removed from the table. - #[serde(with = "serde_path")] - pub path: String, - /// The timestamp when the remove was added to table state. - pub deletion_timestamp: Option, - /// Whether data is changed by the remove. A table optimize will report this as false for - /// example, since it adds and removes files by combining many files into one. - pub data_change: bool, - /// When true the fields partitionValues, size, and tags are present - /// - /// NOTE: Although it's defined as required in scala delta implementation, but some writes - /// it's still nullable so we keep it as Option<> for compatibly. - pub extended_file_metadata: Option, - /// A map from partition column to value for this file. - #[serde(skip_serializing_if = "Option::is_none")] - pub partition_values: Option>>, - /// Size of this file in bytes - #[serde(skip_serializing_if = "Option::is_none")] - pub size: Option, - /// Map containing metadata about this file - #[serde(skip_serializing_if = "Option::is_none")] - pub tags: Option>>, - /// Metadata about deletion vector - #[serde(skip_serializing_if = "Option::is_none")] - pub deletion_vector: Option, -} - impl Hash for Remove { fn hash(&self, state: &mut H) { self.path.hash(state); @@ -616,296 +335,21 @@ impl PartialEq for Remove { } } -/// Action used by streaming systems to track progress using application-specific versions to -/// enable idempotency. -#[derive(Serialize, Deserialize, Debug, Default, Clone)] -#[serde(rename_all = "camelCase")] -pub struct Txn { - /// A unique identifier for the application performing the transaction. - pub app_id: String, - /// An application-specific numeric identifier for this transaction. - pub version: i64, - /// The time when this transaction action was created in milliseconds since the Unix epoch. - pub last_updated: Option, -} - -/// Action used to increase the version of the Delta protocol required to read or write to the -/// table. -#[derive(Serialize, Deserialize, Debug, Default, Clone, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -pub struct Protocol { - /// Minimum version of the Delta read protocol a client must implement to correctly read the - /// table. - pub min_reader_version: i32, - /// Minimum version of the Delta write protocol a client must implement to correctly read the - /// table. 
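// Illustrative aside (outside the diff): minReaderVersion/minWriterVersion gate
// access to the table, so a client compares the versions it implements against
// them. A standalone sketch of that check; the client numbers are made up.
fn can_access(client_reader: i32, client_writer: i32, min_reader: i32, min_writer: i32) -> (bool, bool) {
    // (may read, may write)
    (client_reader >= min_reader, client_writer >= min_writer)
}

fn main() {
    // A reader-v1 / writer-v2 client can read, but not write, a table that
    // requires reader v1 and writer v4.
    assert_eq!(can_access(1, 2, 1, 4), (true, false));
}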
- pub min_writer_version: i32, - /// Table features are missing from older versions - /// The table features this reader supports - #[serde(skip_serializing_if = "Option::is_none")] - pub reader_features: Option>, - /// Table features are missing from older versions - /// The table features this writer supports - #[serde(skip_serializing_if = "Option::is_none")] - pub writer_features: Option>, -} - -/// Features table readers can support as well as let users know -/// what is supported -#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] -pub enum ReaderFeatures { - /// Mapping of one column to another - #[serde(alias = "columnMapping")] - COLUMN_MAPPING, - /// Deletion vectors for merge, update, delete - #[serde(alias = "deletionVectors")] - DELETION_VECTORS, - /// timestamps without timezone support - #[serde(alias = "timestampNtz")] - TIMESTAMP_WITHOUT_TIMEZONE, - /// version 2 of checkpointing - #[serde(alias = "v2Checkpoint")] - V2_CHECKPOINT, - /// If we do not match any other reader features - #[serde(untagged)] - OTHER(String), -} - -#[allow(clippy::from_over_into)] -impl Into for ReaderFeatures { - fn into(self) -> usize { - match self { - ReaderFeatures::OTHER(_) => 0, - ReaderFeatures::COLUMN_MAPPING => 2, - ReaderFeatures::DELETION_VECTORS - | ReaderFeatures::TIMESTAMP_WITHOUT_TIMEZONE - | ReaderFeatures::V2_CHECKPOINT => 3, - } - } -} - -#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] -impl From<&parquet::record::Field> for ReaderFeatures { - fn from(value: &parquet::record::Field) -> Self { - match value { - parquet::record::Field::Str(feature) => match feature.as_str() { - "columnMapping" => ReaderFeatures::COLUMN_MAPPING, - "deletionVectors" => ReaderFeatures::DELETION_VECTORS, - "timestampNtz" => ReaderFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "v2Checkpoint" => ReaderFeatures::V2_CHECKPOINT, - f => ReaderFeatures::OTHER(f.to_string()), - }, - f => ReaderFeatures::OTHER(f.to_string()), - } - } -} - -impl From for ReaderFeatures { - fn from(value: String) -> Self { - match value.as_str() { - "columnMapping" => ReaderFeatures::COLUMN_MAPPING, - "deletionVectors" => ReaderFeatures::DELETION_VECTORS, - "timestampNtz" => ReaderFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "v2Checkpoint" => ReaderFeatures::V2_CHECKPOINT, - f => ReaderFeatures::OTHER(f.to_string()), - } - } -} - -/// Features table writers can support as well as let users know -/// what is supported -#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq, Hash)] -pub enum WriterFeatures { - /// Append Only Tables - #[serde(alias = "appendOnly")] - APPEND_ONLY, - /// Table invariants - #[serde(alias = "invariants")] - INVARIANTS, - /// Check constraints on columns - #[serde(alias = "checkConstraints")] - CHECK_CONSTRAINTS, - /// CDF on a table - #[serde(alias = "changeDataFeed")] - CHANGE_DATA_FEED, - /// Columns with generated values - #[serde(alias = "generatedColumns")] - GENERATED_COLUMNS, - /// Mapping of one column to another - #[serde(alias = "columnMapping")] - COLUMN_MAPPING, - /// ID Columns - #[serde(alias = "identityColumns")] - IDENTITY_COLUMNS, - /// Deletion vectors for merge, update, delete - #[serde(alias = "deletionVectors")] - DELETION_VECTORS, - /// Row tracking on tables - #[serde(alias = "rowTracking")] - ROW_TRACKING, - /// timestamps without timezone support - #[serde(alias = "timestampNtz")] - TIMESTAMP_WITHOUT_TIMEZONE, - /// domain specific metadata - #[serde(alias = "domainMetadata")] - DOMAIN_METADATA, - /// version 2 of checkpointing - #[serde(alias = 
"v2Checkpoint")] - V2_CHECKPOINT, - /// Iceberg compatability support - #[serde(alias = "icebergCompatV1")] - ICEBERG_COMPAT_V1, - /// If we do not match any other reader features - #[serde(untagged)] - OTHER(String), -} - -#[allow(clippy::from_over_into)] -impl Into for WriterFeatures { - fn into(self) -> usize { - match self { - WriterFeatures::OTHER(_) => 0, - WriterFeatures::APPEND_ONLY | WriterFeatures::INVARIANTS => 2, - WriterFeatures::CHECK_CONSTRAINTS => 3, - WriterFeatures::CHANGE_DATA_FEED | WriterFeatures::GENERATED_COLUMNS => 4, - WriterFeatures::COLUMN_MAPPING => 5, - WriterFeatures::IDENTITY_COLUMNS - | WriterFeatures::DELETION_VECTORS - | WriterFeatures::ROW_TRACKING - | WriterFeatures::TIMESTAMP_WITHOUT_TIMEZONE - | WriterFeatures::DOMAIN_METADATA - | WriterFeatures::V2_CHECKPOINT - | WriterFeatures::ICEBERG_COMPAT_V1 => 7, - } - } -} - -impl From for WriterFeatures { - fn from(value: String) -> Self { - match value.as_str() { - "appendOnly" => WriterFeatures::APPEND_ONLY, - "invariants" => WriterFeatures::INVARIANTS, - "checkConstraints" => WriterFeatures::CHECK_CONSTRAINTS, - "changeDataFeed" => WriterFeatures::CHANGE_DATA_FEED, - "generatedColumns" => WriterFeatures::GENERATED_COLUMNS, - "columnMapping" => WriterFeatures::COLUMN_MAPPING, - "identityColumns" => WriterFeatures::IDENTITY_COLUMNS, - "deletionVectors" => WriterFeatures::DELETION_VECTORS, - "rowTracking" => WriterFeatures::ROW_TRACKING, - "timestampNtz" => WriterFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "domainMetadata" => WriterFeatures::DOMAIN_METADATA, - "v2Checkpoint" => WriterFeatures::V2_CHECKPOINT, - "icebergCompatV1" => WriterFeatures::ICEBERG_COMPAT_V1, - f => WriterFeatures::OTHER(f.to_string()), - } - } -} - -#[cfg(all(not(feature = "parquet2"), feature = "parquet"))] -impl From<&parquet::record::Field> for WriterFeatures { - fn from(value: &parquet::record::Field) -> Self { - match value { - parquet::record::Field::Str(feature) => match feature.as_str() { - "appendOnly" => WriterFeatures::APPEND_ONLY, - "invariants" => WriterFeatures::INVARIANTS, - "checkConstraints" => WriterFeatures::CHECK_CONSTRAINTS, - "changeDataFeed" => WriterFeatures::CHANGE_DATA_FEED, - "generatedColumns" => WriterFeatures::GENERATED_COLUMNS, - "columnMapping" => WriterFeatures::COLUMN_MAPPING, - "identityColumns" => WriterFeatures::IDENTITY_COLUMNS, - "deletionVectors" => WriterFeatures::DELETION_VECTORS, - "rowTracking" => WriterFeatures::ROW_TRACKING, - "timestampNtz" => WriterFeatures::TIMESTAMP_WITHOUT_TIMEZONE, - "domainMetadata" => WriterFeatures::DOMAIN_METADATA, - "v2Checkpoint" => WriterFeatures::V2_CHECKPOINT, - "icebergCompatV1" => WriterFeatures::ICEBERG_COMPAT_V1, - f => WriterFeatures::OTHER(f.to_string()), - }, - f => WriterFeatures::OTHER(f.to_string()), - } - } -} - -/// The commitInfo is a fairly flexible action within the delta specification, where arbitrary data can be stored. -/// However the reference implementation as well as delta-rs store useful information that may for instance -/// allow us to be more permissive in commit conflict resolution. 
-#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)] -#[serde(rename_all = "camelCase")] -pub struct CommitInfo { - /// Timestamp in millis when the commit was created - #[serde(skip_serializing_if = "Option::is_none")] - pub timestamp: Option, - /// Id of the user invoking the commit - #[serde(skip_serializing_if = "Option::is_none")] - pub user_id: Option, - /// Name of the user invoking the commit - #[serde(skip_serializing_if = "Option::is_none")] - pub user_name: Option, - /// The operation performed during the - #[serde(skip_serializing_if = "Option::is_none")] - pub operation: Option, - /// Parameters used for table operation - #[serde(skip_serializing_if = "Option::is_none")] - pub operation_parameters: Option>, - /// Version of the table when the operation was started - #[serde(skip_serializing_if = "Option::is_none")] - pub read_version: Option, - /// The isolation level of the commit - #[serde(skip_serializing_if = "Option::is_none")] - pub isolation_level: Option, - /// TODO - #[serde(skip_serializing_if = "Option::is_none")] - pub is_blind_append: Option, - /// Delta engine which created the commit. - #[serde(skip_serializing_if = "Option::is_none")] - pub engine_info: Option, - /// Additional provenance information for the commit - #[serde(flatten, default)] - pub info: Map, -} - -/// The domain metadata action contains a configuration (string) for a named metadata domain -#[derive(Serialize, Deserialize, Debug, Clone, Default, PartialEq)] -#[serde(rename_all = "camelCase")] -pub struct DomainMetaData { - /// Identifier for this domain (system or user-provided) - pub domain: String, - /// String containing configuration for the metadata domain - pub configuration: String, - /// When `true` the action serves as a tombstone - pub removed: bool, -} - -/// Represents an action in the Delta log. The Delta log is an aggregate of all actions performed -/// on the table, so the full list of actions is required to properly read a table. -#[derive(Serialize, Deserialize, Debug, Clone)] -pub enum Action { - /// Changes the current metadata of the table. Must be present in the first version of a table. - /// Subsequent `metaData` actions completely overwrite previous metadata. - metaData(MetaData), - /// Adds CDC a file to the table state. - cdc(AddCDCFile), - /// Adds a file to the table state. - add(Add), - /// Removes a file from the table state. - remove(Remove), - /// Used by streaming systems to track progress externally with application specific version - /// identifiers. - txn(Txn), - /// Describes the minimum reader and writer versions required to read or write to the table. - protocol(Protocol), - /// Describes commit provenance information for the table. 
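// Illustrative aside (outside the diff): commitInfo is deliberately open-ended,
// and the `#[serde(flatten)]` map above is what captures engine-specific keys
// that are not modeled as named fields. A standalone miniature of that pattern;
// the struct name and JSON literal are made up for the example.
use serde::Deserialize;
use serde_json::{Map, Value};

#[derive(Deserialize)]
#[serde(rename_all = "camelCase")]
struct MiniCommitInfo {
    operation: Option<String>,
    #[serde(flatten)]
    info: Map<String, Value>,
}

fn main() {
    let raw = r#"{"operation":"WRITE","engineInfo":"example-engine","customKey":42}"#;
    let ci: MiniCommitInfo = serde_json::from_str(raw).unwrap();
    assert_eq!(ci.operation.as_deref(), Some("WRITE"));
    // Keys without a dedicated field end up in the flattened map.
    assert_eq!(ci.info["customKey"], 42);
    assert!(ci.info.contains_key("engineInfo"));
}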
- commitInfo(CommitInfo), - /// Describe s the configuration for a named metadata domain - domainMetadata(DomainMetaData), -} +impl TryFrom for Metadata { + type Error = ProtocolError; -impl Action { - /// Create a commit info from a map - pub fn commit_info(info: Map) -> Self { - Self::commitInfo(CommitInfo { - info, - ..Default::default() + fn try_from(metadata: DeltaTableMetaData) -> Result { + let schema_string = serde_json::to_string(&metadata.schema) + .map_err(|source| ProtocolError::SerializeOperation { source })?; + Ok(Self { + id: metadata.id, + name: metadata.name, + description: metadata.description, + format: metadata.format, + schema_string, + partition_columns: metadata.partition_columns, + created_time: metadata.created_time, + configuration: metadata.configuration, }) } } @@ -1232,6 +676,7 @@ pub(crate) async fn find_latest_check_point_for_version( #[cfg(test)] mod tests { use super::*; + use crate::kernel::Action; #[test] fn test_load_table_stats() { @@ -1245,7 +690,17 @@ mod tests { }) .to_string(), ), - ..Default::default() + path: Default::default(), + data_change: true, + deletion_vector: None, + partition_values: Default::default(), + partition_values_parsed: None, + stats_parsed: None, + tags: None, + size: 0, + modification_time: 0, + base_row_id: None, + default_row_commit_version: None, }; let stats = action.get_stats().unwrap().unwrap(); @@ -1310,7 +765,17 @@ mod tests { }) .to_string(), ), - ..Default::default() + path: Default::default(), + data_change: true, + deletion_vector: None, + partition_values: Default::default(), + partition_values_parsed: None, + stats_parsed: None, + tags: None, + size: 0, + modification_time: 0, + base_row_id: None, + default_row_commit_version: None, }; let stats = action.get_stats().unwrap().unwrap(); diff --git a/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs b/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs index 474a61a153..e68971be42 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/boolean.rs @@ -4,7 +4,7 @@ use parquet2::page::DataPage; use super::validity::ValidityRowIndexIter; use super::{split_page, ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; /// Parquet dictionary primitive value reader pub struct SomeBooleanValueIter<'a> { diff --git a/crates/deltalake-core/src/protocol/parquet2_read/map.rs b/crates/deltalake-core/src/protocol/parquet2_read/map.rs index 0739feae2d..df4dc94ab7 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/map.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/map.rs @@ -3,7 +3,7 @@ use parquet2::page::{DataPage, DictPage}; use super::string::for_each_repeated_string_field_value_with_idx; use super::{ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; #[derive(Default)] pub struct MapState { diff --git a/crates/deltalake-core/src/protocol/parquet2_read/mod.rs b/crates/deltalake-core/src/protocol/parquet2_read/mod.rs index ae5461d2b6..3314559e43 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/mod.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/mod.rs @@ -10,9 +10,10 @@ use parquet2::read::decompress; use parquet2::read::get_page_iterator; use parquet2::read::levels::get_bit_width; -use super::{ProtocolError, ReaderFeatures, WriterFeatures}; -use crate::protocol::{Action, Add, CommitInfo, MetaData, Protocol, Remove, Txn}; -use crate::schema::Guid; +use super::ProtocolError; +use 
crate::kernel::{ + Action, Add, CommitInfo, Metadata, Protocol, ReaderFeatures, Remove, Txn, WriterFeatures, +}; use boolean::for_each_boolean_field_value; use map::for_each_map_field_value; use primitive::for_each_primitive_field_value; @@ -138,12 +139,12 @@ impl ActionVariant for Add { type Variant = Add; fn default_action() -> Action { - Action::add(Self::default()) + Action::Add(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::add(v) => Ok(v), + Action::Add(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect Add action, got: {:?}", a @@ -156,7 +157,7 @@ impl ActionVariant for Remove { type Variant = Remove; fn default_action() -> Action { - Action::remove(Self { + Action::Remove(Self { data_change: true, extended_file_metadata: Some(false), ..Default::default() @@ -165,7 +166,7 @@ impl ActionVariant for Remove { fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::remove(v) => Ok(v), + Action::Remove(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect remove action, got: {:?}", a @@ -174,16 +175,16 @@ impl ActionVariant for Remove { } } -impl ActionVariant for MetaData { - type Variant = MetaData; +impl ActionVariant for Metadata { + type Variant = Metadata; fn default_action() -> Action { - Action::metaData(Self::default()) + Action::Metadata(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::metaData(v) => Ok(v), + Action::Metadata(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect metadata action, got: {:?}", a @@ -196,12 +197,12 @@ impl ActionVariant for Txn { type Variant = Txn; fn default_action() -> Action { - Action::txn(Self::default()) + Action::Txn(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::txn(v) => Ok(v), + Action::Txn(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect txn action, got: {:?}", a @@ -214,12 +215,12 @@ impl ActionVariant for Protocol { type Variant = Protocol; fn default_action() -> Action { - Action::protocol(Self::default()) + Action::Protocol(Self::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::protocol(v) => Ok(v), + Action::Protocol(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect protocol action, got: {:?}", a @@ -232,12 +233,12 @@ impl ActionVariant for CommitInfo { type Variant = CommitInfo; fn default_action() -> Action { - Action::commitInfo(CommitInfo::default()) + Action::CommitInfo(CommitInfo::default()) } fn try_mut_from_action(a: &mut Action) -> Result<&mut Self, ParseError> { match a { - Action::commitInfo(v) => Ok(v), + Action::CommitInfo(v) => Ok(v), _ => Err(ParseError::Generic(format!( "expect commitInfo action, got: {:?}", a @@ -485,7 +486,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: Guid| action.id = v, + |action: &mut Metadata, v: String| action.id = v, )?; } "name" => { @@ -494,7 +495,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.name = Some(v), + |action: &mut Metadata, v: String| action.name = Some(v), )?; } "description" => { @@ -503,7 +504,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.description = Some(v), + |action: &mut Metadata, v: String| action.description = Some(v), )?; } "format" => { @@ -515,7 +516,7 @@ fn 
deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.format.provider = v, + |action: &mut Metadata, v: String| action.format.provider = v, )?; } "options" => { @@ -526,7 +527,7 @@ fn deserialize_metadata_column_page( dict, descriptor, &mut state.metadata_fromat_options, - |action: &mut MetaData, v: (Vec, Vec>)| { + |action: &mut Metadata, v: (Vec, Vec>)| { action.format.options = hashmap_from_kvpairs(v.0, v.1); }, )?; @@ -545,7 +546,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: String| action.schema_string = v, + |action: &mut Metadata, v: String| action.schema_string = v, )?; } "partitionColumns" => { @@ -554,7 +555,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: Vec| action.partition_columns = v, + |action: &mut Metadata, v: Vec| action.partition_columns = v, )?; } "createdTime" => { @@ -563,7 +564,7 @@ fn deserialize_metadata_column_page( page, dict, descriptor, - |action: &mut MetaData, v: i64| action.created_time = Some(v), + |action: &mut Metadata, v: i64| action.created_time = Some(v), )?; } "configuration" => { @@ -574,7 +575,7 @@ fn deserialize_metadata_column_page( dict, descriptor, &mut state.metadata_configuration, - |action: &mut MetaData, v: (Vec, Vec>)| { + |action: &mut Metadata, v: (Vec, Vec>)| { action.configuration = hashmap_from_kvpairs(v.0, v.1); }, )?; @@ -762,20 +763,20 @@ mod tests { for row_group in meta_data.row_groups { let actions = actions_from_row_group(row_group, &mut reader).unwrap(); match &actions[0] { - Action::protocol(protocol) => { + Action::Protocol(protocol) => { assert_eq!(protocol.min_reader_version, 1,); assert_eq!(protocol.min_writer_version, 2,); } _ => panic!("expect protocol action"), } match &actions[1] { - Action::metaData(meta_data) => { + Action::Metadata(meta_data) => { assert_eq!(meta_data.id, "22ef18ba-191c-4c36-a606-3dad5cdf3830"); assert_eq!(meta_data.name, None); assert_eq!(meta_data.description, None); assert_eq!( meta_data.format, - crate::protocol::Format::new("parquet".to_string(), None), + crate::kernel::Format::new("parquet".to_string(), None), ); assert_eq!(meta_data.schema_string, "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}"); assert_eq!(meta_data.partition_columns.len(), 0); @@ -786,7 +787,7 @@ mod tests { } match &actions[2] { - Action::txn(txn) => { + Action::Txn(txn) => { assert_eq!(txn.app_id, "e4a20b59-dd0e-4c50-b074-e8ae4786df30"); assert_eq!(txn.version, 0); assert_eq!(txn.last_updated, Some(1564524299648)); @@ -794,7 +795,7 @@ mod tests { _ => panic!("expect txn action, got: {:?}", &actions[1]), } match &actions[3] { - Action::remove(remove) => { + Action::Remove(remove) => { assert_eq!( remove.path, "part-00000-512e1537-8aaa-4193-b8b4-bef3de0de409-c000.snappy.parquet" @@ -809,7 +810,7 @@ mod tests { _ => panic!("expect remove action, got: {:?}", &actions[2]), } match &actions[9] { - Action::add(add_action) => { + Action::Add(add_action) => { assert_eq!( add_action.path, "part-00001-c373a5bd-85f0-4758-815e-7eb62007a15c-c000.snappy.parquet" @@ -837,20 +838,20 @@ mod tests { for row_group in metadata.row_groups { let actions = actions_from_row_group(row_group, &mut reader).unwrap(); match &actions[0] { - Action::protocol(protocol) => { + Action::Protocol(protocol) => { assert_eq!(protocol.min_reader_version, 1,); assert_eq!(protocol.min_writer_version, 2,); } _ => panic!("expect protocol action"), } match 
&actions[1] { - Action::metaData(meta_data) => { + Action::Metadata(meta_data) => { assert_eq!(meta_data.id, "94ba8468-c676-4468-b326-adde3ab9dcd2"); assert_eq!(meta_data.name, None); assert_eq!(meta_data.description, None); assert_eq!( meta_data.format, - crate::protocol::Format::new("parquet".to_string(), None), + crate::kernel::Format::new("parquet".to_string(), None), ); assert_eq!( meta_data.schema_string, @@ -864,7 +865,7 @@ mod tests { } match &actions[2] { - Action::add(add_action) => { + Action::Add(add_action) => { assert_eq!(add_action.path, "f62d8868-d952-4f9d-8bb2-fd4e011ebf36"); assert_eq!(add_action.size, 100); assert_eq!(add_action.modification_time, 1661662807080); @@ -880,7 +881,7 @@ mod tests { _ => panic!("expect add action, got: {:?}", &actions[9]), } match &actions[3] { - Action::add(add_action) => { + Action::Add(add_action) => { assert_eq!(add_action.path, "8ac7d8e1-daab-48ef-9d05-ec22fb4b0d2f"); assert_eq!(add_action.size, 100); assert_eq!(add_action.modification_time, 1661662807097); diff --git a/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs b/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs index 29147ea8ca..16cb850f05 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/primitive.rs @@ -12,7 +12,7 @@ use parquet2::types::NativeType; use super::dictionary; use super::validity::ValidityRowIndexIter; use super::{split_page, ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; struct ExactChunksIter<'a, T: NativeType> { chunks: std::slice::ChunksExact<'a, u8>, diff --git a/crates/deltalake-core/src/protocol/parquet2_read/string.rs b/crates/deltalake-core/src/protocol/parquet2_read/string.rs index fc0ec574e0..391a9b9390 100644 --- a/crates/deltalake-core/src/protocol/parquet2_read/string.rs +++ b/crates/deltalake-core/src/protocol/parquet2_read/string.rs @@ -9,7 +9,7 @@ use super::dictionary; use super::dictionary::binary::BinaryPageDict; use super::validity::{ValidityRepeatedRowIndexIter, ValidityRowIndexIter}; use super::{split_page, split_page_nested, ActionVariant, ParseError}; -use crate::protocol::Action; +use crate::kernel::Action; pub trait StringValueIter<'a>: Iterator> { fn try_from_encoded_values( diff --git a/crates/deltalake-core/src/protocol/parquet_read/mod.rs b/crates/deltalake-core/src/protocol/parquet_read/mod.rs index d5e3e708b6..e89c73d4bd 100644 --- a/crates/deltalake-core/src/protocol/parquet_read/mod.rs +++ b/crates/deltalake-core/src/protocol/parquet_read/mod.rs @@ -6,12 +6,10 @@ use num_traits::cast::ToPrimitive; use parquet::record::{Field, ListAccessor, MapAccessor, RowAccessor}; use serde_json::json; -use crate::protocol::{ - Action, Add, AddCDCFile, ColumnCountStat, ColumnValueStat, DeletionVector, MetaData, Protocol, - ProtocolError, Remove, Stats, Txn, +use crate::kernel::{ + Action, Add, AddCDCFile, DeletionVectorDescriptor, Metadata, Protocol, Remove, StorageType, Txn, }; - -use super::StorageType; +use crate::protocol::{ColumnCountStat, ColumnValueStat, ProtocolError, Stats}; fn populate_hashmap_with_option_from_parquet_map( map: &mut HashMap>, @@ -46,10 +44,14 @@ impl AddCDCFile { } } -impl DeletionVector { +impl DeletionVectorDescriptor { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + cardinality: -1, + offset: None, + path_or_inline_dv: "".to_string(), + size_in_bytes: -1, + storage_type: StorageType::default(), }; for (i, (name, 
_)) in record.get_column_iter().enumerate() { match name.as_str() { @@ -99,7 +101,18 @@ impl DeletionVector { impl Add { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + path: "".to_string(), + size: -1, + modification_time: -1, + data_change: true, + partition_values_parsed: None, + partition_values: HashMap::new(), + stats: None, + stats_parsed: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + tags: None, }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -182,7 +195,8 @@ impl Add { }, "deletionVector" => match record.get_group(i) { Ok(row) => { - re.deletion_vector = Some(DeletionVector::from_parquet_record(row)?); + re.deletion_vector = + Some(DeletionVectorDescriptor::from_parquet_record(row)?); } _ => { re.deletion_vector = None; @@ -364,10 +378,17 @@ fn convert_date_to_string(value: i32) -> Result { Ok(format!("{}", dt.format("%Y-%m-%d"))) } -impl MetaData { +impl Metadata { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + id: "".to_string(), + name: None, + description: None, + partition_columns: vec![], + schema_string: "".to_string(), + created_time: None, + configuration: HashMap::new(), + format: Default::default(), }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -480,7 +501,14 @@ impl Remove { let mut re = Self { data_change: true, extended_file_metadata: Some(false), - ..Default::default() + deletion_timestamp: None, + deletion_vector: None, + partition_values: None, + path: "".to_string(), + size: None, + tags: None, + base_row_id: None, + default_row_commit_version: None, }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -595,7 +623,10 @@ impl Txn { impl Protocol { fn from_parquet_record(record: &parquet::record::Row) -> Result { let mut re = Self { - ..Default::default() + min_reader_version: -1, + min_writer_version: -1, + reader_features: None, + writer_features: None, }; for (i, (name, _)) in record.get_column_iter().enumerate() { @@ -673,12 +704,12 @@ impl Action { let field = &fields[col_idx]; Ok(match field.get_basic_info().name() { - "add" => Action::add(Add::from_parquet_record(col_data)?), - "metaData" => Action::metaData(MetaData::from_parquet_record(col_data)?), - "remove" => Action::remove(Remove::from_parquet_record(col_data)?), - "txn" => Action::txn(Txn::from_parquet_record(col_data)?), - "protocol" => Action::protocol(Protocol::from_parquet_record(col_data)?), - "cdc" => Action::cdc(AddCDCFile::from_parquet_record(col_data)?), + "add" => Action::Add(Add::from_parquet_record(col_data)?), + "metaData" => Action::Metadata(Metadata::from_parquet_record(col_data)?), + "remove" => Action::Remove(Remove::from_parquet_record(col_data)?), + "txn" => Action::Txn(Txn::from_parquet_record(col_data)?), + "protocol" => Action::Protocol(Protocol::from_parquet_record(col_data)?), + "cdc" => Action::Cdc(AddCDCFile::from_parquet_record(col_data)?), name => { return Err(ProtocolError::InvalidField(format!( "Unexpected action from checkpoint: {name}", diff --git a/crates/deltalake-core/src/schema/arrow_convert.rs b/crates/deltalake-core/src/schema/arrow_convert.rs index 2b37b05c4a..d292362604 100644 --- a/crates/deltalake-core/src/schema/arrow_convert.rs +++ b/crates/deltalake-core/src/schema/arrow_convert.rs @@ -1,44 +1,41 @@ -//! 
Conversion between Delta Table schema and Arrow schema +use std::sync::Arc; -use crate::schema; -use arrow::datatypes::{ - DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, +use arrow_schema::{ + ArrowError, DataType as ArrowDataType, Field as ArrowField, FieldRef as ArrowFieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, }; -use arrow::error::ArrowError; use lazy_static::lazy_static; -use regex::Regex; -use std::convert::TryFrom; -use std::sync::Arc; -impl TryFrom<&schema::Schema> for ArrowSchema { +use super::super::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; + +impl TryFrom<&StructType> for ArrowSchema { type Error = ArrowError; - fn try_from(s: &schema::Schema) -> Result { + fn try_from(s: &StructType) -> Result { let fields = s - .get_fields() + .fields() .iter() - .map(>::try_from) + .map(>::try_from) .collect::, ArrowError>>()?; Ok(ArrowSchema::new(fields)) } } -impl TryFrom<&schema::SchemaField> for ArrowField { +impl TryFrom<&StructField> for ArrowField { type Error = ArrowError; - fn try_from(f: &schema::SchemaField) -> Result { + fn try_from(f: &StructField) -> Result { let metadata = f - .get_metadata() + .metadata() .iter() .map(|(key, val)| Ok((key.clone(), serde_json::to_string(val)?))) .collect::>() .map_err(|err| ArrowError::JsonError(err.to_string()))?; let field = ArrowField::new( - f.get_name(), - ArrowDataType::try_from(f.get_type())?, + f.name(), + ArrowDataType::try_from(f.data_type())?, f.is_nullable(), ) .with_metadata(metadata); @@ -47,119 +44,113 @@ impl TryFrom<&schema::SchemaField> for ArrowField { } } -impl TryFrom<&schema::SchemaTypeArray> for ArrowField { +impl TryFrom<&ArrayType> for ArrowField { type Error = ArrowError; - fn try_from(a: &schema::SchemaTypeArray) -> Result { + fn try_from(a: &ArrayType) -> Result { Ok(ArrowField::new( "item", - ArrowDataType::try_from(a.get_element_type())?, + ArrowDataType::try_from(a.element_type())?, a.contains_null(), )) } } -impl TryFrom<&schema::SchemaTypeMap> for ArrowField { +impl TryFrom<&MapType> for ArrowField { type Error = ArrowError; - fn try_from(a: &schema::SchemaTypeMap) -> Result { - Ok(ArrowField::new_map( - "map", + fn try_from(a: &MapType) -> Result { + Ok(ArrowField::new( "entries", - ArrowField::new("key", ArrowDataType::try_from(a.get_key_type())?, false), - ArrowField::new( - "value", - ArrowDataType::try_from(a.get_value_type())?, - a.get_value_contains_null(), + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::try_from(a.key_type())?, false), + ArrowField::new( + "value", + ArrowDataType::try_from(a.value_type())?, + a.value_contains_null(), + ), + ] + .into(), ), - false, - false, + false, // always non-null )) } } -impl TryFrom<&schema::SchemaDataType> for ArrowDataType { +impl TryFrom<&DataType> for ArrowDataType { type Error = ArrowError; - fn try_from(t: &schema::SchemaDataType) -> Result { + fn try_from(t: &DataType) -> Result { match t { - schema::SchemaDataType::primitive(p) => { - lazy_static! 
{ - static ref DECIMAL_REGEX: Regex = - Regex::new(r"\((\d{1,2}),(\d{1,2})\)").unwrap(); - } - match p.as_str() { - "string" => Ok(ArrowDataType::Utf8), - "long" => Ok(ArrowDataType::Int64), // undocumented type - "integer" => Ok(ArrowDataType::Int32), - "short" => Ok(ArrowDataType::Int16), - "byte" => Ok(ArrowDataType::Int8), - "float" => Ok(ArrowDataType::Float32), - "double" => Ok(ArrowDataType::Float64), - "boolean" => Ok(ArrowDataType::Boolean), - "binary" => Ok(ArrowDataType::Binary), - decimal if DECIMAL_REGEX.is_match(decimal) => { - let extract = DECIMAL_REGEX.captures(decimal).ok_or_else(|| { + DataType::Primitive(p) => { + match p { + PrimitiveType::String => Ok(ArrowDataType::Utf8), + PrimitiveType::Long => Ok(ArrowDataType::Int64), // undocumented type + PrimitiveType::Integer => Ok(ArrowDataType::Int32), + PrimitiveType::Short => Ok(ArrowDataType::Int16), + PrimitiveType::Byte => Ok(ArrowDataType::Int8), + PrimitiveType::Float => Ok(ArrowDataType::Float32), + PrimitiveType::Double => Ok(ArrowDataType::Float64), + PrimitiveType::Boolean => Ok(ArrowDataType::Boolean), + PrimitiveType::Binary => Ok(ArrowDataType::Binary), + PrimitiveType::Decimal(precision, scale) => { + let precision = u8::try_from(*precision).map_err(|_| { ArrowError::SchemaError(format!( - "Invalid decimal type for Arrow: {decimal}" + "Invalid precision for decimal: {}", + precision )) })?; - let precision = extract.get(1).and_then(|v| v.as_str().parse::().ok()); - let scale = extract.get(2).and_then(|v| v.as_str().parse::().ok()); - match (precision, scale) { - // TODO how do we decide which variant (128 / 256) to use? - (Some(p), Some(s)) => Ok(ArrowDataType::Decimal128(p, s)), - _ => Err(ArrowError::SchemaError(format!( - "Invalid precision or scale decimal type for Arrow: {decimal}" - ))), + let scale = i8::try_from(*scale).map_err(|_| { + ArrowError::SchemaError(format!("Invalid scale for decimal: {}", scale)) + })?; + + if precision <= 38 { + Ok(ArrowDataType::Decimal128(precision, scale)) + } else if precision <= 76 { + Ok(ArrowDataType::Decimal256(precision, scale)) + } else { + Err(ArrowError::SchemaError(format!( + "Precision too large to be represented in Arrow: {}", + precision + ))) } } - "date" => { + PrimitiveType::Date => { // A calendar date, represented as a year-month-day triple without a - // timezone. Stored as 4 bytes integer representing days sinece 1970-01-01 + // timezone. Stored as 4 bytes integer representing days since 1970-01-01 Ok(ArrowDataType::Date32) } - "timestamp" => { + PrimitiveType::Timestamp => { // Issue: https://github.com/delta-io/delta/issues/643 Ok(ArrowDataType::Timestamp(TimeUnit::Microsecond, None)) } - s => Err(ArrowError::SchemaError(format!( - "Invalid data type for Arrow: {s}" - ))), } } - schema::SchemaDataType::r#struct(s) => Ok(ArrowDataType::Struct( - s.get_fields() + DataType::Struct(s) => Ok(ArrowDataType::Struct( + s.fields() .iter() - .map(>::try_from) + .map(>::try_from) .collect::, ArrowError>>()? 
.into(), )), - schema::SchemaDataType::array(a) => { - Ok(ArrowDataType::List(Arc::new(>::try_from( - a - )?))) - } - schema::SchemaDataType::map(m) => Ok(ArrowDataType::Map( + DataType::Array(a) => Ok(ArrowDataType::List(Arc::new(>::try_from(a)?))), + DataType::Map(m) => Ok(ArrowDataType::Map( Arc::new(ArrowField::new( "entries", ArrowDataType::Struct( vec![ ArrowField::new( "keys", - >::try_from( - m.get_key_type(), - )?, + >::try_from(m.key_type())?, false, ), ArrowField::new( "values", - >::try_from( - m.get_value_type(), - )?, - m.get_value_contains_null(), + >::try_from(m.value_type())?, + m.value_contains_null(), ), ] .into(), @@ -172,19 +163,20 @@ impl TryFrom<&schema::SchemaDataType> for ArrowDataType { } } -impl TryFrom<&ArrowSchema> for schema::Schema { +impl TryFrom<&ArrowSchema> for StructType { type Error = ArrowError; + fn try_from(arrow_schema: &ArrowSchema) -> Result { - let new_fields: Result, _> = arrow_schema + let new_fields: Result, _> = arrow_schema .fields() .iter() .map(|field| field.as_ref().try_into()) .collect(); - Ok(schema::Schema::new(new_fields?)) + Ok(StructType::new(new_fields?)) } } -impl TryFrom for schema::Schema { +impl TryFrom for StructType { type Error = ArrowError; fn try_from(arrow_schema: ArrowSchemaRef) -> Result { @@ -192,99 +184,86 @@ impl TryFrom for schema::Schema { } } -impl TryFrom<&ArrowField> for schema::SchemaField { +impl TryFrom<&ArrowField> for StructField { type Error = ArrowError; + fn try_from(arrow_field: &ArrowField) -> Result { - Ok(schema::SchemaField::new( + Ok(StructField::new( arrow_field.name().clone(), arrow_field.data_type().try_into()?, arrow_field.is_nullable(), - arrow_field - .metadata() - .iter() - .map(|(k, v)| (k.clone(), serde_json::Value::String(v.clone()))) - .collect(), - )) + ) + .with_metadata(arrow_field.metadata().iter().map(|(k, v)| (k.clone(), v)))) } } -impl TryFrom<&ArrowDataType> for schema::SchemaDataType { +impl TryFrom<&ArrowDataType> for DataType { type Error = ArrowError; + fn try_from(arrow_datatype: &ArrowDataType) -> Result { match arrow_datatype { - ArrowDataType::Utf8 => Ok(schema::SchemaDataType::primitive("string".to_string())), - ArrowDataType::LargeUtf8 => Ok(schema::SchemaDataType::primitive("string".to_string())), - ArrowDataType::Int64 => Ok(schema::SchemaDataType::primitive("long".to_string())), // undocumented type - ArrowDataType::Int32 => Ok(schema::SchemaDataType::primitive("integer".to_string())), - ArrowDataType::Int16 => Ok(schema::SchemaDataType::primitive("short".to_string())), - ArrowDataType::Int8 => Ok(schema::SchemaDataType::primitive("byte".to_string())), - ArrowDataType::UInt64 => Ok(schema::SchemaDataType::primitive("long".to_string())), // undocumented type - ArrowDataType::UInt32 => Ok(schema::SchemaDataType::primitive("integer".to_string())), - ArrowDataType::UInt16 => Ok(schema::SchemaDataType::primitive("short".to_string())), - ArrowDataType::UInt8 => Ok(schema::SchemaDataType::primitive("byte".to_string())), - ArrowDataType::Float32 => Ok(schema::SchemaDataType::primitive("float".to_string())), - ArrowDataType::Float64 => Ok(schema::SchemaDataType::primitive("double".to_string())), - ArrowDataType::Boolean => Ok(schema::SchemaDataType::primitive("boolean".to_string())), - ArrowDataType::Binary => Ok(schema::SchemaDataType::primitive("binary".to_string())), - ArrowDataType::FixedSizeBinary(_) => { - Ok(schema::SchemaDataType::primitive("binary".to_string())) - } - ArrowDataType::LargeBinary => { - Ok(schema::SchemaDataType::primitive("binary".to_string())) - } - 
ArrowDataType::Decimal128(p, s) => Ok(schema::SchemaDataType::primitive(format!( - "decimal({p},{s})" + ArrowDataType::Utf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::LargeUtf8 => Ok(DataType::Primitive(PrimitiveType::String)), + ArrowDataType::Int64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::Int32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::Int16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::Int8 => Ok(DataType::Primitive(PrimitiveType::Byte)), + ArrowDataType::UInt64 => Ok(DataType::Primitive(PrimitiveType::Long)), // undocumented type + ArrowDataType::UInt32 => Ok(DataType::Primitive(PrimitiveType::Integer)), + ArrowDataType::UInt16 => Ok(DataType::Primitive(PrimitiveType::Short)), + ArrowDataType::UInt8 => Ok(DataType::Primitive(PrimitiveType::Boolean)), + ArrowDataType::Float32 => Ok(DataType::Primitive(PrimitiveType::Float)), + ArrowDataType::Float64 => Ok(DataType::Primitive(PrimitiveType::Double)), + ArrowDataType::Boolean => Ok(DataType::Primitive(PrimitiveType::Boolean)), + ArrowDataType::Binary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::FixedSizeBinary(_) => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::LargeBinary => Ok(DataType::Primitive(PrimitiveType::Binary)), + ArrowDataType::Decimal128(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, ))), - ArrowDataType::Decimal256(p, s) => Ok(schema::SchemaDataType::primitive(format!( - "decimal({p},{s})" + ArrowDataType::Decimal256(p, s) => Ok(DataType::Primitive(PrimitiveType::Decimal( + *p as i32, *s as i32, ))), - ArrowDataType::Date32 => Ok(schema::SchemaDataType::primitive("date".to_string())), - ArrowDataType::Date64 => Ok(schema::SchemaDataType::primitive("date".to_string())), + ArrowDataType::Date32 => Ok(DataType::Primitive(PrimitiveType::Date)), + ArrowDataType::Date64 => Ok(DataType::Primitive(PrimitiveType::Date)), ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => { - Ok(schema::SchemaDataType::primitive("timestamp".to_string())) + Ok(DataType::Primitive(PrimitiveType::Timestamp)) } ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) if tz.eq_ignore_ascii_case("utc") => { - Ok(schema::SchemaDataType::primitive("timestamp".to_string())) + Ok(DataType::Primitive(PrimitiveType::Timestamp)) } ArrowDataType::Struct(fields) => { - let converted_fields: Result, _> = fields + let converted_fields: Result, _> = fields .iter() .map(|field| field.as_ref().try_into()) .collect(); - Ok(schema::SchemaDataType::r#struct( - schema::SchemaTypeStruct::new(converted_fields?), - )) - } - ArrowDataType::List(field) => { - Ok(schema::SchemaDataType::array(schema::SchemaTypeArray::new( - Box::new((*field).data_type().try_into()?), - (*field).is_nullable(), - ))) - } - ArrowDataType::LargeList(field) => { - Ok(schema::SchemaDataType::array(schema::SchemaTypeArray::new( - Box::new((*field).data_type().try_into()?), - (*field).is_nullable(), - ))) - } - ArrowDataType::FixedSizeList(field, _) => { - Ok(schema::SchemaDataType::array(schema::SchemaTypeArray::new( - Box::new((*field).data_type().try_into()?), - (*field).is_nullable(), - ))) + Ok(DataType::Struct(Box::new(StructType::new( + converted_fields?, + )))) } + ArrowDataType::List(field) => Ok(DataType::Array(Box::new(ArrayType::new( + (*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::LargeList(field) => Ok(DataType::Array(Box::new(ArrayType::new( + 
(*field).data_type().try_into()?, + (*field).is_nullable(), + )))), + ArrowDataType::FixedSizeList(field, _) => Ok(DataType::Array(Box::new( + ArrayType::new((*field).data_type().try_into()?, (*field).is_nullable()), + ))), ArrowDataType::Map(field, _) => { if let ArrowDataType::Struct(struct_fields) = field.data_type() { let key_type = struct_fields[0].data_type().try_into()?; let value_type = struct_fields[1].data_type().try_into()?; let value_type_nullable = struct_fields[1].is_nullable(); - Ok(schema::SchemaDataType::map(schema::SchemaTypeMap::new( - Box::new(key_type), - Box::new(value_type), + Ok(DataType::Map(Box::new(MapType::new( + key_type, + value_type, value_type_nullable, - ))) + )))) } else { panic!("DataType::Map should contain a struct field child"); } @@ -635,7 +614,6 @@ fn null_count_schema_for_fields(dest: &mut Vec, f: &ArrowField) { #[cfg(test)] mod tests { use arrow::array::ArrayData; - use arrow::datatypes::DataType; use arrow_array::Array; use arrow_array::{make_array, ArrayRef, MapArray, StringArray, StructArray}; use arrow_buffer::{Buffer, ToByteSlice}; @@ -790,33 +768,18 @@ mod tests { fn test_arrow_from_delta_decimal_type() { let precision = 20; let scale = 2; - let decimal_type = format!["decimal({precision},{scale})"]; - let decimal_field = crate::SchemaDataType::primitive(decimal_type); + let decimal_field = DataType::Primitive(PrimitiveType::Decimal(precision, scale)); assert_eq!( - >::try_from(&decimal_field).unwrap(), - ArrowDataType::Decimal128(precision, scale) + >::try_from(&decimal_field).unwrap(), + ArrowDataType::Decimal128(precision as u8, scale as i8) ); } - #[test] - fn test_arrow_from_delta_wrong_decimal_type() { - let precision = 20; - let scale = "wrong"; - let decimal_type = format!["decimal({precision},{scale})"]; - let _error = format!("Invalid precision or scale decimal type for Arrow: {scale}"); - let decimal_field = crate::SchemaDataType::primitive(decimal_type); - assert!(matches!( - >::try_from(&decimal_field) - .unwrap_err(), - arrow::error::ArrowError::SchemaError(_error), - )); - } - #[test] fn test_arrow_from_delta_timestamp_type() { - let timestamp_field = crate::SchemaDataType::primitive("timestamp".to_string()); + let timestamp_field = DataType::Primitive(PrimitiveType::Timestamp); assert_eq!( - >::try_from(×tamp_field).unwrap(), + >::try_from(×tamp_field).unwrap(), ArrowDataType::Timestamp(TimeUnit::Microsecond, None) ); } @@ -825,8 +788,8 @@ mod tests { fn test_delta_from_arrow_timestamp_type() { let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, None); assert_eq!( - >::try_from(×tamp_field).unwrap(), - crate::SchemaDataType::primitive("timestamp".to_string()) + >::try_from(×tamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) ); } @@ -835,8 +798,8 @@ mod tests { let timestamp_field = ArrowDataType::Timestamp(TimeUnit::Microsecond, Some("UTC".to_string().into())); assert_eq!( - >::try_from(×tamp_field).unwrap(), - crate::SchemaDataType::primitive("timestamp".to_string()) + >::try_from(×tamp_field).unwrap(), + DataType::Primitive(PrimitiveType::Timestamp) ); } @@ -856,15 +819,15 @@ mod tests { )), false, ); - let converted_map: crate::SchemaDataType = (&arrow_map).try_into().unwrap(); + let converted_map: DataType = (&arrow_map).try_into().unwrap(); assert_eq!( converted_map, - crate::SchemaDataType::map(crate::SchemaTypeMap::new( - Box::new(crate::SchemaDataType::primitive("byte".to_string())), - Box::new(crate::SchemaDataType::primitive("binary".to_string())), + 
DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::Byte), + DataType::Primitive(PrimitiveType::Binary), true, - )) + ))) ); } @@ -891,7 +854,7 @@ mod tests { let entry_offsets_buffer = Buffer::from(entry_offsets.to_byte_slice()); let keys_data = StringArray::from_iter_values(keys); - let keys_field = Arc::new(Field::new("keys", DataType::Utf8, false)); + let keys_field = Arc::new(Field::new("keys", ArrowDataType::Utf8, false)); let values_field = Arc::new(Field::new( "values", values.data_type().clone(), @@ -903,7 +866,7 @@ mod tests { (values_field, make_array(values.to_data())), ]); - let map_data_type = DataType::Map( + let map_data_type = ArrowDataType::Map( Arc::new(Field::new( "entries", entry_struct.data_type().clone(), @@ -928,19 +891,19 @@ mod tests { ) .expect("Could not create a map array"); - let schema = >::try_from( - &crate::Schema::new(vec![crate::SchemaField::new( - "example".to_string(), - crate::SchemaDataType::map(crate::SchemaTypeMap::new( - Box::new(crate::SchemaDataType::primitive("string".to_string())), - Box::new(crate::SchemaDataType::primitive("binary".to_string())), + let schema = + >::try_from(&StructType::new(vec![ + StructField::new( + "example".to_string(), + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::String), + DataType::Primitive(PrimitiveType::Binary), + false, + ))), false, - )), - false, - HashMap::new(), - )]), - ) - .expect("Could not get schema"); + ), + ])) + .expect("Could not get schema"); let record_batch = arrow::record_batch::RecordBatch::try_new(Arc::new(schema), vec![Arc::new(map_array)]) @@ -1081,6 +1044,6 @@ mod tests { ), true, )); - let _converted: schema::SchemaField = field.as_ref().try_into().unwrap(); + let _converted: StructField = field.as_ref().try_into().unwrap(); } } diff --git a/crates/deltalake-core/src/schema/mod.rs b/crates/deltalake-core/src/schema/mod.rs index a853725fc6..54d73ec664 100644 --- a/crates/deltalake-core/src/schema/mod.rs +++ b/crates/deltalake-core/src/schema/mod.rs @@ -1,379 +1,2 @@ //! Delta Table schema implementation. -#![allow(non_snake_case, non_camel_case_types)] - -use serde::{Deserialize, Serialize}; -use serde_json::Value; -use std::borrow::Cow; -use std::collections::HashMap; - -use crate::errors::DeltaTableError; - -#[cfg(all(feature = "arrow", feature = "parquet"))] -pub mod arrow_convert; pub mod partitions; - -/// Type alias for a string expected to match a GUID/UUID format -pub type Guid = String; - -static STRUCT_TAG: &str = "struct"; -static ARRAY_TAG: &str = "array"; -static MAP_TAG: &str = "map"; - -/// An invariant for a column that is enforced on all writes to a Delta table. -#[derive(Eq, PartialEq, Debug, Default, Clone)] -pub struct Invariant { - /// The full path to the field. - pub field_name: String, - /// The SQL string that must always evaluate to true. - pub invariant_sql: String, -} - -impl Invariant { - /// Create a new invariant - pub fn new(field_name: &str, invariant_sql: &str) -> Self { - Self { - field_name: field_name.to_string(), - invariant_sql: invariant_sql.to_string(), - } - } -} - -/// Represents a struct field defined in the Delta table schema. 
-// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#Schema-Serialization-Format -#[derive(Serialize, Deserialize, PartialEq, Debug, Default, Clone)] -pub struct SchemaTypeStruct { - r#type: Cow<'static, str>, - fields: Vec, -} - -impl SchemaTypeStruct { - /// Create a new Schema using a vector of SchemaFields - pub fn new(fields: Vec) -> Self { - let tag = Cow::Borrowed(STRUCT_TAG); - Self { - r#type: tag, - fields, - } - } - - /// Returns the list of fields contained within the column struct. - pub fn get_fields(&self) -> &Vec { - &self.fields - } - - /// Returns an immutable reference of a specific `Field` instance selected by name. - pub fn get_field_with_name(&self, name: &str) -> Result<&SchemaField, DeltaTableError> { - Ok(&self.fields[self.index_of(name)?]) - } - - /// Find the index of the column with the given name. - pub fn index_of(&self, name: &str) -> Result { - for i in 0..self.fields.len() { - if self.fields[i].get_name() == name { - return Ok(i); - } - } - let valid_fields: Vec = self.fields.iter().map(|f| f.name.clone()).collect(); - Err(DeltaTableError::Generic(format!( - "Unable to get field named \"{name}\". Valid fields: {valid_fields:?}" - ))) - } - - /// Get all invariants in the schemas - pub fn get_invariants(&self) -> Result, DeltaTableError> { - let mut remaining_fields: Vec<(String, SchemaField)> = self - .get_fields() - .iter() - .map(|field| (field.name.clone(), field.clone())) - .collect(); - let mut invariants: Vec = Vec::new(); - - let add_segment = |prefix: &str, segment: &str| -> String { - if prefix.is_empty() { - segment.to_owned() - } else { - format!("{prefix}.{segment}") - } - }; - - while let Some((field_path, field)) = remaining_fields.pop() { - match field.r#type { - SchemaDataType::r#struct(inner) => { - remaining_fields.extend( - inner - .get_fields() - .iter() - .map(|field| { - let new_prefix = add_segment(&field_path, &field.name); - (new_prefix, field.clone()) - }) - .collect::>(), - ); - } - SchemaDataType::array(inner) => { - let element_field_name = add_segment(&field_path, "element"); - remaining_fields.push(( - element_field_name, - SchemaField::new("".to_string(), *inner.elementType, false, HashMap::new()), - )); - } - SchemaDataType::map(inner) => { - let key_field_name = add_segment(&field_path, "key"); - remaining_fields.push(( - key_field_name, - SchemaField::new("".to_string(), *inner.keyType, false, HashMap::new()), - )); - let value_field_name = add_segment(&field_path, "value"); - remaining_fields.push(( - value_field_name, - SchemaField::new("".to_string(), *inner.valueType, false, HashMap::new()), - )); - } - _ => {} - } - // JSON format: {"expression": {"expression": ""} } - if let Some(Value::String(invariant_json)) = field.metadata.get("delta.invariants") { - let json: Value = serde_json::from_str(invariant_json).map_err(|e| { - DeltaTableError::InvalidInvariantJson { - json_err: e, - line: invariant_json.to_string(), - } - })?; - if let Value::Object(json) = json { - if let Some(Value::Object(expr1)) = json.get("expression") { - if let Some(Value::String(sql)) = expr1.get("expression") { - invariants.push(Invariant::new(&field_path, sql)); - } - } - } - } - } - Ok(invariants) - } -} - -/// Describes a specific field of the Delta table schema. 
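The get_invariants walk removed here reads invariant SQL out of the delta.invariants entry in field metadata, where the expression is nested twice under "expression". A condensed sketch of that extraction for a single metadata value (the helper name is illustrative, not part of this change):

use serde_json::Value;

/// Pull the invariant SQL out of a `delta.invariants` metadata value.
fn invariant_sql(metadata_value: &str) -> Option<String> {
    let json: Value = serde_json::from_str(metadata_value).ok()?;
    json.get("expression")?
        .get("expression")?
        .as_str()
        .map(str::to_owned)
}

// invariant_sql(r#"{"expression": {"expression": "x > 2"}}"#) yields Some("x > 2".to_owned()),
// matching the JSON shape used in the tests further down in this file.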
-#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -pub struct SchemaField { - // Name of this (possibly nested) column - name: String, - r#type: SchemaDataType, - // Boolean denoting whether this field can be null - nullable: bool, - // A JSON map containing information about this column. Keys prefixed with Delta are reserved - // for the implementation. - metadata: HashMap, -} - -impl SchemaField { - /// Create a new SchemaField from scratch - pub fn new( - name: String, - r#type: SchemaDataType, - nullable: bool, - metadata: HashMap, - ) -> Self { - Self { - name, - r#type, - nullable, - metadata, - } - } - - /// The column name of the schema field. - pub fn get_name(&self) -> &str { - &self.name - } - - /// The data type of the schema field. SchemaDataType defines the possible values. - pub fn get_type(&self) -> &SchemaDataType { - &self.r#type - } - - /// Whether the column/field is nullable. - pub fn is_nullable(&self) -> bool { - self.nullable - } - - /// Additional metadata about the column/field. - pub fn get_metadata(&self) -> &HashMap { - &self.metadata - } -} - -/// Schema definition for array type fields. -#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -pub struct SchemaTypeArray { - // type field is always the string "array", so we are ignoring it here - r#type: String, - // The type of element stored in this array represented as a string containing the name of a - // primitive type, a struct definition, an array definition or a map definition - elementType: Box, - // Boolean denoting whether this array can contain one or more null values - containsNull: bool, -} - -impl SchemaTypeArray { - /// Create a new SchemaTypeArray - pub fn new(elementType: Box, containsNull: bool) -> Self { - Self { - r#type: String::from(ARRAY_TAG), - elementType, - containsNull, - } - } - - /// The data type of each element contained in the array. - pub fn get_element_type(&self) -> &SchemaDataType { - &self.elementType - } - - /// Whether the column/field is allowed to contain null elements. - pub fn contains_null(&self) -> bool { - self.containsNull - } -} - -/// Schema definition for map type fields. -#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -pub struct SchemaTypeMap { - r#type: String, - keyType: Box, - valueType: Box, - valueContainsNull: bool, -} - -impl SchemaTypeMap { - /// Create a new SchemaTypeMap - pub fn new( - keyType: Box, - valueType: Box, - valueContainsNull: bool, - ) -> Self { - Self { - r#type: String::from(MAP_TAG), - keyType, - valueType, - valueContainsNull, - } - } - - /// The type of element used for the key of this map, represented as a string containing the - /// name of a primitive type, a struct definition, an array definition or a map definition - pub fn get_key_type(&self) -> &SchemaDataType { - &self.keyType - } - - /// The type of element contained in the value of this map, represented as a string containing the - /// name of a primitive type, a struct definition, an array definition or a map definition - pub fn get_value_type(&self) -> &SchemaDataType { - &self.valueType - } - - /// Whether the value field is allowed to contain null elements. - pub fn get_value_contains_null(&self) -> bool { - self.valueContainsNull - } -} - -/// Enum with variants for each top level schema data type. -#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)] -#[serde(untagged)] -pub enum SchemaDataType { - /// Variant representing non-array, non-map, non-struct fields. 
Wrapped value will contain the - /// the string name of the primitive type. - /// - /// Valid values are: - /// * string: utf8 - /// * long // undocumented, i64? - /// * integer: i32 - /// * short: i16 - /// * byte: i8 - /// * float: f32 - /// * double: f64 - /// * boolean: bool - /// * binary: a sequence of binary data - /// * date: A calendar date, represented as a year-month-day triple without a timezone - /// * timestamp: Microsecond precision timestamp without a timezone - /// * decimal: Signed decimal number with fixed precision (maximum number of digits) and scale (number of digits on right side of dot), where the precision and scale can be up to 38 - primitive(String), - /// Variant representing a struct. - r#struct(SchemaTypeStruct), - /// Variant representing an array. - array(SchemaTypeArray), - /// Variant representing a map. - map(SchemaTypeMap), -} - -/// Represents the schema of the delta table. -pub type Schema = SchemaTypeStruct; - -#[cfg(test)] -mod tests { - use super::*; - use serde_json::json; - - #[test] - fn test_get_invariants() { - let schema: Schema = serde_json::from_value(json!({ - "type": "struct", - "fields": [{"name": "x", "type": "string", "nullable": true, "metadata": {}}] - })) - .unwrap(); - let invariants = schema.get_invariants().unwrap(); - assert_eq!(invariants.len(), 0); - - let schema: Schema = serde_json::from_value(json!({ - "type": "struct", - "fields": [ - {"name": "x", "type": "integer", "nullable": true, "metadata": { - "delta.invariants": "{\"expression\": { \"expression\": \"x > 2\"} }" - }}, - {"name": "y", "type": "integer", "nullable": true, "metadata": { - "delta.invariants": "{\"expression\": { \"expression\": \"y < 4\"} }" - }} - ] - })) - .unwrap(); - let invariants = schema.get_invariants().unwrap(); - assert_eq!(invariants.len(), 2); - assert!(invariants.contains(&Invariant::new("x", "x > 2"))); - assert!(invariants.contains(&Invariant::new("y", "y < 4"))); - - let schema: Schema = serde_json::from_value(json!({ - "type": "struct", - "fields": [{ - "name": "a_map", - "type": { - "type": "map", - "keyType": "string", - "valueType": { - "type": "array", - "elementType": { - "type": "struct", - "fields": [{ - "name": "d", - "type": "integer", - "metadata": { - "delta.invariants": "{\"expression\": { \"expression\": \"a_map.value.element.d < 4\"} }" - }, - "nullable": false - }] - }, - "containsNull": false - }, - "valueContainsNull": false - }, - "nullable": false, - "metadata": {} - }] - })).unwrap(); - let invariants = schema.get_invariants().unwrap(); - assert_eq!(invariants.len(), 1); - assert_eq!( - invariants[0], - Invariant::new("a_map.value.element.d", "a_map.value.element.d < 4") - ); - } -} diff --git a/crates/deltalake-core/src/schema/partitions.rs b/crates/deltalake-core/src/schema/partitions.rs index 3750038b3a..c2db1903fa 100644 --- a/crates/deltalake-core/src/schema/partitions.rs +++ b/crates/deltalake-core/src/schema/partitions.rs @@ -2,8 +2,8 @@ use std::convert::TryFrom; -use super::SchemaDataType; use crate::errors::DeltaTableError; +use crate::kernel::{DataType, PrimitiveType}; use std::cmp::Ordering; use std::collections::HashMap; @@ -40,18 +40,21 @@ pub struct PartitionFilter { fn compare_typed_value( partition_value: &str, filter_value: &str, - data_type: &SchemaDataType, + data_type: &DataType, ) -> Option { match data_type { - SchemaDataType::primitive(primitive_type) => match primitive_type.as_str() { - "long" | "integer" | "short" | "byte" => match filter_value.parse::() { + 
DataType::Primitive(primitive_type) => match primitive_type { + PrimitiveType::Long + | PrimitiveType::Integer + | PrimitiveType::Short + | PrimitiveType::Byte => match filter_value.parse::() { Ok(parsed_filter_value) => { let parsed_partition_value = partition_value.parse::().unwrap(); parsed_partition_value.partial_cmp(&parsed_filter_value) } _ => None, }, - "float" | "double" => match filter_value.parse::() { + PrimitiveType::Float | PrimitiveType::Double => match filter_value.parse::() { Ok(parsed_filter_value) => { let parsed_partition_value = partition_value.parse::().unwrap(); parsed_partition_value.partial_cmp(&parsed_filter_value) @@ -67,11 +70,7 @@ fn compare_typed_value( /// Partition filters methods for filtering the DeltaTable partitions. impl PartitionFilter { /// Indicates if a DeltaTable partition matches with the partition filter by key and value. - pub fn match_partition( - &self, - partition: &DeltaTablePartition, - data_type: &SchemaDataType, - ) -> bool { + pub fn match_partition(&self, partition: &DeltaTablePartition, data_type: &DataType) -> bool { if self.key != partition.key { return false; } @@ -109,12 +108,9 @@ impl PartitionFilter { pub fn match_partitions( &self, partitions: &[DeltaTablePartition], - partition_col_data_types: &HashMap<&str, &SchemaDataType>, + partition_col_data_types: &HashMap<&String, &DataType>, ) -> bool { - let data_type = partition_col_data_types - .get(self.key.as_str()) - .unwrap() - .to_owned(); + let data_type = partition_col_data_types.get(&self.key).unwrap().to_owned(); partitions .iter() .any(|partition| self.match_partition(partition, data_type)) diff --git a/crates/deltalake-core/src/storage/utils.rs b/crates/deltalake-core/src/storage/utils.rs index 80710efd9b..7e516c7217 100644 --- a/crates/deltalake-core/src/storage/utils.rs +++ b/crates/deltalake-core/src/storage/utils.rs @@ -9,7 +9,7 @@ use object_store::path::Path; use object_store::{DynObjectStore, ObjectMeta, Result as ObjectStoreResult}; use crate::errors::{DeltaResult, DeltaTableError}; -use crate::protocol::Add; +use crate::kernel::Add; use crate::table::builder::DeltaTableBuilder; /// Copies the contents from the `from` location into the `to` location @@ -109,7 +109,15 @@ mod tests { .to_string(), size: 123, modification_time: 123456789, - ..Default::default() + data_change: true, + stats: None, + partition_values: Default::default(), + tags: Default::default(), + base_row_id: None, + default_row_commit_version: None, + deletion_vector: None, + partition_values_parsed: None, + stats_parsed: None, }; let meta: ObjectMeta = (&add).try_into().unwrap(); diff --git a/crates/deltalake-core/src/table/config.rs b/crates/deltalake-core/src/table/config.rs index 60498767ab..3fa021ce6e 100644 --- a/crates/deltalake-core/src/table/config.rs +++ b/crates/deltalake-core/src/table/config.rs @@ -387,12 +387,12 @@ fn parse_int(value: &str) -> Result { #[cfg(test)] mod tests { use super::*; + use crate::kernel::StructType; use crate::table::DeltaTableMetaData; - use crate::Schema; use std::collections::HashMap; fn dummy_metadata() -> DeltaTableMetaData { - let schema = Schema::new(Vec::new()); + let schema = StructType::new(Vec::new()); DeltaTableMetaData::new(None, None, None, schema, Vec::new(), HashMap::new()) } diff --git a/crates/deltalake-core/src/table/mod.rs b/crates/deltalake-core/src/table/mod.rs index 0a1e3116f1..2b011ff608 100644 --- a/crates/deltalake-core/src/table/mod.rs +++ b/crates/deltalake-core/src/table/mod.rs @@ -22,13 +22,14 @@ use uuid::Uuid; use 
self::builder::DeltaTableConfig; use self::state::DeltaTableState; use crate::errors::DeltaTableError; +use crate::kernel::{ + Action, Add, CommitInfo, DataType, Format, Metadata, ReaderFeatures, Remove, StructType, + WriterFeatures, +}; use crate::partitions::PartitionFilter; use crate::protocol::{ - self, find_latest_check_point_for_version, get_last_checkpoint, Action, ReaderFeatures, - WriterFeatures, + find_latest_check_point_for_version, get_last_checkpoint, ProtocolError, Stats, }; -use crate::protocol::{Add, ProtocolError, Stats}; -use crate::schema::*; use crate::storage::{commit_uri_from_version, ObjectStoreRef}; pub mod builder; @@ -133,16 +134,17 @@ impl Eq for CheckPoint {} /// Delta table metadata #[derive(Serialize, Deserialize, Clone, Debug, PartialEq)] pub struct DeltaTableMetaData { + // TODO make this a UUID? /// Unique identifier for this table - pub id: Guid, + pub id: String, /// User-provided identifier for this table pub name: Option, /// User-provided description for this table pub description: Option, /// Specification of the encoding for the files stored in the table - pub format: protocol::Format, + pub format: Format, /// Schema of the table - pub schema: Schema, + pub schema: StructType, /// An array containing the names of columns by which the data should be partitioned pub partition_columns: Vec, /// The time when this metadata action is created, in milliseconds since the Unix epoch @@ -156,8 +158,8 @@ impl DeltaTableMetaData { pub fn new( name: Option, description: Option, - format: Option, - schema: Schema, + format: Option, + schema: StructType, partition_columns: Vec, configuration: HashMap>, ) -> Self { @@ -181,19 +183,19 @@ impl DeltaTableMetaData { } /// Return partition fields along with their data type from the current schema. - pub fn get_partition_col_data_types(&self) -> Vec<(&str, &SchemaDataType)> { + pub fn get_partition_col_data_types(&self) -> Vec<(&String, &DataType)> { // JSON add actions contain a `partitionValues` field which is a map. // When loading `partitionValues_parsed` we have to convert the stringified partition values back to the correct data type. self.schema - .get_fields() + .fields() .iter() .filter_map(|f| { if self .partition_columns .iter() - .any(|s| s.as_str() == f.get_name()) + .any(|s| s.as_str() == f.name()) { - Some((f.get_name(), f.get_type())) + Some((f.name(), f.data_type())) } else { None } @@ -212,16 +214,16 @@ impl fmt::Display for DeltaTableMetaData { } } -impl TryFrom for DeltaTableMetaData { +impl TryFrom for DeltaTableMetaData { type Error = ProtocolError; - fn try_from(action_metadata: protocol::MetaData) -> Result { - let schema = action_metadata.get_schema()?; + fn try_from(action_metadata: Metadata) -> Result { + let schema = action_metadata.schema()?; Ok(Self { id: action_metadata.id, name: action_metadata.name, description: action_metadata.description, - format: action_metadata.format, + format: Format::default(), schema, partition_columns: action_metadata.partition_columns, created_time: action_metadata.created_time, @@ -667,7 +669,7 @@ impl DeltaTable { pub async fn history( &mut self, limit: Option, - ) -> Result, DeltaTableError> { + ) -> Result, DeltaTableError> { let mut version = match limit { Some(l) => max(self.version() - l as i64 + 1, 0), None => self.get_earliest_delta_log_version().await?, @@ -800,7 +802,7 @@ impl DeltaTable { } /// Returns a vector of active tombstones (i.e. `Remove` actions present in the current delta log). 
- pub fn get_tombstones(&self) -> impl Iterator { + pub fn get_tombstones(&self) -> impl Iterator { self.state.unexpired_tombstones() } @@ -833,13 +835,13 @@ impl DeltaTable { /// Return table schema parsed from transaction log. Return None if table hasn't been loaded or /// no metadata was found in the log. - pub fn schema(&self) -> Option<&Schema> { + pub fn schema(&self) -> Option<&StructType> { self.state.schema() } /// Return table schema parsed from transaction log. Return `DeltaTableError` if table hasn't /// been loaded or no metadata was found in the log. - pub fn get_schema(&self) -> Result<&Schema, DeltaTableError> { + pub fn get_schema(&self) -> Result<&StructType, DeltaTableError> { self.schema().ok_or(DeltaTableError::NoSchema) } @@ -923,13 +925,14 @@ impl std::fmt::Debug for DeltaTable { #[cfg(test)] mod tests { + use pretty_assertions::assert_eq; + use tempdir::TempDir; + use super::*; + use crate::kernel::{DataType, PrimitiveType, StructField}; use crate::operations::create::CreateBuilder; #[cfg(any(feature = "s3", feature = "s3-native-tls"))] use crate::table::builder::DeltaTableBuilder; - use pretty_assertions::assert_eq; - use std::collections::HashMap; - use tempdir::TempDir; #[tokio::test] async fn table_round_trip() { @@ -966,17 +969,15 @@ mod tests { .with_table_name("Test Table Create") .with_comment("This table is made to test the create function for a DeltaTable") .with_columns(vec![ - SchemaField::new( + StructField::new( "Id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "Name".to_string(), - SchemaDataType::primitive("string".to_string()), + DataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]) .await diff --git a/crates/deltalake-core/src/table/state.rs b/crates/deltalake-core/src/table/state.rs index af4ff6369d..26becd0703 100644 --- a/crates/deltalake-core/src/table/state.rs +++ b/crates/deltalake-core/src/table/state.rs @@ -12,13 +12,15 @@ use serde::{Deserialize, Serialize}; use super::config::TableConfig; use crate::errors::DeltaTableError; +use crate::kernel::{ + Action, Add, CommitInfo, DataType, DomainMetadata, ReaderFeatures, Remove, StructType, + WriterFeatures, +}; use crate::partitions::{DeltaTablePartition, PartitionFilter}; -use crate::protocol::{self, Action, Add, ProtocolError, ReaderFeatures, WriterFeatures}; -use crate::schema::SchemaDataType; +use crate::protocol::ProtocolError; use crate::storage::commit_uri_from_version; use crate::table::DeltaTableMetaData; use crate::DeltaTable; -use crate::Schema; #[cfg(any(feature = "parquet", feature = "parquet2"))] use super::{CheckPoint, DeltaTableConfig}; @@ -31,13 +33,13 @@ pub struct DeltaTableState { version: i64, // A remove action should remain in the state of the table as a tombstone until it has expired. 
// A tombstone expires when the creation timestamp of the delta file exceeds the expiration - tombstones: HashSet, + tombstones: HashSet, // active files for table state - files: Vec, + files: Vec, // Information added to individual commits - commit_infos: Vec, + commit_infos: Vec, // Domain metadatas provided by the system or user - domain_metadatas: Vec, + domain_metadatas: Vec, app_transaction_version: HashMap, min_reader_version: i32, min_writer_version: i32, @@ -78,7 +80,7 @@ impl DeltaTableState { let mut new_state = DeltaTableState::with_version(version); for line in reader.lines() { - let action: protocol::Action = serde_json::from_str(line?.as_str())?; + let action: Action = serde_json::from_str(line?.as_str())?; new_state.process_action( action, table.config.require_tombstones, @@ -112,13 +114,13 @@ impl DeltaTableState { let preader = SerializedFileReader::new(data)?; let schema = preader.metadata().file_metadata().schema(); if !schema.is_group() { - return Err(DeltaTableError::from(protocol::ProtocolError::Generic( + return Err(DeltaTableError::from(ProtocolError::Generic( "Action record in checkpoint should be a struct".to_string(), ))); } for record in preader.get_row_iter(None)? { self.process_action( - protocol::Action::from_parquet_record(schema, &record.unwrap())?, + Action::from_parquet_record(schema, &record.unwrap())?, table_config.require_tombstones, table_config.require_files, )?; @@ -134,8 +136,8 @@ impl DeltaTableState { let metadata = read_metadata(&mut reader)?; for row_group in metadata.row_groups { - for action in actions_from_row_group(row_group, &mut reader) - .map_err(protocol::ProtocolError::from)? + for action in + actions_from_row_group(row_group, &mut reader).map_err(ProtocolError::from)? { self.process_action( action, @@ -167,7 +169,7 @@ impl DeltaTableState { } /// List of commit info maps. - pub fn commit_infos(&self) -> &Vec { + pub fn commit_infos(&self) -> &Vec { &self.commit_infos } @@ -187,13 +189,13 @@ impl DeltaTableState { } /// Full list of tombstones (remove actions) representing files removed from table state). - pub fn all_tombstones(&self) -> &HashSet { + pub fn all_tombstones(&self) -> &HashSet { &self.tombstones } /// List of unexpired tombstones (remove actions) representing files removed from table state. /// The retention period is set by `deletedFileRetentionDuration` with default value of 1 week. - pub fn unexpired_tombstones(&self) -> impl Iterator { + pub fn unexpired_tombstones(&self) -> impl Iterator { let retention_timestamp = Utc::now().timestamp_millis() - self.tombstone_retention_millis; self.tombstones .iter() @@ -202,7 +204,7 @@ impl DeltaTableState { /// Full list of add actions representing all parquet files that are part of the current /// delta table state. - pub fn files(&self) -> &Vec { + pub fn files(&self) -> &Vec { self.files.as_ref() } @@ -247,7 +249,7 @@ impl DeltaTableState { } /// The table schema - pub fn schema(&self) -> Option<&Schema> { + pub fn schema(&self) -> Option<&StructType> { self.current_metadata.as_ref().map(|m| &m.schema) } @@ -339,30 +341,30 @@ impl DeltaTableState { /// Process given action by updating current state. 
fn process_action( &mut self, - action: protocol::Action, + action: Action, require_tombstones: bool, require_files: bool, ) -> Result<(), ProtocolError> { match action { // TODO: optionally load CDC into TableState - protocol::Action::cdc(_v) => {} - protocol::Action::add(v) => { + Action::Cdc(_v) => {} + Action::Add(v) => { if require_files { self.files.push(v); } } - protocol::Action::remove(v) => { + Action::Remove(v) => { if require_tombstones && require_files { self.tombstones.insert(v); } } - protocol::Action::protocol(v) => { + Action::Protocol(v) => { self.min_reader_version = v.min_reader_version; self.min_writer_version = v.min_writer_version; self.reader_features = v.reader_features; self.writer_features = v.writer_features; } - protocol::Action::metaData(v) => { + Action::Metadata(v) => { let md = DeltaTableMetaData::try_from(v)?; let table_config = TableConfig(&md.configuration); self.tombstone_retention_millis = @@ -372,16 +374,16 @@ impl DeltaTableState { self.enable_expired_log_cleanup = table_config.enable_expired_log_cleanup(); self.current_metadata = Some(md); } - protocol::Action::txn(v) => { + Action::Txn(v) => { *self .app_transaction_version .entry(v.app_id) .or_insert(v.version) = v.version; } - protocol::Action::commitInfo(v) => { + Action::CommitInfo(v) => { self.commit_infos.push(v); } - protocol::Action::domainMetadata(v) => { + Action::DomainMetadata(v) => { self.domain_metadatas.push(v); } } @@ -408,7 +410,7 @@ impl DeltaTableState { }); } - let partition_col_data_types: HashMap<&str, &SchemaDataType> = current_metadata + let partition_col_data_types: HashMap<&String, &DataType> = current_metadata .get_partition_col_data_types() .into_iter() .collect(); @@ -430,6 +432,7 @@ impl DeltaTableState { #[cfg(test)] mod tests { use super::*; + use crate::kernel::Txn; use pretty_assertions::assert_eq; #[test] @@ -478,7 +481,7 @@ mod tests { enable_expired_log_cleanup: true, }; - let txn_action = protocol::Action::txn(protocol::Txn { + let txn_action = Action::Txn(Txn { app_id: "abc".to_string(), version: 2, last_updated: Some(0), diff --git a/crates/deltalake-core/src/table/state_arrow.rs b/crates/deltalake-core/src/table/state_arrow.rs index 34f858f415..9d82c87326 100644 --- a/crates/deltalake-core/src/table/state_arrow.rs +++ b/crates/deltalake-core/src/table/state_arrow.rs @@ -18,9 +18,8 @@ use itertools::Itertools; use super::state::DeltaTableState; use crate::errors::DeltaTableError; +use crate::kernel::{DataType as DeltaDataType, StructType}; use crate::protocol::{ColumnCountStat, ColumnValueStat, Stats}; -use crate::SchemaDataType; -use crate::SchemaTypeStruct; impl DeltaTableState { /// Get an [arrow::record_batch::RecordBatch] containing add action data. @@ -152,8 +151,8 @@ impl DeltaTableState { .iter() .map( |name| -> Result { - let field = metadata.schema.get_field_with_name(name)?; - Ok(field.get_type().try_into()?) + let field = metadata.schema.field_with_name(name)?; + Ok(field.data_type().try_into()?) 
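The match arms above show the pattern this change applies throughout: the old lowercase protocol::Action constructors become kernel::Action enum variants with the same payloads. Two hedged helpers mirroring usages that appear elsewhere in this diff (function names are illustrative):

use deltalake_core::kernel::{Action, Add, Remove};

/// Wrap flushed add actions in the new variant, as flush_and_commit does below.
fn adds_to_actions(adds: Vec<Add>) -> Vec<Action> {
    adds.into_iter().map(Action::Add).collect()
}

/// Recover the remove payloads from a commit, as the checkpoint tests in this change do.
fn removes_only(actions: &[Action]) -> Vec<Remove> {
    actions
        .iter()
        .filter_map(|a| match a {
            Action::Remove(r) => Some(r.clone()),
            _ => None,
        })
        .collect()
}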
}, ) .collect::>()?; @@ -299,7 +298,7 @@ impl DeltaTableState { for add in self.files() { if let Some(value) = &add.deletion_vector { - storage_type.append_value(value.storage_type.to_string()); + storage_type.append_value(&value.storage_type); path_or_inline_div.append_value(value.path_or_inline_dv.clone()); if let Some(ofs) = value.offset { offset.append_value(ofs); @@ -415,7 +414,7 @@ impl DeltaTableState { }; let mut columnar_stats: Vec = SchemaLeafIterator::new(schema) - .filter(|(_path, datatype)| !matches!(datatype, SchemaDataType::r#struct(_))) + .filter(|(_path, datatype)| !matches!(datatype, DeltaDataType::Struct(_))) .map(|(path, datatype)| -> Result { let null_count = stats .iter() @@ -432,7 +431,7 @@ impl DeltaTableState { let arrow_type: arrow::datatypes::DataType = datatype.try_into()?; // Min and max are collected for primitive values, not list or maps - let min_values = if matches!(datatype, SchemaDataType::primitive(_)) { + let min_values = if matches!(datatype, DeltaDataType::Primitive(_)) { let min_values = stats .iter() .flat_map(|maybe_stat| { @@ -449,7 +448,7 @@ impl DeltaTableState { None }; - let max_values = if matches!(datatype, SchemaDataType::primitive(_)) { + let max_values = if matches!(datatype, DeltaDataType::Primitive(_)) { let max_values = stats .iter() .flat_map(|maybe_stat| { @@ -636,33 +635,33 @@ fn resolve_column_count_stat( } struct SchemaLeafIterator<'a> { - fields_remaining: VecDeque<(Vec<&'a str>, &'a SchemaDataType)>, + fields_remaining: VecDeque<(Vec<&'a str>, &'a DeltaDataType)>, } impl<'a> SchemaLeafIterator<'a> { - fn new(schema: &'a SchemaTypeStruct) -> Self { + fn new(schema: &'a StructType) -> Self { SchemaLeafIterator { fields_remaining: schema - .get_fields() + .fields() .iter() - .map(|field| (vec![field.get_name()], field.get_type())) + .map(|field| (vec![field.name().as_ref()], field.data_type())) .collect(), } } } impl<'a> std::iter::Iterator for SchemaLeafIterator<'a> { - type Item = (Vec<&'a str>, &'a SchemaDataType); + type Item = (Vec<&'a str>, &'a DeltaDataType); fn next(&mut self) -> Option { if let Some((path, datatype)) = self.fields_remaining.pop_front() { - if let SchemaDataType::r#struct(struct_type) = datatype { + if let DeltaDataType::Struct(struct_type) = datatype { // push child fields to front - for field in struct_type.get_fields() { + for field in struct_type.fields() { let mut new_path = path.clone(); - new_path.push(field.get_name()); + new_path.push(field.name()); self.fields_remaining - .push_front((new_path, field.get_type())); + .push_front((new_path, field.data_type())); } }; diff --git a/crates/deltalake-core/src/writer/json.rs b/crates/deltalake-core/src/writer/json.rs index f8d6d1a9e3..044ffc20e2 100644 --- a/crates/deltalake-core/src/writer/json.rs +++ b/crates/deltalake-core/src/writer/json.rs @@ -23,9 +23,10 @@ use super::utils::{ }; use super::{utils::PartitionPath, DeltaWriter, DeltaWriterError}; use crate::errors::DeltaTableError; +use crate::kernel::{Add, StructType}; use crate::table::builder::DeltaTableBuilder; use crate::table::DeltaTableMetaData; -use crate::{protocol::Add, DeltaTable, Schema}; +use crate::DeltaTable; use crate::{storage::DeltaObjectStore, writer::utils::ShareableBuffer}; type BadValue = (Value, ParquetError); @@ -33,7 +34,7 @@ type BadValue = (Value, ParquetError); /// Writes messages to a delta lake table. 
pub struct JsonWriter { storage: Arc, - arrow_schema_ref: Arc, + arrow_schema_ref: Arc, writer_properties: WriterProperties, partition_columns: Vec, arrow_writers: HashMap, @@ -206,7 +207,7 @@ impl JsonWriter { pub fn for_table(table: &DeltaTable) -> Result { // Initialize an arrow schema ref from the delta table schema let metadata = table.get_metadata()?; - let arrow_schema = >::try_from(&metadata.schema)?; + let arrow_schema = >::try_from(&metadata.schema)?; let arrow_schema_ref = Arc::new(arrow_schema); let partition_columns = metadata.partition_columns.clone(); @@ -232,7 +233,8 @@ impl JsonWriter { &mut self, metadata: &DeltaTableMetaData, ) -> Result { - let schema: ArrowSchema = >::try_from(&metadata.schema)?; + let schema: ArrowSchema = + >::try_from(&metadata.schema)?; let schema_updated = self.arrow_schema_ref.as_ref() != &schema || self.partition_columns != metadata.partition_columns; @@ -440,6 +442,11 @@ fn extract_partition_values( #[cfg(test)] mod tests { + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use std::fs::File; + use std::sync::Arc; + use super::*; use crate::arrow::array::Int32Array; use crate::arrow::datatypes::{ @@ -448,11 +455,6 @@ mod tests { use crate::writer::test_utils::get_delta_schema; use crate::writer::DeltaWriter; use crate::writer::JsonWriter; - use crate::Schema; - use parquet::file::reader::FileReader; - use parquet::file::serialized_reader::SerializedFileReader; - use std::fs::File; - use std::sync::Arc; #[tokio::test] async fn test_partition_not_written_to_parquet() { @@ -460,7 +462,7 @@ mod tests { let schema = get_delta_schema(); let path = table_dir.path().to_str().unwrap().to_string(); - let arrow_schema = >::try_from(&schema).unwrap(); + let arrow_schema = >::try_from(&schema).unwrap(); let mut writer = JsonWriter::try_new( path.clone(), Arc::new(arrow_schema), diff --git a/crates/deltalake-core/src/writer/mod.rs b/crates/deltalake-core/src/writer/mod.rs index 8c5512127f..478a0b11f2 100644 --- a/crates/deltalake-core/src/writer/mod.rs +++ b/crates/deltalake-core/src/writer/mod.rs @@ -8,8 +8,9 @@ use parquet::errors::ParquetError; use serde_json::Value; use crate::errors::DeltaTableError; +use crate::kernel::{Action, Add}; use crate::operations::transaction::commit; -use crate::protocol::{Action, Add, ColumnCountStat, DeltaOperation, SaveMode}; +use crate::protocol::{ColumnCountStat, DeltaOperation, SaveMode}; use crate::DeltaTable; pub use json::JsonWriter; @@ -133,7 +134,7 @@ pub trait DeltaWriter { /// Flush the internal write buffers to files in the delta table folder structure. /// and commit the changes to the Delta log, creating a new table version. 
async fn flush_and_commit(&mut self, table: &mut DeltaTable) -> Result { - let adds: Vec<_> = self.flush().await?.drain(..).map(Action::add).collect(); + let adds: Vec<_> = self.flush().await?.drain(..).map(Action::Add).collect(); let partition_cols = table.get_metadata()?.partition_columns.clone(); let partition_by = if !partition_cols.is_empty() { Some(partition_cols) diff --git a/crates/deltalake-core/src/writer/record_batch.rs b/crates/deltalake-core/src/writer/record_batch.rs index a6486ae109..b673146907 100644 --- a/crates/deltalake-core/src/writer/record_batch.rs +++ b/crates/deltalake-core/src/writer/record_batch.rs @@ -26,9 +26,10 @@ use super::utils::{ }; use super::{DeltaWriter, DeltaWriterError}; use crate::errors::DeltaTableError; +use crate::kernel::{Add, StructType}; use crate::table::builder::DeltaTableBuilder; use crate::table::DeltaTableMetaData; -use crate::{protocol::Add, storage::DeltaObjectStore, DeltaTable, Schema}; +use crate::{storage::DeltaObjectStore, DeltaTable}; /// Writes messages to a delta lake table. pub struct RecordBatchWriter { @@ -76,7 +77,8 @@ impl RecordBatchWriter { pub fn for_table(table: &DeltaTable) -> Result { // Initialize an arrow schema ref from the delta table schema let metadata = table.get_metadata()?; - let arrow_schema = >::try_from(&metadata.schema.clone())?; + let arrow_schema = + >::try_from(&metadata.schema.clone())?; let arrow_schema_ref = Arc::new(arrow_schema); let partition_columns = metadata.partition_columns.clone(); @@ -103,7 +105,8 @@ impl RecordBatchWriter { &mut self, metadata: &DeltaTableMetaData, ) -> Result { - let schema: ArrowSchema = >::try_from(&metadata.schema)?; + let schema: ArrowSchema = + >::try_from(&metadata.schema)?; let schema_updated = self.arrow_schema_ref.as_ref() != &schema || self.partition_columns != metadata.partition_columns; @@ -450,7 +453,7 @@ mod tests { */ #[tokio::test] async fn test_divide_record_batch_with_map_single_partition() { - use crate::{DeltaOps, SchemaTypeStruct}; + use crate::DeltaOps; let table = crate::writer::test_utils::create_bare_table(); let partition_cols = vec!["modified".to_string()]; @@ -466,13 +469,13 @@ mod tests { ] }"#; - let delta_schema: SchemaTypeStruct = + let delta_schema: StructType = serde_json::from_str(delta_schema).expect("Failed to parse schema"); let table = DeltaOps(table) .create() .with_partition_columns(partition_cols.to_vec()) - .with_columns(delta_schema.get_fields().clone()) + .with_columns(delta_schema.fields().clone()) .await .unwrap(); @@ -484,7 +487,7 @@ mod tests { .as_bytes(); let schema: ArrowSchema = - >::try_from(&delta_schema).unwrap(); + >::try_from(&delta_schema).unwrap(); // Using a batch size of two since the buf above only has two records let mut decoder = ReaderBuilder::new(Arc::new(schema)) diff --git a/crates/deltalake-core/src/writer/stats.rs b/crates/deltalake-core/src/writer/stats.rs index 6cd1961798..2e4f6ac177 100644 --- a/crates/deltalake-core/src/writer/stats.rs +++ b/crates/deltalake-core/src/writer/stats.rs @@ -11,7 +11,8 @@ use parquet::{ }; use super::*; -use crate::protocol::{Add, ColumnValueStat, Stats}; +use crate::kernel::Add; +use crate::protocol::{ColumnValueStat, Stats}; /// Creates an [`Add`] log action struct. 
pub fn create_add( @@ -32,13 +33,15 @@ pub fn create_add( path, size, partition_values: partition_values.to_owned(), - partition_values_parsed: None, modification_time, data_change: true, stats: Some(stats_string), - stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + stats_parsed: None, + partition_values_parsed: None, }) } diff --git a/crates/deltalake-core/src/writer/test_utils.rs b/crates/deltalake-core/src/writer/test_utils.rs index f140c2aa7b..d67931c096 100644 --- a/crates/deltalake-core/src/writer/test_utils.rs +++ b/crates/deltalake-core/src/writer/test_utils.rs @@ -7,10 +7,11 @@ use arrow::compute::take; use arrow_array::{Int32Array, Int64Array, RecordBatch, StringArray, StructArray, UInt32Array}; use arrow_schema::{DataType, Field, Schema as ArrowSchema}; -use crate::operations::{create::CreateBuilder, DeltaOps}; -use crate::schema::{Schema, SchemaTypeStruct}; +use crate::kernel::{DataType as DeltaDataType, PrimitiveType, StructField, StructType}; +use crate::operations::create::CreateBuilder; +use crate::operations::DeltaOps; use crate::table::DeltaTableMetaData; -use crate::{DeltaConfigKey, DeltaTable, DeltaTableBuilder, SchemaDataType, SchemaField}; +use crate::{DeltaConfigKey, DeltaTable, DeltaTableBuilder}; pub type TestResult = Result<(), Box>; @@ -131,25 +132,22 @@ fn data_without_null() -> (Int32Array, StringArray, StringArray) { (base_int, base_str, base_mod) } -pub fn get_delta_schema() -> Schema { - Schema::new(vec![ - SchemaField::new( +pub fn get_delta_schema() -> StructType { + StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DeltaDataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), ]) } @@ -250,36 +248,31 @@ pub fn get_record_batch_with_nested_struct() -> RecordBatch { .unwrap() } -pub fn get_delta_schema_with_nested_struct() -> Schema { - Schema::new(vec![ - SchemaField::new( +pub fn get_delta_schema_with_nested_struct() -> StructType { + StructType::new(vec![ + StructField::new( "id".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DeltaDataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "modified".to_string(), - SchemaDataType::primitive("string".to_string()), + DeltaDataType::Primitive(PrimitiveType::String), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( String::from("nested"), - SchemaDataType::r#struct(SchemaTypeStruct::new(vec![SchemaField::new( + DeltaDataType::Struct(Box::new(StructType::new(vec![StructField::new( String::from("count"), - SchemaDataType::primitive(String::from("integer")), + DeltaDataType::Primitive(PrimitiveType::Integer), true, - Default::default(), - )])), + )]))), true, - Default::default(), ), ]) } @@ -291,7 +284,7 @@ pub async fn setup_table_with_configuration( let table_schema = get_delta_schema(); DeltaOps::new_in_memory() 
.create() - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_configuration_property(key, value) .await .expect("Failed to create table") @@ -314,7 +307,7 @@ pub async fn create_initialized_table(partition_cols: &[String]) -> DeltaTable { .with_location(table_path.to_str().unwrap()) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partition_cols) .await .unwrap() diff --git a/crates/deltalake-core/tests/checkpoint_writer.rs b/crates/deltalake-core/tests/checkpoint_writer.rs index 6788346ef8..b1fc28faed 100644 --- a/crates/deltalake-core/tests/checkpoint_writer.rs +++ b/crates/deltalake-core/tests/checkpoint_writer.rs @@ -211,7 +211,7 @@ mod checkpoints_with_tombstones { use super::*; use ::object_store::path::Path as ObjectStorePath; use chrono::Utc; - use deltalake_core::protocol::*; + use deltalake_core::kernel::*; use deltalake_core::table::config::DeltaConfigKey; use deltalake_core::*; use maplit::hashmap; @@ -346,6 +346,8 @@ mod checkpoints_with_tombstones { size: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, }) .collect(); @@ -357,8 +359,8 @@ mod checkpoints_with_tombstones { let actions = removes .iter() .cloned() - .map(Action::remove) - .chain(std::iter::once(Action::add(add.clone()))) + .map(Action::Remove) + .chain(std::iter::once(Action::Add(add.clone()))) .collect(); let operation = DeltaOperation::Optimize { predicate: None, @@ -389,7 +391,7 @@ mod checkpoints_with_tombstones { let actions = actions .iter() .filter_map(|a| match a { - Action::remove(r) => Some(r.clone()), + Action::Remove(r) => Some(r.clone()), _ => None, }) .collect(); @@ -408,6 +410,8 @@ mod checkpoints_with_tombstones { size: Some(100), tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, } } diff --git a/crates/deltalake-core/tests/command_optimize.rs b/crates/deltalake-core/tests/command_optimize.rs index 70d161d69e..a923d0064d 100644 --- a/crates/deltalake-core/tests/command_optimize.rs +++ b/crates/deltalake-core/tests/command_optimize.rs @@ -4,18 +4,19 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; use std::{collections::HashMap, error::Error, sync::Arc}; use arrow_array::{Int32Array, RecordBatch, StringArray}; -use arrow_schema::{DataType, Field, Schema as ArrowSchema}; +use arrow_schema::{DataType as ArrowDataType, Field, Schema as ArrowSchema}; use arrow_select::concat::concat_batches; use deltalake_core::errors::DeltaTableError; +use deltalake_core::kernel::{Action, DataType, PrimitiveType, Remove, StructField}; use deltalake_core::operations::optimize::{ create_merge_plan, MetricDetails, Metrics, OptimizeType, }; use deltalake_core::operations::transaction::commit; use deltalake_core::operations::DeltaOps; -use deltalake_core::protocol::{Action, DeltaOperation, Remove}; +use deltalake_core::protocol::DeltaOperation; use deltalake_core::storage::ObjectStoreRef; use deltalake_core::writer::{DeltaWriter, RecordBatchWriter}; -use deltalake_core::{DeltaTable, PartitionFilter, Path, SchemaDataType, SchemaField}; +use deltalake_core::{DeltaTable, PartitionFilter, Path}; use futures::TryStreamExt; use object_store::ObjectStore; use parquet::arrow::async_reader::ParquetObjectReader; @@ -32,23 +33,20 @@ struct Context { async fn setup_test(partitioned: bool) -> Result> { let columns = vec![ - SchemaField::new( + 
StructField::new( "x".to_owned(), - SchemaDataType::primitive("integer".to_owned()), + DataType::Primitive(PrimitiveType::Integer), false, - HashMap::new(), ), - SchemaField::new( + StructField::new( "y".to_owned(), - SchemaDataType::primitive("integer".to_owned()), + DataType::Primitive(PrimitiveType::Integer), false, - HashMap::new(), ), - SchemaField::new( + StructField::new( "date".to_owned(), - SchemaDataType::primitive("string".to_owned()), + DataType::Primitive(PrimitiveType::String), false, - HashMap::new(), ), ]; @@ -92,9 +90,9 @@ fn generate_random_batch>( Ok(RecordBatch::try_new( Arc::new(ArrowSchema::new(vec![ - Field::new("x", DataType::Int32, false), - Field::new("y", DataType::Int32, false), - Field::new("date", DataType::Utf8, false), + Field::new("x", ArrowDataType::Int32, false), + Field::new("y", ArrowDataType::Int32, false), + Field::new("date", ArrowDataType::Utf8, false), ])), vec![Arc::new(x_array), Arc::new(y_array), Arc::new(date_array)], )?) @@ -121,9 +119,9 @@ fn tuples_to_batch>( Ok(RecordBatch::try_new( Arc::new(ArrowSchema::new(vec![ - Field::new("x", DataType::Int32, false), - Field::new("y", DataType::Int32, false), - Field::new("date", DataType::Utf8, false), + Field::new("x", ArrowDataType::Int32, false), + Field::new("y", ArrowDataType::Int32, false), + Field::new("date", ArrowDataType::Utf8, false), ])), vec![Arc::new(x_array), Arc::new(y_array), Arc::new(date_array)], )?) @@ -294,12 +292,14 @@ async fn test_conflict_for_remove_actions() -> Result<(), Box> { partition_values: Some(add.partition_values.clone()), tags: Some(HashMap::new()), deletion_vector: add.deletion_vector.clone(), + base_row_id: add.base_row_id, + default_row_commit_version: add.default_row_commit_version, }; let operation = DeltaOperation::Delete { predicate: None }; commit( other_dt.object_store().as_ref(), - &vec![Action::remove(remove)], + &vec![Action::Remove(remove)], operation, &other_dt.state, None, diff --git a/crates/deltalake-core/tests/command_restore.rs b/crates/deltalake-core/tests/command_restore.rs index ac9a37d73b..80c2083261 100644 --- a/crates/deltalake-core/tests/command_restore.rs +++ b/crates/deltalake-core/tests/command_restore.rs @@ -2,12 +2,12 @@ use arrow::datatypes::Schema as ArrowSchema; use arrow_array::{Int32Array, RecordBatch}; -use arrow_schema::{DataType, Field}; +use arrow_schema::{DataType as ArrowDataType, Field}; use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; +use deltalake_core::kernel::{DataType, PrimitiveType, StructField}; use deltalake_core::protocol::SaveMode; -use deltalake_core::{DeltaOps, DeltaTable, SchemaDataType, SchemaField}; +use deltalake_core::{DeltaOps, DeltaTable}; use rand::Rng; -use std::collections::HashMap; use std::error::Error; use std::fs; use std::sync::Arc; @@ -21,17 +21,15 @@ struct Context { async fn setup_test() -> Result> { let columns = vec![ - SchemaField::new( + StructField::new( "id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "value".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), ]; @@ -77,8 +75,8 @@ fn get_record_batch() -> RecordBatch { } let schema = ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, true), - Field::new("value", DataType::Int32, true), + Field::new("id", ArrowDataType::Int32, true), + Field::new("value", ArrowDataType::Int32, true), ]); let id_array = 
Int32Array::from(id_vec); diff --git a/crates/deltalake-core/tests/command_vacuum.rs b/crates/deltalake-core/tests/command_vacuum.rs index f44e1f86b1..0007f479d5 100644 --- a/crates/deltalake-core/tests/command_vacuum.rs +++ b/crates/deltalake-core/tests/command_vacuum.rs @@ -1,9 +1,9 @@ use chrono::Duration; use common::clock::TestClock; use common::TestContext; +use deltalake_core::kernel::StructType; use deltalake_core::operations::vacuum::Clock; use deltalake_core::operations::DeltaOps; -use deltalake_core::Schema; use object_store::{path::Path, Error as ObjectStoreError, ObjectStore}; use serde_json::json; use std::sync::Arc; @@ -11,7 +11,7 @@ use std::sync::Arc; mod common; /// Basic schema -pub fn get_xy_date_schema() -> Schema { +pub fn get_xy_date_schema() -> StructType { serde_json::from_value(json!({ "type": "struct", "fields": [ @@ -24,8 +24,8 @@ pub fn get_xy_date_schema() -> Schema { } /// Schema that contains a column prefiexed with _ -pub fn get_vacuum_underscore_schema() -> Schema { - serde_json::from_value::(json!({ +pub fn get_vacuum_underscore_schema() -> StructType { + serde_json::from_value::(json!({ "type": "struct", "fields": [ {"name": "x", "type": "integer", "nullable": false, "metadata": {}}, diff --git a/crates/deltalake-core/tests/commit_info_format.rs b/crates/deltalake-core/tests/commit_info_format.rs index ba7d80a726..de69397e32 100644 --- a/crates/deltalake-core/tests/commit_info_format.rs +++ b/crates/deltalake-core/tests/commit_info_format.rs @@ -1,8 +1,9 @@ #![allow(dead_code)] mod fs_common; +use deltalake_core::kernel::Action; use deltalake_core::operations::transaction::commit; -use deltalake_core::protocol::{Action, DeltaOperation, SaveMode}; +use deltalake_core::protocol::{DeltaOperation, SaveMode}; use serde_json::json; use std::error::Error; use tempdir::TempDir; @@ -13,7 +14,7 @@ async fn test_operational_parameters() -> Result<(), Box> { let mut table = fs_common::create_table(path.path().to_str().unwrap(), None).await; let add = fs_common::add(0); - let actions = vec![Action::add(add)]; + let actions = vec![Action::Add(add)]; let operation = DeltaOperation::Write { mode: SaveMode::Append, partition_by: Some(vec!["some_partition".to_string()]), diff --git a/crates/deltalake-core/tests/common/mod.rs b/crates/deltalake-core/tests/common/mod.rs index a53d8b7641..80df899323 100644 --- a/crates/deltalake-core/tests/common/mod.rs +++ b/crates/deltalake-core/tests/common/mod.rs @@ -1,12 +1,13 @@ #![allow(dead_code, unused_variables)] use bytes::Bytes; +use deltalake_core::kernel::{Action, Add, Remove, StructType}; use deltalake_core::operations::create::CreateBuilder; use deltalake_core::operations::transaction::commit; -use deltalake_core::protocol::{self, Add, DeltaOperation, Remove, SaveMode}; +use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::DeltaObjectStore; +use deltalake_core::DeltaTable; use deltalake_core::DeltaTableBuilder; -use deltalake_core::{DeltaTable, Schema}; use object_store::{path::Path, ObjectStore}; use std::any::Any; use std::collections::HashMap; @@ -74,7 +75,7 @@ impl TestContext { //Create and set a new table from the provided schema pub async fn create_table_from_schema( &mut self, - schema: Schema, + schema: StructType, partitions: &[&str], ) -> DeltaTable { let p = partitions @@ -86,7 +87,7 @@ impl TestContext { .with_object_store(backend) .with_table_name("delta-rs_test_table") .with_comment("Table created by delta-rs tests") - .with_columns(schema.get_fields().clone()) + 
.with_columns(schema.fields().clone()) .with_partition_columns(p) .await .unwrap() @@ -133,14 +134,20 @@ pub async fn add_file( modification_time: create_time, partition_values: part_values, data_change: true, - ..Default::default() + stats: None, + stats_parsed: None, + partition_values_parsed: None, + tags: None, + default_row_commit_version: None, + base_row_id: None, + deletion_vector: None, }; let operation = DeltaOperation::Write { mode: SaveMode::Append, partition_by: None, predicate: None, }; - let actions = vec![protocol::Action::add(add)]; + let actions = vec![Action::Add(add)]; commit( table.object_store().as_ref(), &actions, @@ -170,10 +177,15 @@ pub async fn remove_file( deletion_timestamp: Some(deletion_timestamp), partition_values: Some(part_values), data_change: true, - ..Default::default() + extended_file_metadata: None, + size: None, + deletion_vector: None, + default_row_commit_version: None, + base_row_id: None, + tags: None, }; let operation = DeltaOperation::Delete { predicate: None }; - let actions = vec![protocol::Action::remove(remove)]; + let actions = vec![Action::Remove(remove)]; commit( table.object_store().as_ref(), &actions, diff --git a/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..3760ad9930 --- /dev/null +++ b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1677811178585,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"635"},"engineInfo":"Databricks-Runtime/","txnId":"a6a94671-55ef-450e-9546-b8465b9147de"}} +{"protocol":{"minReaderVersion":3,"minWriterVersion":7,"readerFeatures":["deletionVectors"],"writerFeatures":["deletionVectors"]}} +{"metaData":{"id":"testId","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.enableDeletionVectors":"true","delta.columnMapping.mode":"none"},"createdTime":1677811175819}} +{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":true}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} diff --git a/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..e5bcdc1163 --- /dev/null +++ b/crates/deltalake-core/tests/data/table-with-dv-small/_delta_log/00000000000000000001.json @@ -0,0 +1,3 @@ +{"commitInfo":{"timestamp":1677811194429,"operation":"DELETE","operationParameters":{"predicate":"[\"(spark_catalog.delta.`/tmp/table-with-dv-small`.value IN (0, 
9))\"]"},"readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":false,"operationMetrics":{"numRemovedFiles":"0","numRemovedBytes":"0","numCopiedRows":"0","numDeletionVectorsAdded":"1","numDeletionVectorsRemoved":"0","numAddedChangeFiles":"0","executionTimeMs":"10364","numDeletedRows":"2","scanTimeMs":"9869","numAddedFiles":"0","numAddedBytes":"0","rewriteTimeMs":"479"},"engineInfo":"Databricks-Runtime/","txnId":"6d9555a2-0e3b-4c15-80c0-d5c3b0cf1277"}} +{"remove":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","deletionTimestamp":1677811194426,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":635,"tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"}}} +{"add":{"path":"part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet","partitionValues":{},"size":635,"modificationTime":1677811178336,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0},\"tightBounds\":false}","tags":{"INSERTION_TIME":"1677811178336000","MIN_INSERTION_TIME":"1677811178336000","MAX_INSERTION_TIME":"1677811178336000","OPTIMIZE_TARGET_SIZE":"268435456"},"deletionVector":{"storageType":"u","pathOrInlineDv":"vBn[lx{q8@P<9BNH/isA","offset":1,"sizeInBytes":36,"cardinality":2}}} diff --git a/crates/deltalake-core/tests/data/table-with-dv-small/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin b/crates/deltalake-core/tests/data/table-with-dv-small/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin new file mode 100644 index 0000000000..f1a01e661c Binary files /dev/null and b/crates/deltalake-core/tests/data/table-with-dv-small/deletion_vector_61d16c75-6994-46b7-a15b-8b538852e50e.bin differ diff --git a/crates/deltalake-core/tests/data/table-with-dv-small/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet b/crates/deltalake-core/tests/data/table-with-dv-small/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet new file mode 100644 index 0000000000..640e643b56 Binary files /dev/null and b/crates/deltalake-core/tests/data/table-with-dv-small/part-00000-fae5310a-a37d-4e51-827b-c3d5516560ca-c000.snappy.parquet differ diff --git a/crates/deltalake-core/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json b/crates/deltalake-core/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..a7941cd087 --- /dev/null +++ b/crates/deltalake-core/tests/data/table-without-dv-small/_delta_log/00000000000000000000.json @@ -0,0 +1,4 @@ +{"commitInfo":{"timestamp":1678020185201,"operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"548"},"engineInfo":"Apache-Spark/3.3.0 Delta-Lake/2.3.0rc1","txnId":"07c0f996-3854-4456-b68b-d1e35e3888cd"}} +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"6524c99f-9a76-4ea1-8ad4-e428a7e065d7","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{},"createdTime":1678020184802}} 
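The table-with-dv-small fixture above adds a file whose add entry carries a deletionVector descriptor. A small sketch of detecting such entries by deserializing a raw _delta_log line into the new kernel Action enum, the same way DeltaTableState::apply_log does in this change (assumes the kernel actions keep their serde log representation):

use deltalake_core::kernel::Action;

/// Returns true when a _delta_log line is an add action that carries a deletion vector,
/// e.g. the second add entry in the table-with-dv-small log above.
fn has_deletion_vector(log_line: &str) -> bool {
    match serde_json::from_str::<Action>(log_line) {
        Ok(Action::Add(add)) => add.deletion_vector.is_some(),
        _ => false,
    }
}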
+{"add":{"path":"part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet","partitionValues":{},"size":548,"modificationTime":1678020185157,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"value\":0},\"maxValues\":{\"value\":9},\"nullCount\":{\"value\":0}}"}} diff --git a/crates/deltalake-core/tests/data/table-without-dv-small/part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet b/crates/deltalake-core/tests/data/table-without-dv-small/part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet new file mode 100644 index 0000000000..7ce78a86b0 Binary files /dev/null and b/crates/deltalake-core/tests/data/table-without-dv-small/part-00000-517f5d32-9c95-48e8-82b4-0229cc194867-c000.snappy.parquet differ diff --git a/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000000.json b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000000.json new file mode 100644 index 0000000000..8ecc70e647 --- /dev/null +++ b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000000.json @@ -0,0 +1,3 @@ +{"protocol":{"minReaderVersion":1,"minWriterVersion":2}} +{"metaData":{"id":"84b09beb-329c-4b5e-b493-f58c6c78b8fd","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"letter\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"int\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"date\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"configuration":{"delta.checkpointInterval":"2"},"createdTime":1674611455081}} +{"commitInfo":{"timestamp":1674611455099,"operation":"CREATE TABLE","operationParameters":{"isManaged":"false","description":null,"partitionBy":"[]","properties":"{\"delta.checkpointInterval\":\"2\"}"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"d87e63fb-7388-4b1c-9afc-750a561012b7"}} diff --git a/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000001.json b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000001.json new file mode 100644 index 0000000000..5f4304c65c --- /dev/null +++ b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000001.json @@ -0,0 +1,2 @@ +{"add":{"path":"part-00000-ad1a4bb7-07e8-4f40-b50b-49910d209e0c-c000.snappy.parquet","partitionValues":{},"size":965,"modificationTime":1674611456921,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"letter\":\"b\",\"int\":288,\"date\":\"1978-02-01\"},\"maxValues\":{\"letter\":\"c\",\"int\":988,\"date\":\"2020-05-01\"},\"nullCount\":{\"letter\":3,\"int\":0,\"date\":0}}"}} +{"commitInfo":{"timestamp":1674611457269,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":0,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"965"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"71d9bcd1-7f2b-46f8-bd1f-e0a8e872f3c3"}} diff --git a/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.checkpoint.parquet b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.checkpoint.parquet new file mode 100644 index 
0000000000..659bf517d6 Binary files /dev/null and b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.checkpoint.parquet differ diff --git a/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json new file mode 100644 index 0000000000..f59c40dd67 --- /dev/null +++ b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000002.json @@ -0,0 +1,3 @@ +{"add":{"path":"part-00000-a190be9e-e3df-439e-b366-06a863f51e99-c000.snappy.parquet","partitionValues":{},"size":976,"modificationTime":1674611458901,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"letter\":\"a\",\"int\":120,\"date\":\"1971-07-01\"},\"maxValues\":{\"letter\":\"c\",\"int\":667,\"date\":\"2018-02-01\"},\"nullCount\":{\"letter\":2,\"int\":0,\"date\":0}}"}} +{"remove":{"path":"part-00000-ad1a4bb7-07e8-4f40-b50b-49910d209e0c-c000.snappy.parquet","deletionTimestamp":1674611459307,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":965}} +{"commitInfo":{"timestamp":1674611459307,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":1,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"976"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"b08f5758-a8e9-4dd1-af7e-7b6e53928d7a"}} diff --git a/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json new file mode 100644 index 0000000000..21a717332f --- /dev/null +++ b/crates/deltalake-core/tests/data/with_checkpoint_no_last_checkpoint/_delta_log/00000000000000000003.json @@ -0,0 +1,3 @@ +{"add":{"path":"part-00000-70b1dcdf-0236-4f63-a072-124cdbafd8a0-c000.snappy.parquet","partitionValues":{},"size":1010,"modificationTime":1674611461541,"dataChange":true,"stats":"{\"numRecords\":5,\"minValues\":{\"letter\":\"a\",\"int\":93,\"date\":\"1975-06-01\"},\"maxValues\":{\"letter\":\"c\",\"int\":753,\"date\":\"2013-03-01\"},\"nullCount\":{\"letter\":1,\"int\":0,\"date\":0}}"}} +{"remove":{"path":"part-00000-a190be9e-e3df-439e-b366-06a863f51e99-c000.snappy.parquet","deletionTimestamp":1674611461982,"dataChange":true,"extendedFileMetadata":true,"partitionValues":{},"size":976}} +{"commitInfo":{"timestamp":1674611461982,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"readVersion":2,"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"1","numOutputRows":"5","numOutputBytes":"1010"},"engineInfo":"Apache-Spark/3.3.1 Delta-Lake/2.1.1","txnId":"0403bbaf-a6f2-4543-9e6c-bd068e76670f"}} diff --git a/crates/deltalake-core/tests/fs_common/mod.rs b/crates/deltalake-core/tests/fs_common/mod.rs index 61227ca46b..dc9ec2547a 100644 --- a/crates/deltalake-core/tests/fs_common/mod.rs +++ b/crates/deltalake-core/tests/fs_common/mod.rs @@ -1,9 +1,12 @@ use chrono::Utc; +use deltalake_core::kernel::{ + Action, Add, DataType, PrimitiveType, Remove, StructField, StructType, +}; use deltalake_core::operations::create::CreateBuilder; use deltalake_core::operations::transaction::commit; -use deltalake_core::protocol::{Action, Add, DeltaOperation, Remove, SaveMode}; +use 
deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::storage::{DeltaObjectStore, GetResult, ObjectStoreResult}; -use deltalake_core::{DeltaTable, Schema, SchemaDataType, SchemaField}; +use deltalake_core::DeltaTable; use object_store::path::Path as StorePath; use object_store::ObjectStore; use serde_json::Value; @@ -36,14 +39,14 @@ pub async fn create_table_from_json( std::fs::create_dir_all(path).unwrap(); std::fs::remove_dir_all(path).unwrap(); std::fs::create_dir_all(path).unwrap(); - let schema: Schema = serde_json::from_value(schema).unwrap(); + let schema: StructType = serde_json::from_value(schema).unwrap(); let config: HashMap> = serde_json::from_value(config).unwrap(); create_test_table(path, schema, partition_columns, config).await } pub async fn create_test_table( path: &str, - schema: Schema, + schema: StructType, partition_columns: Vec<&str>, config: HashMap>, ) -> DeltaTable { @@ -51,7 +54,7 @@ pub async fn create_test_table( .with_location(path) .with_table_name("test-table") .with_comment("A table for running tests") - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_partition_columns(partition_columns) .with_configuration(config) .await @@ -66,11 +69,10 @@ pub async fn create_table( fs::create_dir_all(&log_dir).unwrap(); cleanup_dir_except(log_dir, vec![]); - let schema = Schema::new(vec![SchemaField::new( + let schema = StructType::new(vec![StructField::new( "id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), )]); create_test_table(path, schema, Vec::new(), config.unwrap_or_default()).await @@ -88,6 +90,8 @@ pub fn add(offset_millis: i64) -> Add { stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, } } @@ -97,13 +101,13 @@ pub async fn commit_add(table: &mut DeltaTable, add: &Add) -> i64 { partition_by: None, predicate: None, }; - commit_actions(table, vec![Action::add(add.clone())], operation).await + commit_actions(table, vec![Action::Add(add.clone())], operation).await } pub async fn commit_removes(table: &mut DeltaTable, removes: Vec<&Remove>) -> i64 { let vec = removes .iter() - .map(|r| Action::remove((*r).clone())) + .map(|r| Action::Remove((*r).clone())) .collect(); let operation = DeltaOperation::Delete { predicate: None }; commit_actions(table, vec, operation).await diff --git a/crates/deltalake-core/tests/integration_checkpoint.rs b/crates/deltalake-core/tests/integration_checkpoint.rs index 7b2f9ea026..9b5b0a73ff 100644 --- a/crates/deltalake-core/tests/integration_checkpoint.rs +++ b/crates/deltalake-core/tests/integration_checkpoint.rs @@ -2,11 +2,10 @@ use chrono::Utc; use deltalake_core::checkpoints::{cleanup_expired_logs_for, create_checkpoint}; +use deltalake_core::kernel::{DataType, PrimitiveType}; use deltalake_core::test_utils::{IntegrationContext, StorageIntegration, TestResult}; use deltalake_core::writer::{DeltaWriter, JsonWriter}; -use deltalake_core::{ - errors::DeltaResult, DeltaOps, DeltaTableBuilder, ObjectStore, SchemaDataType, -}; +use deltalake_core::{errors::DeltaResult, DeltaOps, DeltaTableBuilder, ObjectStore}; use object_store::path::Path; use serde_json::json; use serial_test::serial; @@ -121,7 +120,7 @@ async fn test_issue_1420_cleanup_expired_logs_for() -> DeltaResult<()> { .create() .with_column( "id", - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), false, None, ) diff --git 
a/crates/deltalake-core/tests/integration_concurrent_writes.rs b/crates/deltalake-core/tests/integration_concurrent_writes.rs index 0a6470d5d0..bef44d0693 100644 --- a/crates/deltalake-core/tests/integration_concurrent_writes.rs +++ b/crates/deltalake-core/tests/integration_concurrent_writes.rs @@ -1,10 +1,11 @@ #![cfg(feature = "integration_test")] +use deltalake_core::kernel::{Action, Add, DataType, PrimitiveType, StructField, StructType}; use deltalake_core::operations::transaction::commit; use deltalake_core::operations::DeltaOps; -use deltalake_core::protocol::{Action, Add, DeltaOperation, SaveMode}; +use deltalake_core::protocol::{DeltaOperation, SaveMode}; use deltalake_core::test_utils::{IntegrationContext, StorageIntegration, TestResult, TestTables}; -use deltalake_core::{DeltaTable, DeltaTableBuilder, Schema, SchemaDataType, SchemaField}; +use deltalake_core::{DeltaTable, DeltaTableBuilder}; use std::collections::HashMap; use std::future::Future; use std::iter::FromIterator; @@ -49,11 +50,10 @@ async fn test_concurrent_writes(integration: StorageIntegration) -> TestResult { async fn prepare_table( context: &IntegrationContext, ) -> Result<(DeltaTable, String), Box> { - let schema = Schema::new(vec![SchemaField::new( + let schema = StructType::new(vec![StructField::new( "Id".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), )]); let table_uri = context.uri_for_table(TestTables::Custom("concurrent_workers".into())); @@ -64,7 +64,7 @@ async fn prepare_table( let table = DeltaOps(table) .create() - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .await?; assert_eq!(0, table.version()); @@ -153,7 +153,7 @@ impl Worker { partition_by: None, predicate: None, }; - let actions = vec![Action::add(Add { + let actions = vec![Action::Add(Add { path: format!("{}.parquet", name), size: 396, partition_values: HashMap::new(), @@ -164,6 +164,8 @@ impl Worker { stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, })]; let version = commit( self.table.object_store().as_ref(), diff --git a/crates/deltalake-core/tests/integration_datafusion.rs b/crates/deltalake-core/tests/integration_datafusion.rs index 4978ea2a11..3476de6839 100644 --- a/crates/deltalake-core/tests/integration_datafusion.rs +++ b/crates/deltalake-core/tests/integration_datafusion.rs @@ -10,11 +10,10 @@ use std::path::PathBuf; use std::sync::Arc; use arrow::array::*; -use arrow::datatypes::{ +use arrow::record_batch::RecordBatch; +use arrow_schema::{ DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, TimeUnit, }; -use arrow::record_batch::RecordBatch; -use arrow_schema::{DataType, Field}; use datafusion::assert_batches_sorted_eq; use datafusion::datasource::physical_plan::ParquetExec; use datafusion::datasource::TableProvider; @@ -32,20 +31,22 @@ use datafusion_proto::bytes::{ use url::Url; use deltalake_core::delta_datafusion::{DeltaPhysicalCodec, DeltaScan}; +use deltalake_core::kernel::{DataType, MapType, PrimitiveType, StructField, StructType}; use deltalake_core::operations::create::CreateBuilder; use deltalake_core::protocol::SaveMode; use deltalake_core::storage::DeltaObjectStore; use deltalake_core::writer::{DeltaWriter, RecordBatchWriter}; use deltalake_core::{ + open_table, operations::{write::WriteBuilder, DeltaOps}, - DeltaTable, DeltaTableError, Schema, SchemaDataType, SchemaField, + DeltaTable, DeltaTableError, }; use 
std::error::Error; mod common; mod local { - use deltalake::{writer::JsonWriter, SchemaTypeMap}; + use deltalake_core::writer::JsonWriter; use super::*; #[tokio::test] @@ -96,14 +97,14 @@ mod local { let table_dir = tempfile::tempdir().unwrap(); let table_path = table_dir.path(); let table_uri = table_path.to_str().unwrap().to_string(); - let table_schema: Schema = batches[0].schema().try_into().unwrap(); + let table_schema: StructType = batches[0].schema().try_into().unwrap(); let mut table = DeltaOps::try_from_uri(table_uri) .await .unwrap() .create() .with_save_mode(SaveMode::Ignore) - .with_columns(table_schema.get_fields().clone()) + .with_columns(table_schema.fields().clone()) .with_partition_columns(partitions) .await .unwrap(); @@ -153,7 +154,7 @@ mod local { #[tokio::test] async fn test_datafusion_simple_query_partitioned() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/delta-0.8.0-partitioned") + let table = open_table("./tests/data/delta-0.8.0-partitioned") .await .unwrap(); ctx.register_table("demo", Arc::new(table))?; @@ -182,7 +183,7 @@ mod local { let source_scan_bytes = { let ctx = SessionContext::new(); let state = ctx.state(); - let source_table = deltalake::open_table("./tests/data/delta-0.8.0-date").await?; + let source_table = open_table("./tests/data/delta-0.8.0-date").await?; let source_scan = source_table.scan(&state, None, &[], None).await?; physical_plan_to_bytes_with_extension_codec(source_scan, &DeltaPhysicalCodec {})? }; @@ -195,9 +196,9 @@ mod local { &ctx, &DeltaPhysicalCodec {}, )?; - let fields = Schema::try_from(source_scan.schema()) + let fields = StructType::try_from(source_scan.schema()) .unwrap() - .get_fields() + .fields() .clone(); // Create target Delta Table @@ -262,9 +263,7 @@ mod local { #[tokio::test] async fn test_datafusion_date_column() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/delta-0.8.0-date") - .await - .unwrap(); + let table = open_table("./tests/data/delta-0.8.0-date").await.unwrap(); ctx.register_table("dates", Arc::new(table))?; let batches = ctx @@ -283,9 +282,7 @@ mod local { #[tokio::test] async fn test_datafusion_stats() -> Result<()> { - let table = deltalake::open_table("./tests/data/delta-0.8.0") - .await - .unwrap(); + let table = open_table("./tests/data/delta-0.8.0").await.unwrap(); let statistics = table.state.datafusion_table_statistics(); assert_eq!(statistics.num_rows, Some(4),); @@ -735,7 +732,7 @@ mod local { assert_eq!(metrics.num_scanned_files(), 1); // Ensure that tables without stats and partition columns can be pruned for just partitions - // let table = deltalake::open_table("./tests/data/delta-0.8.0-null-partition").await?; + // let table = open_table("./tests/data/delta-0.8.0-null-partition").await?; /* // Logically this should prune. 
See above @@ -765,7 +762,7 @@ mod local { #[tokio::test] async fn test_datafusion_partitioned_types() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/delta-2.2.0-partitioned-types") + let table = open_table("./tests/data/delta-2.2.0-partitioned-types") .await .unwrap(); ctx.register_table("demo", Arc::new(table))?; @@ -814,7 +811,7 @@ mod local { #[tokio::test] async fn test_datafusion_scan_timestamps() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/table_with_edge_timestamps") + let table = open_table("./tests/data/table_with_edge_timestamps") .await .unwrap(); ctx.register_table("demo", Arc::new(table))?; @@ -838,9 +835,7 @@ mod local { #[tokio::test] async fn test_issue_1292_datafusion_sql_projection() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/http_requests") - .await - .unwrap(); + let table = open_table("./tests/data/http_requests").await.unwrap(); ctx.register_table("http_requests", Arc::new(table))?; let batches = ctx @@ -869,9 +864,7 @@ mod local { #[tokio::test] async fn test_issue_1291_datafusion_sql_partitioned_data() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/http_requests") - .await - .unwrap(); + let table = open_table("./tests/data/http_requests").await.unwrap(); ctx.register_table("http_requests", Arc::new(table))?; let batches = ctx @@ -902,9 +895,7 @@ mod local { #[tokio::test] async fn test_issue_1374() -> Result<()> { let ctx = SessionContext::new(); - let table = deltalake::open_table("./tests/data/issue_1374") - .await - .unwrap(); + let table = open_table("./tests/data/issue_1374").await.unwrap(); ctx.register_table("t", Arc::new(table))?; let batches = ctx @@ -939,24 +930,24 @@ mod local { #[tokio::test] async fn test_issue_1619_parquet_panic_using_map_type() -> Result<()> { let _ = tokio::fs::remove_dir_all("./tests/data/issue-1619").await; - let fields: Vec = vec![SchemaField::new( + let fields: Vec = vec![StructField::new( "metadata".to_string(), - SchemaDataType::map(SchemaTypeMap::new( - Box::new(SchemaDataType::primitive("string".to_string())), - Box::new(SchemaDataType::primitive("string".to_string())), + DataType::Map(Box::new(MapType::new( + DataType::Primitive(PrimitiveType::String), + DataType::Primitive(PrimitiveType::String), true, - )), + ))), true, - HashMap::new(), )]; - let schema = deltalake::Schema::new(fields); - let table = deltalake::DeltaTableBuilder::from_uri("./tests/data/issue-1619").build()?; + let schema = StructType::new(fields); + let table = + deltalake_core::DeltaTableBuilder::from_uri("./tests/data/issue-1619").build()?; let _ = DeltaOps::from(table) .create() - .with_columns(schema.get_fields().to_owned()) + .with_columns(schema.fields().to_owned()) .await?; - let mut table = deltalake::open_table("./tests/data/issue-1619").await?; + let mut table = open_table("./tests/data/issue-1619").await?; let mut writer = JsonWriter::for_table(&table).unwrap(); writer @@ -1082,17 +1073,15 @@ mod date_partitions { async fn setup_test() -> Result> { let columns = vec![ - SchemaField::new( + StructField::new( "id".to_owned(), - SchemaDataType::primitive("integer".to_owned()), + DataType::Primitive(PrimitiveType::Integer), false, - HashMap::new(), ), - SchemaField::new( + StructField::new( "date".to_owned(), - SchemaDataType::primitive("date".to_owned()), + DataType::Primitive(PrimitiveType::Date), false, - HashMap::new(), ), ]; @@ 
-1114,8 +1103,8 @@ mod date_partitions { Ok(RecordBatch::try_new( Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("date", DataType::Date32, false), + ArrowField::new("id", ArrowDataType::Int32, false), + ArrowField::new("date", ArrowDataType::Date32, false), ])), vec![Arc::new(ids_array), Arc::new(date_array)], )?) diff --git a/crates/deltalake-core/tests/integration_read.rs b/crates/deltalake-core/tests/integration_read.rs index 3056a3263b..0e17d34397 100644 --- a/crates/deltalake-core/tests/integration_read.rs +++ b/crates/deltalake-core/tests/integration_read.rs @@ -60,7 +60,7 @@ mod local { assert_eq!(table.get_files(), vec![Path::from(a.path.clone())]); // Remove added file. - let r = deltalake::protocol::Remove { + let r = deltalake_core::kernel::Remove { path: a.path.clone(), deletion_timestamp: Some(chrono::Utc::now().timestamp_millis()), data_change: false, @@ -69,6 +69,8 @@ mod local { size: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, }; assert_eq!(2, fs_common::commit_removes(&mut table, vec![&r]).await); @@ -210,12 +212,17 @@ async fn read_simple_table(integration: &IntegrationContext) -> TestResult { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 31); - assert!(tombstones.contains(&deltalake::protocol::Remove { + assert!(tombstones.contains(&deltalake_core::kernel::Remove { path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1587968596250), data_change: true, extended_file_metadata: None, - ..Default::default() + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + partition_values: None, + tags: None, })); Ok(()) @@ -246,11 +253,17 @@ async fn read_simple_table_with_version(integration: &IntegrationContext) -> Tes ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 29); - assert!(tombstones.contains(&deltalake::protocol::Remove { + assert!(tombstones.contains(&deltalake_core::kernel::Remove { path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1587968596250), data_change: true, - ..Default::default() + tags: None, + partition_values: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + deletion_vector: None, + extended_file_metadata: None, })); Ok(()) @@ -291,7 +304,7 @@ mod gcs { #[tokio::test] async fn test_gcs_simple() { let bucket = std::env::var("GCS_DELTA_BUCKET").unwrap(); - let table = deltalake::open_table(format!("gs://{}/simple_table", bucket).as_str()) + let table = deltalake_core::open_table(format!("gs://{}/simple_table", bucket).as_str()) .await .unwrap(); assert_eq!(table.version(), 4); @@ -309,11 +322,17 @@ mod gcs { ); let tombstones = table.get_state().all_tombstones(); assert_eq!(tombstones.len(), 31); - assert!(tombstones.contains(&deltalake::protocol::Remove { + assert!(tombstones.contains(&deltalake_core::kernel::Remove { path: "part-00006-63ce9deb-bc0f-482d-b9a1-7e717b67f294-c000.snappy.parquet".to_string(), deletion_timestamp: Some(1587968596250), data_change: true, - ..Default::default() + extended_file_metadata: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + size: None, + partition_values: None, + tags: None, })); } } diff --git a/crates/deltalake-core/tests/read_delta_partitions_test.rs b/crates/deltalake-core/tests/read_delta_partitions_test.rs index 
c579e242a6..514cdefde8 100644 --- a/crates/deltalake-core/tests/read_delta_partitions_test.rs +++ b/crates/deltalake-core/tests/read_delta_partitions_test.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::convert::TryFrom; -use deltalake_core::schema::SchemaDataType; +use deltalake_core::kernel::{DataType, PrimitiveType}; #[allow(dead_code)] mod fs_common; @@ -50,7 +50,7 @@ fn test_match_partition() { key: "month".to_string(), value: deltalake_core::PartitionValue::Equal("12".to_string()), }; - let string_type = SchemaDataType::primitive(String::from("string")); + let string_type = DataType::Primitive(PrimitiveType::String); assert!(!partition_year_2020_filter.match_partition(&partition_2021, &string_type)); assert!(partition_year_2020_filter.match_partition(&partition_2020, &string_type)); @@ -71,11 +71,13 @@ fn test_match_filters() { }, ]; - let string_type = SchemaDataType::primitive(String::from("string")); - let partition_data_types: HashMap<&str, &SchemaDataType> = - vec![("year", &string_type), ("month", &string_type)] - .into_iter() - .collect(); + let string_type = DataType::Primitive(PrimitiveType::String); + let partition_data_types: HashMap<&String, &DataType> = vec![ + (&partitions[0].key, &string_type), + (&partitions[1].key, &string_type), + ] + .into_iter() + .collect(); let valid_filters = deltalake_core::PartitionFilter { key: "year".to_string(), @@ -101,7 +103,7 @@ fn test_match_filters() { #[cfg(all(feature = "arrow", feature = "parquet"))] #[tokio::test] async fn read_null_partitions_from_checkpoint() { - use deltalake_core::protocol::Add; + use deltalake_core::kernel::Add; use maplit::hashmap; use serde_json::json; diff --git a/crates/deltalake-core/tests/serde/checkpoint_schema.json b/crates/deltalake-core/tests/serde/checkpoint_schema.json new file mode 100644 index 0000000000..9e397cd978 --- /dev/null +++ b/crates/deltalake-core/tests/serde/checkpoint_schema.json @@ -0,0 +1,267 @@ +{ + "type": "struct", + "fields": [ + { + "name": "txn", + "type": { + "type": "struct", + "fields": [ + { + "name": "appId", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "version", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "lastUpdated", + "type": "long", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "add", + "type": { + "type": "struct", + "fields": [ + { + "name": "path", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "partitionValues", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "size", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "modificationTime", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "dataChange", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "tags", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "stats", + "type": "string", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "remove", + "type": { + "type": "struct", + "fields": [ + { + "name": "path", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "deletionTimestamp", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "dataChange", + 
"type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "extendedFileMetadata", + "type": "boolean", + "nullable": true, + "metadata": {} + }, + { + "name": "partitionValues", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "size", + "type": "long", + "nullable": true, + "metadata": {} + }, + { + "name": "tags", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "metaData", + "type": { + "type": "struct", + "fields": [ + { + "name": "id", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "name", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "description", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "format", + "type": { + "type": "struct", + "fields": [ + { + "name": "provider", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "options", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "schemaString", + "type": "string", + "nullable": true, + "metadata": {} + }, + { + "name": "partitionColumns", + "type": { + "type": "array", + "elementType": "string", + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "configuration", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "createdTime", + "type": "long", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "protocol", + "type": { + "type": "struct", + "fields": [ + { + "name": "minReaderVersion", + "type": "integer", + "nullable": true, + "metadata": {} + }, + { + "name": "minWriterVersion", + "type": "integer", + "nullable": true, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + } + ] +} diff --git a/crates/deltalake-core/tests/serde/schema.json b/crates/deltalake-core/tests/serde/schema.json new file mode 100644 index 0000000000..710a9e5080 --- /dev/null +++ b/crates/deltalake-core/tests/serde/schema.json @@ -0,0 +1,68 @@ +{ + "type": "struct", + "fields": [ + { + "name": "a", + "type": "integer", + "nullable": false, + "metadata": {} + }, + { + "name": "b", + "type": { + "type": "struct", + "fields": [ + { + "name": "d", + "type": "integer", + "nullable": false, + "metadata": {} + } + ] + }, + "nullable": true, + "metadata": {} + }, + { + "name": "c", + "type": { + "type": "array", + "elementType": "integer", + "containsNull": false + }, + "nullable": true, + "metadata": {} + }, + { + "name": "e", + "type": { + "type": "array", + "elementType": { + "type": "struct", + "fields": [ + { + "name": "d", + "type": "integer", + "nullable": false, + "metadata": {} + } + ] + }, + "containsNull": true + }, + "nullable": true, + "metadata": {} + }, + { + "name": "f", + "type": { + "type": "map", + "keyType": "string", + "valueType": "string", + "valueContainsNull": true + }, + "nullable": true, + "metadata": {} + } + ] +} diff --git a/crates/deltalake/examples/basic_operations.rs b/crates/deltalake/examples/basic_operations.rs index 13a1c60f73..e697e4cf53 
100644 --- a/crates/deltalake/examples/basic_operations.rs +++ b/crates/deltalake/examples/basic_operations.rs @@ -1,47 +1,45 @@ use deltalake::arrow::{ array::{Int32Array, StringArray, TimestampMicrosecondArray}, - datatypes::{DataType, Field, Schema as ArrowSchema, TimeUnit}, + datatypes::{DataType as ArrowDataType, Field, Schema as ArrowSchema, TimeUnit}, record_batch::RecordBatch, }; +use deltalake::kernel::{DataType, PrimitiveType, StructField}; use deltalake::operations::collect_sendable_stream; use deltalake::parquet::{ basic::{Compression, ZstdLevel}, file::properties::WriterProperties, }; -use deltalake::{protocol::SaveMode, DeltaOps, SchemaDataType, SchemaField}; +use deltalake::{protocol::SaveMode, DeltaOps}; use std::sync::Arc; -fn get_table_columns() -> Vec { +fn get_table_columns() -> Vec { vec![ - SchemaField::new( + StructField::new( String::from("int"), - SchemaDataType::primitive(String::from("integer")), + DataType::Primitive(PrimitiveType::Integer), false, - Default::default(), ), - SchemaField::new( + StructField::new( String::from("string"), - SchemaDataType::primitive(String::from("string")), + DataType::Primitive(PrimitiveType::String), true, - Default::default(), ), - SchemaField::new( + StructField::new( String::from("timestamp"), - SchemaDataType::primitive(String::from("timestamp")), + DataType::Primitive(PrimitiveType::Timestamp), true, - Default::default(), ), ] } fn get_table_batches() -> RecordBatch { let schema = Arc::new(ArrowSchema::new(vec![ - Field::new("int", DataType::Int32, false), - Field::new("string", DataType::Utf8, true), + Field::new("int", ArrowDataType::Int32, false), + Field::new("string", ArrowDataType::Utf8, true), Field::new( "timestamp", - DataType::Timestamp(TimeUnit::Microsecond, None), + ArrowDataType::Timestamp(TimeUnit::Microsecond, None), true, ), ])); diff --git a/crates/deltalake/examples/recordbatch-writer.rs b/crates/deltalake/examples/recordbatch-writer.rs index 1347da1baa..e7fd7125cd 100644 --- a/crates/deltalake/examples/recordbatch-writer.rs +++ b/crates/deltalake/examples/recordbatch-writer.rs @@ -6,11 +6,11 @@ * This example was originally posted by @rtyler in: * */ - use chrono::prelude::*; use deltalake::arrow::array::*; use deltalake::arrow::record_batch::RecordBatch; use deltalake::errors::DeltaTableError; +use deltalake::kernel::{DataType, PrimitiveType, StructField, StructType}; use deltalake::parquet::{ basic::{Compression, ZstdLevel}, file::properties::WriterProperties, @@ -19,8 +19,6 @@ use deltalake::writer::{DeltaWriter, RecordBatchWriter}; use deltalake::Path; use deltalake::*; use log::*; - -use std::collections::HashMap; use std::sync::Arc; /* @@ -86,31 +84,27 @@ struct WeatherRecord { } impl WeatherRecord { - fn columns() -> Vec { + fn columns() -> Vec { vec![ - SchemaField::new( + StructField::new( "timestamp".to_string(), - SchemaDataType::primitive("timestamp".to_string()), + DataType::Primitive(PrimitiveType::Timestamp), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "temp".to_string(), - SchemaDataType::primitive("integer".to_string()), + DataType::Primitive(PrimitiveType::Integer), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "lat".to_string(), - SchemaDataType::primitive("double".to_string()), + DataType::Primitive(PrimitiveType::Float), true, - HashMap::new(), ), - SchemaField::new( + StructField::new( "long".to_string(), - SchemaDataType::primitive("double".to_string()), + DataType::Primitive(PrimitiveType::Float), true, - HashMap::new(), ), ] } @@ -167,7 
+161,7 @@ fn convert_to_batch(table: &DeltaTable, records: &Vec) -> RecordB let metadata = table .get_metadata() .expect("Failed to get metadata for the table"); - let arrow_schema = >::try_from( + let arrow_schema = >::try_from( &metadata.schema.clone(), ) .expect("Failed to convert to arrow schema"); diff --git a/python/src/error.rs b/python/src/error.rs index 1b5a9f6839..f72c6361d2 100644 --- a/python/src/error.rs +++ b/python/src/error.rs @@ -73,6 +73,7 @@ fn checkpoint_to_py(err: ProtocolError) -> PyErr { ProtocolError::ParquetParseError { source } => PyIOError::new_err(source.to_string()), ProtocolError::IO { source } => PyIOError::new_err(source.to_string()), ProtocolError::Generic(msg) => DeltaError::new_err(msg), + ProtocolError::Kernel { source } => DeltaError::new_err(source.to_string()), } } diff --git a/python/src/lib.rs b/python/src/lib.rs index cc6b2202c3..923a06d159 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -26,6 +26,7 @@ use deltalake::datafusion::datasource::provider::TableProvider; use deltalake::datafusion::prelude::SessionContext; use deltalake::delta_datafusion::DeltaDataChecker; use deltalake::errors::DeltaTableError; +use deltalake::kernel::{Action, Add, Invariant, Metadata, Remove, StructType}; use deltalake::operations::delete::DeleteBuilder; use deltalake::operations::filesystem_check::FileSystemCheckBuilder; use deltalake::operations::merge::MergeBuilder; @@ -36,11 +37,9 @@ use deltalake::operations::update::UpdateBuilder; use deltalake::operations::vacuum::VacuumBuilder; use deltalake::parquet::file::properties::WriterProperties; use deltalake::partitions::PartitionFilter; -use deltalake::protocol::{ - self, Action, ColumnCountStat, ColumnValueStat, DeltaOperation, SaveMode, Stats, -}; +use deltalake::protocol::{ColumnCountStat, ColumnValueStat, DeltaOperation, SaveMode, Stats}; +use deltalake::DeltaOps; use deltalake::DeltaTableBuilder; -use deltalake::{DeltaOps, Invariant, Schema}; use pyo3::exceptions::{PyIOError, PyRuntimeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyFrozenSet, PyType}; @@ -262,7 +261,7 @@ impl RawDeltaTable { #[getter] pub fn schema(&self, py: Python) -> PyResult { - let schema: &Schema = self._table.get_schema().map_err(PythonError::from)?; + let schema: &StructType = self._table.get_schema().map_err(PythonError::from)?; schema_to_pyobject(schema, py) } @@ -688,9 +687,9 @@ impl RawDeltaTable { ._table .schema() .ok_or_else(|| DeltaProtocolError::new_err("table does not yet have a schema"))? 
- .get_fields() + .fields() .iter() - .map(|field| field.get_name()) + .map(|field| field.name().as_str()) .collect(); let partition_columns: HashSet<&str> = self ._table @@ -760,13 +759,13 @@ impl RawDeltaTable { partitions_filters: Option>, ) -> PyResult<()> { let mode = save_mode_from_str(mode)?; - let schema: Schema = (&schema.0).try_into().map_err(PythonError::from)?; + let schema: StructType = (&schema.0).try_into().map_err(PythonError::from)?; let existing_schema = self._table.get_schema().map_err(PythonError::from)?; - let mut actions: Vec = add_actions + let mut actions: Vec = add_actions .iter() - .map(|add| Action::add(add.into())) + .map(|add| Action::Add(add.into())) .collect(); match mode { @@ -782,7 +781,7 @@ impl RawDeltaTable { .map_err(PythonError::from)?; for old_add in add_actions { - let remove_action = Action::remove(protocol::Remove { + let remove_action = Action::Remove(Remove { path: old_add.path.clone(), deletion_timestamp: Some(current_timestamp()), data_change: true, @@ -791,6 +790,8 @@ impl RawDeltaTable { size: Some(old_add.size), deletion_vector: old_add.deletion_vector.clone(), tags: old_add.tags.clone(), + base_row_id: old_add.base_row_id, + default_row_commit_version: old_add.default_row_commit_version, }); actions.push(remove_action); } @@ -803,9 +804,9 @@ impl RawDeltaTable { .map_err(PythonError::from)? .clone(); metadata.schema = schema; - let metadata_action = protocol::MetaData::try_from(metadata) + let metadata_action = Metadata::try_from(metadata) .map_err(|_| PyValueError::new_err("Failed to reparse metadata"))?; - actions.push(Action::metaData(metadata_action)); + actions.push(Action::Metadata(metadata_action)); } } _ => { @@ -1108,9 +1109,9 @@ pub struct PyAddAction { stats: Option, } -impl From<&PyAddAction> for protocol::Add { +impl From<&PyAddAction> for Add { fn from(action: &PyAddAction) -> Self { - protocol::Add { + Add { path: action.path.clone(), size: action.size, partition_values: action.partition_values.clone(), @@ -1121,6 +1122,8 @@ impl From<&PyAddAction> for protocol::Add { stats_parsed: None, tags: None, deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, } } } @@ -1143,13 +1146,13 @@ fn write_new_deltalake( .build() .map_err(PythonError::from)?; - let schema: Schema = (&schema.0).try_into().map_err(PythonError::from)?; + let schema: StructType = (&schema.0).try_into().map_err(PythonError::from)?; let mut builder = DeltaOps(table) .create() - .with_columns(schema.get_fields().clone()) + .with_columns(schema.fields().clone()) .with_partition_columns(partition_by) - .with_actions(add_actions.iter().map(|add| Action::add(add.into()))); + .with_actions(add_actions.iter().map(|add| Action::Add(add.into()))); if let Some(name) = &name { builder = builder.with_table_name(name); diff --git a/python/src/schema.rs b/python/src/schema.rs index 77e5f0d4da..c56010f131 100644 --- a/python/src/schema.rs +++ b/python/src/schema.rs @@ -6,15 +6,14 @@ use deltalake::arrow::datatypes::{ }; use deltalake::arrow::error::ArrowError; use deltalake::arrow::pyarrow::PyArrowType; -use deltalake::schema::{ - Schema, SchemaDataType, SchemaField, SchemaTypeArray, SchemaTypeMap, SchemaTypeStruct, +use deltalake::kernel::{ + ArrayType as DeltaArrayType, DataType, MapType as DeltaMapType, PrimitiveType as DeltaPrimitve, + StructField, StructType as DeltaStructType, }; -use lazy_static::lazy_static; use pyo3::exceptions::{PyException, PyNotImplementedError, PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::IntoPyDict; 
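The migration pattern running through these test and binding changes is consistent: schemas are built from `kernel::StructField::new(name, data_type, nullable)` without the old per-field metadata map, accessors drop the `get_` prefix (`fields()`, `name()`), and log actions become enum variants (`Action::Add`, `Action::Remove`) whose structs gain `base_row_id` and `default_row_commit_version`. A condensed sketch of that usage follows; it mirrors the calls made in the tests above but is meant as an illustration, not a complete program.

```rust
use deltalake_core::errors::DeltaResult;
use deltalake_core::kernel::{Action, DataType, PrimitiveType, Remove, StructField, StructType};
use deltalake_core::{DeltaOps, DeltaTable};

// Schema construction drops the old per-field metadata HashMap:
// StructField::new(name, data_type, nullable).
async fn create_example_table() -> DeltaResult<DeltaTable> {
    let schema = StructType::new(vec![
        StructField::new(
            "id".to_string(),
            DataType::Primitive(PrimitiveType::String),
            true,
        ),
        StructField::new(
            "value".to_string(),
            DataType::Primitive(PrimitiveType::Integer),
            true,
        ),
    ]);

    // `fields()` replaces the old `get_fields()` accessor.
    DeltaOps::new_in_memory()
        .create()
        .with_columns(schema.fields().clone())
        .await
}

// Tombstones are built as enum variants rather than via `Action::remove(..)`,
// and the struct now lists the two row-tracking fields added in this change.
fn example_remove(path: &str) -> Action {
    Action::Remove(Remove {
        path: path.to_string(),
        deletion_timestamp: Some(1677811194426),
        data_change: true,
        extended_file_metadata: None,
        partition_values: None,
        size: None,
        tags: None,
        deletion_vector: None,
        base_row_id: None,
        default_row_commit_version: None,
    })
}
```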
use pyo3::{PyRef, PyResult}; -use regex::Regex; use std::collections::HashMap; // PyO3 doesn't yet support converting classes with inheritance with Python @@ -23,55 +22,30 @@ use std::collections::HashMap; // See: https://github.com/PyO3/pyo3/issues/1836 // Decimal is separate special case, since it has parameters -const VALID_PRIMITIVE_TYPES: [&str; 11] = [ - "string", - "long", - "integer", - "short", - "byte", - "float", - "double", - "boolean", - "binary", - "date", - "timestamp", -]; - -fn try_parse_decimal_type(data_type: &str) -> Option<(usize, usize)> { - lazy_static! { - static ref DECIMAL_REGEX: Regex = Regex::new(r"\((\d{1,2}),(\d{1,2})\)").unwrap(); - } - let extract = DECIMAL_REGEX.captures(data_type)?; - let precision = extract - .get(1) - .and_then(|v| v.as_str().parse::().ok())?; - let scale = extract - .get(2) - .and_then(|v| v.as_str().parse::().ok())?; - Some((precision, scale)) -} -fn schema_type_to_python(schema_type: SchemaDataType, py: Python) -> PyResult { +fn schema_type_to_python(schema_type: DataType, py: Python) -> PyResult { match schema_type { - SchemaDataType::primitive(data_type) => Ok((PrimitiveType::new(data_type)?).into_py(py)), - SchemaDataType::array(array_type) => { - let array_type: ArrayType = array_type.into(); + DataType::Primitive(data_type) => { + Ok((PrimitiveType::new(data_type.to_string())?).into_py(py)) + } + DataType::Array(array_type) => { + let array_type: ArrayType = (*array_type).into(); Ok(array_type.into_py(py)) } - SchemaDataType::map(map_type) => { - let map_type: MapType = map_type.into(); + DataType::Map(map_type) => { + let map_type: MapType = (*map_type).into(); Ok(map_type.into_py(py)) } - SchemaDataType::r#struct(struct_type) => { - let struct_type: StructType = struct_type.into(); + DataType::Struct(struct_type) => { + let struct_type: StructType = (*struct_type).into(); Ok(struct_type.into_py(py)) } } } -fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { +fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { if let Ok(data_type) = ob.extract::(py) { - return Ok(SchemaDataType::primitive(data_type.inner_type)); + return Ok(DataType::Primitive(data_type.inner_type)); } if let Ok(array_type) = ob.extract::(py) { return Ok(array_type.into()); @@ -85,7 +59,7 @@ fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { if let Ok(raw_primitive) = ob.extract::(py) { // Pass through PrimitiveType::new() to do validation return PrimitiveType::new(raw_primitive) - .map(|data_type| SchemaDataType::primitive(data_type.inner_type)); + .map(|data_type| DataType::Primitive(data_type.inner_type)); } Err(PyValueError::new_err("Invalid data type")) } @@ -93,14 +67,14 @@ fn python_type_to_schema(ob: PyObject, py: Python) -> PyResult { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct PrimitiveType { - inner_type: String, + inner_type: DeltaPrimitve, } -impl TryFrom for PrimitiveType { +impl TryFrom for PrimitiveType { type Error = PyErr; - fn try_from(value: SchemaDataType) -> PyResult { + fn try_from(value: DataType) -> PyResult { match value { - SchemaDataType::primitive(type_name) => Self::new(type_name), + DataType::Primitive(type_name) => Self::new(type_name.to_string()), _ => Err(PyTypeError::new_err("Type is not primitive")), } } @@ -111,34 +85,41 @@ impl PrimitiveType { #[new] #[pyo3(signature = (data_type))] fn new(data_type: String) -> PyResult { - if data_type.starts_with("decimal") { - if try_parse_decimal_type(&data_type).is_none() { - Err(PyValueError::new_err(format!( - "invalid 
decimal type: {data_type}" - ))) - } else { - Ok(Self { - inner_type: data_type, - }) - } - } else if !VALID_PRIMITIVE_TYPES - .iter() - .any(|&valid| data_type == valid) - { - Err(PyValueError::new_err(format!( - "data_type must be one of decimal(, ), {}.", - VALID_PRIMITIVE_TYPES.join(", ") - ))) - } else { - Ok(Self { - inner_type: data_type, - }) - } + let data_type: DeltaPrimitve = serde_json::from_str(&format!("\"{data_type}\"")) + .map_err(|_| PyValueError::new_err(format!("invalid type string: {data_type}")))?; + + Ok(Self { + inner_type: data_type, + }) + + // if data_type.starts_with("decimal") { + // if try_parse_decimal_type(&data_type).is_none() { + // Err(PyValueError::new_err(format!( + // "invalid decimal type: {data_type}" + // ))) + // } else { + // Ok(Self { + // inner_type: data_type, + // }) + // } + // } else if !VALID_PRIMITIVE_TYPES + // .iter() + // .any(|&valid| data_type == valid) + // { + // Err(PyValueError::new_err(format!( + // "data_type must be one of decimal(, ), {}.", + // VALID_PRIMITIVE_TYPES.join(", ") + // ))) + // } else { + // Ok(Self { + // inner_type: data_type, + // }) + // } } #[getter] fn get_type(&self) -> PyResult { - Ok(self.inner_type.clone()) + Ok(self.inner_type.to_string()) } fn __richcmp__(&self, other: PrimitiveType, cmp: pyo3::basic::CompareOp) -> PyResult { @@ -157,14 +138,14 @@ impl PrimitiveType { #[pyo3(text_signature = "($self)")] fn to_json(&self) -> PyResult { - let inner_type = SchemaDataType::primitive(self.inner_type.clone()); + let inner_type = DataType::Primitive(self.inner_type.clone()); serde_json::to_string(&inner_type).map_err(|err| PyException::new_err(err.to_string())) } #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { - let data_type: SchemaDataType = serde_json::from_str(&type_json) + let data_type: DataType = serde_json::from_str(&type_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; data_type.try_into() @@ -172,7 +153,7 @@ impl PrimitiveType { #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { - let inner_type = SchemaDataType::primitive(self.inner_type.clone()); + let inner_type = DataType::Primitive(self.inner_type.clone()); Ok(PyArrowType((&inner_type).try_into().map_err( |err: ArrowError| PyException::new_err(err.to_string()), )?)) @@ -181,7 +162,7 @@ impl PrimitiveType { #[pyo3(text_signature = "(data_type)")] #[staticmethod] fn from_pyarrow(data_type: PyArrowType) -> PyResult { - let inner_type: SchemaDataType = (&data_type.0) + let inner_type: DataType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -192,26 +173,28 @@ impl PrimitiveType { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct ArrayType { - inner_type: SchemaTypeArray, + inner_type: DeltaArrayType, } -impl From for ArrayType { - fn from(inner_type: SchemaTypeArray) -> Self { +impl From for ArrayType { + fn from(inner_type: DeltaArrayType) -> Self { Self { inner_type } } } -impl From for SchemaDataType { - fn from(arr: ArrayType) -> SchemaDataType { - SchemaDataType::array(arr.inner_type) +impl From for DataType { + fn from(arr: ArrayType) -> DataType { + DataType::Array(Box::new(arr.inner_type)) } } -impl TryFrom for ArrayType { +impl TryFrom for ArrayType { type Error = PyErr; - fn try_from(value: SchemaDataType) -> PyResult { + fn try_from(value: DataType) -> PyResult { match value { - SchemaDataType::array(inner_type) => Ok(Self { inner_type }), + DataType::Array(inner_type) => 
Ok(Self { + inner_type: *inner_type, + }), _ => Err(PyTypeError::new_err("Type is not an array")), } } @@ -222,18 +205,15 @@ impl ArrayType { #[new] #[pyo3(signature = (element_type, contains_null = true))] fn new(element_type: PyObject, contains_null: bool, py: Python) -> PyResult { - let inner_type = SchemaTypeArray::new( - Box::new(python_type_to_schema(element_type, py)?), - contains_null, - ); + let inner_type = + DeltaArrayType::new(python_type_to_schema(element_type, py)?, contains_null); Ok(Self { inner_type }) } fn __repr__(&self, py: Python) -> PyResult { - let type_repr: String = - schema_type_to_python(self.inner_type.get_element_type().clone(), py)? - .call_method0(py, "__repr__")? - .extract(py)?; + let type_repr: String = schema_type_to_python(self.inner_type.element_type().clone(), py)? + .call_method0(py, "__repr__")? + .extract(py)?; Ok(format!( "ArrayType({}, contains_null={})", type_repr, @@ -262,7 +242,7 @@ impl ArrayType { #[getter] fn element_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner_type.get_element_type().to_owned(), py) + schema_type_to_python(self.inner_type.element_type().to_owned(), py) } #[getter] @@ -278,7 +258,7 @@ impl ArrayType { #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { - let data_type: SchemaDataType = serde_json::from_str(&type_json) + let data_type: DataType = serde_json::from_str(&type_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; data_type.try_into() @@ -287,7 +267,7 @@ impl ArrayType { #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( - (&SchemaDataType::array(self.inner_type.clone())) + (&DataType::Array(Box::new(self.inner_type.clone()))) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?, )) @@ -296,7 +276,7 @@ impl ArrayType { #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { - let inner_type: SchemaDataType = (&data_type.0) + let inner_type: DataType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -307,26 +287,28 @@ impl ArrayType { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct MapType { - inner_type: SchemaTypeMap, + inner_type: DeltaMapType, } -impl From for MapType { - fn from(inner_type: SchemaTypeMap) -> Self { +impl From for MapType { + fn from(inner_type: DeltaMapType) -> Self { Self { inner_type } } } -impl From for SchemaDataType { - fn from(map: MapType) -> SchemaDataType { - SchemaDataType::map(map.inner_type) +impl From for DataType { + fn from(map: MapType) -> DataType { + DataType::Map(Box::new(map.inner_type)) } } -impl TryFrom for MapType { +impl TryFrom for MapType { type Error = PyErr; - fn try_from(value: SchemaDataType) -> PyResult { + fn try_from(value: DataType) -> PyResult { match value { - SchemaDataType::map(inner_type) => Ok(Self { inner_type }), + DataType::Map(inner_type) => Ok(Self { + inner_type: *inner_type, + }), _ => Err(PyTypeError::new_err("Type is not a map")), } } @@ -342,27 +324,26 @@ impl MapType { value_contains_null: bool, py: Python, ) -> PyResult { - let inner_type = SchemaTypeMap::new( - Box::new(python_type_to_schema(key_type, py)?), - Box::new(python_type_to_schema(value_type, py)?), + let inner_type = DeltaMapType::new( + python_type_to_schema(key_type, py)?, + python_type_to_schema(value_type, py)?, value_contains_null, ); Ok(Self { inner_type }) } fn __repr__(&self, py: Python) -> PyResult { 
- let key_repr: String = schema_type_to_python(self.inner_type.get_key_type().clone(), py)? + let key_repr: String = schema_type_to_python(self.inner_type.key_type().clone(), py)? + .call_method0(py, "__repr__")? + .extract(py)?; + let value_repr: String = schema_type_to_python(self.inner_type.value_type().clone(), py)? .call_method0(py, "__repr__")? .extract(py)?; - let value_repr: String = - schema_type_to_python(self.inner_type.get_value_type().clone(), py)? - .call_method0(py, "__repr__")? - .extract(py)?; Ok(format!( "MapType({}, {}, value_contains_null={})", key_repr, value_repr, - if self.inner_type.get_value_contains_null() { + if self.inner_type.value_contains_null() { "True" } else { "False" @@ -387,17 +368,17 @@ impl MapType { #[getter] fn key_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner_type.get_key_type().to_owned(), py) + schema_type_to_python(self.inner_type.key_type().to_owned(), py) } #[getter] fn value_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner_type.get_value_type().to_owned(), py) + schema_type_to_python(self.inner_type.value_type().to_owned(), py) } #[getter] fn value_contains_null(&self, py: Python) -> PyResult { - Ok(self.inner_type.get_value_contains_null().into_py(py)) + Ok(self.inner_type.value_contains_null().into_py(py)) } #[pyo3(text_signature = "($self)")] @@ -408,7 +389,7 @@ impl MapType { #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { - let data_type: SchemaDataType = serde_json::from_str(&type_json) + let data_type: DataType = serde_json::from_str(&type_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; data_type.try_into() @@ -417,7 +398,7 @@ impl MapType { #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( - (&SchemaDataType::map(self.inner_type.clone())) + (&DataType::Map(Box::new(self.inner_type.clone()))) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?, )) @@ -426,7 +407,7 @@ impl MapType { #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { - let inner_type: SchemaDataType = (&data_type.0) + let inner_type: DataType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -437,7 +418,7 @@ impl MapType { #[pyclass(module = "deltalake._internal")] #[derive(Clone)] pub struct Field { - inner: SchemaField, + inner: StructField, } #[pymethods] @@ -466,19 +447,20 @@ impl Field { HashMap::new() }; - Ok(Self { - inner: SchemaField::new(name, ty, nullable, metadata), - }) + let mut inner = StructField::new(name, ty, nullable); + inner = inner.with_metadata(metadata); + + Ok(Self { inner }) } #[getter] fn name(&self) -> String { - self.inner.get_name().to_string() + self.inner.name().to_string() } #[getter] fn get_type(&self, py: Python) -> PyResult { - schema_type_to_python(self.inner.get_type().clone(), py) + schema_type_to_python(self.inner.data_type().clone(), py) } #[getter] @@ -489,17 +471,17 @@ impl Field { #[getter] fn metadata(&self, py: Python) -> PyResult { let json_loads = PyModule::import(py, "json")?.getattr("loads")?; - let metadata_json: String = serde_json::to_string(self.inner.get_metadata()) + let metadata_json: String = serde_json::to_string(self.inner.metadata()) .map_err(|err| PyValueError::new_err(err.to_string()))?; Ok(json_loads.call1((metadata_json,))?.to_object(py)) } fn __repr__(&self, py: Python) -> PyResult { - let type_repr: String = 
schema_type_to_python(self.inner.get_type().clone(), py)? + let type_repr: String = schema_type_to_python(self.inner.data_type().clone(), py)? .call_method0(py, "__repr__")? .extract(py)?; - let metadata = self.inner.get_metadata(); + let metadata = self.inner.metadata(); let maybe_metadata = if metadata.is_empty() { "".to_string() } else { @@ -511,7 +493,7 @@ impl Field { }; Ok(format!( "Field({}, {}, nullable={}{})", - self.inner.get_name(), + self.inner.name(), type_repr, if self.inner.is_nullable() { "True" @@ -540,7 +522,7 @@ impl Field { #[staticmethod] #[pyo3(text_signature = "(field_json)")] fn from_json(field_json: String) -> PyResult { - let field: SchemaField = serde_json::from_str(&field_json) + let field: StructField = serde_json::from_str(&field_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; Ok(Self { inner: field }) @@ -557,7 +539,7 @@ impl Field { #[pyo3(text_signature = "(field)")] fn from_pyarrow(field: PyArrowType) -> PyResult { Ok(Self { - inner: SchemaField::try_from(&field.0) + inner: StructField::try_from(&field.0) .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?, }) } @@ -566,26 +548,28 @@ impl Field { #[pyclass(subclass, module = "deltalake._internal")] #[derive(Clone)] pub struct StructType { - inner_type: SchemaTypeStruct, + inner_type: DeltaStructType, } -impl From for StructType { - fn from(inner_type: SchemaTypeStruct) -> Self { +impl From for StructType { + fn from(inner_type: DeltaStructType) -> Self { Self { inner_type } } } -impl From for SchemaDataType { - fn from(str: StructType) -> SchemaDataType { - SchemaDataType::r#struct(str.inner_type) +impl From for DataType { + fn from(str: StructType) -> DataType { + DataType::Struct(Box::new(str.inner_type)) } } -impl TryFrom for StructType { +impl TryFrom for StructType { type Error = PyErr; - fn try_from(value: SchemaDataType) -> PyResult { + fn try_from(value: DataType) -> PyResult { match value { - SchemaDataType::r#struct(inner_type) => Ok(Self { inner_type }), + DataType::Struct(inner_type) => Ok(Self { + inner_type: *inner_type, + }), _ => Err(PyTypeError::new_err("Type is not a struct")), } } @@ -594,18 +578,18 @@ impl TryFrom for StructType { impl StructType { #[new] fn new(fields: Vec>) -> Self { - let fields: Vec = fields + let fields: Vec = fields .into_iter() .map(|field| field.inner.clone()) .collect(); - let inner_type = SchemaTypeStruct::new(fields); + let inner_type = DeltaStructType::new(fields); Self { inner_type } } fn __repr__(&self, py: Python) -> PyResult { let inner_data: Vec = self .inner_type - .get_fields() + .fields() .iter() .map(|field| { let field = Field { @@ -636,7 +620,7 @@ impl StructType { #[getter] fn fields(&self) -> Vec { self.inner_type - .get_fields() + .fields() .iter() .map(|field| Field { inner: field.clone(), @@ -652,7 +636,7 @@ impl StructType { #[staticmethod] #[pyo3(text_signature = "(type_json)")] fn from_json(type_json: String) -> PyResult { - let data_type: SchemaDataType = serde_json::from_str(&type_json) + let data_type: DataType = serde_json::from_str(&type_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; data_type.try_into() @@ -661,7 +645,7 @@ impl StructType { #[pyo3(text_signature = "($self)")] fn to_pyarrow(&self) -> PyResult> { Ok(PyArrowType( - (&SchemaDataType::r#struct(self.inner_type.clone())) + (&DataType::Struct(Box::new(self.inner_type.clone()))) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?, )) @@ -670,7 +654,7 @@ impl StructType { #[staticmethod] 
#[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType) -> PyResult { - let inner_type: SchemaDataType = (&data_type.0) + let inner_type: DataType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -678,9 +662,9 @@ impl StructType { } } -pub fn schema_to_pyobject(schema: &Schema, py: Python) -> PyResult { +pub fn schema_to_pyobject(schema: &DeltaStructType, py: Python) -> PyResult { let fields: Vec = schema - .get_fields() + .fields() .iter() .map(|field| Field { inner: field.clone(), @@ -714,11 +698,11 @@ impl PySchema { #[new] #[pyo3(signature = (fields))] fn new(fields: Vec>) -> PyResult<(Self, StructType)> { - let fields: Vec = fields + let fields: Vec = fields .into_iter() .map(|field| field.inner.clone()) .collect(); - let inner_type = SchemaTypeStruct::new(fields); + let inner_type = DeltaStructType::new(fields); Ok((Self {}, StructType { inner_type })) } @@ -726,7 +710,7 @@ impl PySchema { let super_ = self_.as_ref(); let inner_data: Vec = super_ .inner_type - .get_fields() + .fields() .iter() .map(|field| { let field = Field { @@ -836,7 +820,7 @@ impl PySchema { #[staticmethod] #[pyo3(text_signature = "(data_type)")] fn from_pyarrow(data_type: PyArrowType, py: Python) -> PyResult { - let inner_type: SchemaTypeStruct = (&data_type.0) + let inner_type: DeltaStructType = (&data_type.0) .try_into() .map_err(|err: ArrowError| PyException::new_err(err.to_string()))?; @@ -852,11 +836,19 @@ impl PySchema { #[staticmethod] #[pyo3(text_signature = "(schema_json)")] fn from_json(schema_json: String, py: Python) -> PyResult> { - let data_type: SchemaDataType = serde_json::from_str(&schema_json) + let data_type: DataType = serde_json::from_str(&schema_json) .map_err(|err| PyValueError::new_err(err.to_string()))?; - if let SchemaDataType::r#struct(inner_type) = data_type { - Py::new(py, (Self {}, StructType { inner_type })) + if let DataType::Struct(inner_type) = data_type { + Py::new( + py, + ( + Self {}, + StructType { + inner_type: *inner_type, + }, + ), + ) } else { Err(PyTypeError::new_err("Type is not a struct")) } diff --git a/python/tests/test_schema.py b/python/tests/test_schema.py index e6d45441c0..f63df0e9fb 100644 --- a/python/tests/test_schema.py +++ b/python/tests/test_schema.py @@ -175,7 +175,10 @@ def test_delta_field(): assert field.name == name assert field.type == (PrimitiveType(ty) if isinstance(ty, str) else ty) assert field.nullable == nullable - assert field.metadata == (metadata or {}) + if metadata: + assert json.loads(field.metadata["x"]) == {"y": 3} + else: + assert field.metadata == {} # Field metadata doesn't roundtrip currently # See: https://github.com/apache/arrow-rs/issues/478