-
Notifications
You must be signed in to change notification settings - Fork 435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix checkpoint compatibility for remove fields #427
Changes from all commits
28ab1df
e7847ec
67574d4
31f95cd
f42de42
e33b5c7
ab9670b
56ff5bf
d96c8e3
e28094e
a647d1f
edadc04
0f7403c
6121baa
386a5fb
a1decb6
f8b133e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -162,9 +162,18 @@ impl TryFrom<&schema::SchemaDataType> for ArrowDataType { | |
} | ||
} | ||
|
||
/// Returns an arrow schema representing the delta log for use in checkpoints | ||
/// | ||
/// # Arguments | ||
/// | ||
/// * `table_schema` - The arrow schema representing the table backed by the delta log | ||
/// * `partition_columns` - The list of partition columns of the table. | ||
/// * `use_extended_remove_schema` - Whether to include extended file metadata in remove action schema. | ||
/// Required for compatibility with different versions of Databricks runtime. | ||
pub(crate) fn delta_log_schema_for_table( | ||
table_schema: ArrowSchema, | ||
partition_columns: &[String], | ||
use_extended_remove_schema: bool, | ||
) -> SchemaRef { | ||
lazy_static! { | ||
static ref SCHEMA_FIELDS: Vec<ArrowField> = vec![ | ||
|
@@ -241,47 +250,6 @@ pub(crate) fn delta_log_schema_for_table( | |
]), | ||
true | ||
), | ||
ArrowField::new( | ||
"remove", | ||
ArrowDataType::Struct(vec![ | ||
ArrowField::new("path", ArrowDataType::Utf8, true), | ||
ArrowField::new("deletionTimestamp", ArrowDataType::Int64, true), | ||
ArrowField::new("dataChange", ArrowDataType::Boolean, true), | ||
ArrowField::new("extendedFileMetadata", ArrowDataType::Boolean, true), | ||
ArrowField::new("size", ArrowDataType::Int64, true), | ||
ArrowField::new( | ||
"partitionValues", | ||
ArrowDataType::Map( | ||
Box::new(ArrowField::new( | ||
"key_value", | ||
ArrowDataType::Struct(vec![ | ||
ArrowField::new("key", ArrowDataType::Utf8, false), | ||
ArrowField::new("value", ArrowDataType::Utf8, true), | ||
]), | ||
false | ||
)), | ||
false | ||
), | ||
true | ||
), | ||
ArrowField::new( | ||
"tags", | ||
ArrowDataType::Map( | ||
Box::new(ArrowField::new( | ||
"key_value", | ||
ArrowDataType::Struct(vec![ | ||
ArrowField::new("key", ArrowDataType::Utf8, false), | ||
ArrowField::new("value", ArrowDataType::Utf8, true), | ||
]), | ||
false | ||
)), | ||
false | ||
), | ||
true | ||
) | ||
]), | ||
true | ||
) | ||
]; | ||
static ref ADD_FIELDS: Vec<ArrowField> = vec![ | ||
ArrowField::new("path", ArrowDataType::Utf8, true), | ||
|
@@ -320,17 +288,55 @@ pub(crate) fn delta_log_schema_for_table( | |
true | ||
) | ||
]; | ||
static ref REMOVE_FIELDS: Vec<ArrowField> = vec![ | ||
ArrowField::new("path", ArrowDataType::Utf8, true), | ||
ArrowField::new("deletionTimestamp", ArrowDataType::Int64, true), | ||
ArrowField::new("dataChange", ArrowDataType::Boolean, true), | ||
ArrowField::new("extendedFileMetadata", ArrowDataType::Boolean, true), | ||
]; | ||
static ref REMOVE_EXTENDED_FILE_METADATA_FIELDS: Vec<ArrowField> = vec![ | ||
ArrowField::new("size", ArrowDataType::Int64, true), | ||
ArrowField::new( | ||
"partitionValues", | ||
ArrowDataType::Map( | ||
Box::new(ArrowField::new( | ||
"key_value", | ||
ArrowDataType::Struct(vec![ | ||
ArrowField::new("key", ArrowDataType::Utf8, false), | ||
ArrowField::new("value", ArrowDataType::Utf8, true), | ||
]), | ||
false | ||
)), | ||
false | ||
), | ||
true | ||
), | ||
ArrowField::new( | ||
"tags", | ||
ArrowDataType::Map( | ||
Box::new(ArrowField::new( | ||
"key_value", | ||
ArrowDataType::Struct(vec![ | ||
ArrowField::new("key", ArrowDataType::Utf8, false), | ||
ArrowField::new("value", ArrowDataType::Utf8, true), | ||
]), | ||
false | ||
)), | ||
false | ||
), | ||
true | ||
) | ||
]; | ||
} | ||
|
||
// create add fields according to the specific data table schema | ||
let (partition_fields, non_partition_fields): (Vec<ArrowField>, Vec<ArrowField>) = table_schema | ||
.fields() | ||
.iter() | ||
.map(|f| f.to_owned()) | ||
.partition(|field| partition_columns.contains(field.name())); | ||
|
||
let mut stats_parsed_fields: Vec<ArrowField> = | ||
vec![ArrowField::new("numRecords", ArrowDataType::Int64, true)]; | ||
|
||
if !non_partition_fields.is_empty() { | ||
let mut max_min_vec = Vec::new(); | ||
non_partition_fields | ||
|
@@ -352,15 +358,12 @@ pub(crate) fn delta_log_schema_for_table( | |
|
||
stats_parsed_fields.push(null_count_struct); | ||
} | ||
|
||
let mut add_fields = ADD_FIELDS.clone(); | ||
|
||
add_fields.push(ArrowField::new( | ||
"stats_parsed", | ||
ArrowDataType::Struct(stats_parsed_fields), | ||
true, | ||
)); | ||
|
||
if !partition_fields.is_empty() { | ||
add_fields.push(ArrowField::new( | ||
"partitionValues_parsed", | ||
|
@@ -369,12 +372,24 @@ pub(crate) fn delta_log_schema_for_table( | |
)); | ||
} | ||
|
||
// create remove fields with or without extendedFileMetadata | ||
let mut remove_fields = REMOVE_FIELDS.clone(); | ||
if use_extended_remove_schema { | ||
remove_fields.extend(REMOVE_EXTENDED_FILE_METADATA_FIELDS.clone()); | ||
} | ||
|
||
Comment on lines
+375
to
+380
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @xianwill btw, wouldn't the easier hotfix be to just write extended_file_metadata=false without any other columns? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. However, for full compatibility with Delta 1.0 I agree that we must include them. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @mosyp - Just setting |
||
// include add and remove fields in checkpoint schema | ||
let mut schema_fields = SCHEMA_FIELDS.clone(); | ||
schema_fields.push(ArrowField::new( | ||
"add", | ||
ArrowDataType::Struct(add_fields), | ||
true, | ||
)); | ||
schema_fields.push(ArrowField::new( | ||
"remove", | ||
ArrowDataType::Struct(remove_fields), | ||
true, | ||
)); | ||
|
||
let arrow_schema = ArrowSchema::new(schema_fields); | ||
|
||
|
@@ -442,12 +457,17 @@ mod tests { | |
ArrowField::new("col1", ArrowDataType::Int32, true), | ||
]); | ||
let partition_columns = vec!["pcol".to_string()]; | ||
let log_schema = delta_log_schema_for_table(table_schema, partition_columns.as_slice()); | ||
let log_schema = | ||
delta_log_schema_for_table(table_schema.clone(), partition_columns.as_slice(), false); | ||
|
||
// verify top-level schema contains all expected fields and they are named correctly. | ||
let expected_fields = vec!["metaData", "protocol", "txn", "remove", "add"]; | ||
for f in log_schema.fields().iter() { | ||
assert!(expected_fields.contains(&f.name().as_str())); | ||
} | ||
assert_eq!(5, log_schema.fields().len()); | ||
|
||
// verify add fields match as expected. a lot of transformation goes into these. | ||
let add_fields: Vec<_> = log_schema | ||
.fields() | ||
.iter() | ||
|
@@ -462,12 +482,10 @@ mod tests { | |
.flatten() | ||
.collect(); | ||
assert_eq!(9, add_fields.len()); | ||
|
||
let add_field_map: HashMap<_, _> = add_fields | ||
.iter() | ||
.map(|f| (f.name().to_owned(), f.clone())) | ||
.collect(); | ||
|
||
let partition_values_parsed = add_field_map.get("partitionValues_parsed").unwrap(); | ||
if let ArrowDataType::Struct(fields) = partition_values_parsed.data_type() { | ||
assert_eq!(1, fields.len()); | ||
|
@@ -476,7 +494,6 @@ mod tests { | |
} else { | ||
unreachable!(); | ||
} | ||
|
||
let stats_parsed = add_field_map.get("stats_parsed").unwrap(); | ||
if let ArrowDataType::Struct(fields) = stats_parsed.data_type() { | ||
assert_eq!(4, fields.len()); | ||
|
@@ -508,5 +525,51 @@ mod tests { | |
} else { | ||
unreachable!(); | ||
} | ||
|
||
// verify extended remove schema fields **ARE NOT** included when `use_extended_remove_schema` is false. | ||
let remove_fields: Vec<_> = log_schema | ||
.fields() | ||
.iter() | ||
.filter(|f| f.name() == "remove") | ||
.map(|f| { | ||
if let ArrowDataType::Struct(fields) = f.data_type() { | ||
fields.iter().map(|f| f.clone()) | ||
} else { | ||
unreachable!(); | ||
} | ||
}) | ||
.flatten() | ||
.collect(); | ||
assert_eq!(4, remove_fields.len()); | ||
|
||
// verify extended remove schema fields **ARE** included when `use_extended_remove_schema` is true. | ||
let log_schema = | ||
delta_log_schema_for_table(table_schema, partition_columns.as_slice(), true); | ||
let remove_fields: Vec<_> = log_schema | ||
.fields() | ||
.iter() | ||
.filter(|f| f.name() == "remove") | ||
.map(|f| { | ||
if let ArrowDataType::Struct(fields) = f.data_type() { | ||
fields.iter().map(|f| f.clone()) | ||
} else { | ||
unreachable!(); | ||
} | ||
}) | ||
.flatten() | ||
.collect(); | ||
assert_eq!(7, remove_fields.len()); | ||
let expected_fields = vec![ | ||
"path", | ||
"deletionTimestamp", | ||
"dataChange", | ||
"extendedFileMetadata", | ||
"partitionValues", | ||
"size", | ||
"tags", | ||
]; | ||
for f in remove_fields.iter() { | ||
assert!(expected_fields.contains(&f.name().as_str())); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is to avoid extending the parquet schema with null metadata? E.g. it would otherwise make DBR 8.x fail, I suppose.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Exactly - this is to avoid writing out the additional fields in the schema and thereby prevent breaking DBR 8.x.