From 4d9e288e4a8fded809da37fb34e2ccda1bf03279 Mon Sep 17 00:00:00 2001 From: Nick Lanham Date: Tue, 6 Aug 2024 14:31:12 -0700 Subject: [PATCH] Use `item` as list root name, enable a bunch of tests --- kernel/src/engine/arrow_conversion.rs | 2 +- kernel/tests/golden_tables.rs | 43 +++++++++++++++++---------- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/kernel/src/engine/arrow_conversion.rs b/kernel/src/engine/arrow_conversion.rs index a502b1672..532c7489b 100644 --- a/kernel/src/engine/arrow_conversion.rs +++ b/kernel/src/engine/arrow_conversion.rs @@ -11,7 +11,7 @@ use itertools::Itertools; use crate::error::Error; use crate::schema::{ArrayType, DataType, MapType, PrimitiveType, StructField, StructType}; -pub(crate) const LIST_ARRAY_ROOT: &str = "item"; +pub(crate) const LIST_ARRAY_ROOT: &str = "element"; pub(crate) const MAP_ROOT_DEFAULT: &str = "key_value"; pub(crate) const MAP_KEY_DEFAULT: &str = "key"; pub(crate) const MAP_VALUE_DEFAULT: &str = "value"; diff --git a/kernel/tests/golden_tables.rs b/kernel/tests/golden_tables.rs index b5ff6adfb..97168d9a9 100644 --- a/kernel/tests/golden_tables.rs +++ b/kernel/tests/golden_tables.rs @@ -62,12 +62,26 @@ async fn read_expected(path: &Path) -> DeltaResult { // copied from DAT fn sort_record_batch(batch: RecordBatch) -> DeltaResult { - // Sort by all columns + if batch.num_rows() < 2 { + // 0 or 1 rows doesn't need sorting + return Ok(batch); + } + // Sort by as many columns as possible let mut sort_columns = vec![]; for col in batch.columns() { match col.data_type() { - DataType::Struct(_) | DataType::List(_) | DataType::Map(_, _) => { - // can't sort structs, lists, or maps + DataType::Struct(_) | DataType::Map(_, _) => { + // can't sort by structs or maps + } + DataType::List(list_field) => { + let list_dt = list_field.data_type(); + if list_dt.is_primitive() { + // we can sort lists of primitives + sort_columns.push(SortColumn { + values: col.clone(), + options: None, + }) + } } _ => sort_columns.push(SortColumn { values: col.clone(), @@ -129,10 +143,7 @@ fn assert_columns_match(actual: &[Arc], expected: &[Arc]) let expected = normalize_col(expected.clone()); // note that array equality includes data_type equality // See: https://arrow.apache.org/rust/arrow_data/equal/fn.equal.html - assert_eq!( - &actual, &expected, - "Column data didn't match. Got {actual:?}, expected {expected:?}" - ); + assert_eq!(&actual, &expected, "Column data didn't match."); } } @@ -173,7 +184,7 @@ async fn latest_snapshot_test( let result = concat_batches(&schema, &batches).unwrap(); let result = sort_record_batch(result).unwrap(); let expected = sort_record_batch(expected).unwrap(); - assert_columns_match(expected.columns(), result.columns()); + assert_columns_match(result.columns(), expected.columns()); assert_schema_fields_match(expected.schema().as_ref(), result.schema().as_ref()); assert!( expected.num_rows() == result.num_rows(), @@ -304,8 +315,8 @@ golden_test!( ); golden_test!("checkpoint", checkpoint_test); skip_test!("corrupted-last-checkpoint-kernel": "BUG: should fallback to old commits/checkpoint"); -skip_test!("data-reader-array-complex-objects": "list field expected name item but got name element"); -skip_test!("data-reader-array-primitives": "list field expected name item but got name element"); +golden_test!("data-reader-array-complex-objects", latest_snapshot_test); +golden_test!("data-reader-array-primitives", latest_snapshot_test); golden_test!("data-reader-date-types-America", latest_snapshot_test); golden_test!("data-reader-date-types-Asia", latest_snapshot_test); golden_test!("data-reader-date-types-Etc", latest_snapshot_test); @@ -316,9 +327,11 @@ golden_test!("data-reader-date-types-utc", latest_snapshot_test); golden_test!("data-reader-escaped-chars", latest_snapshot_test); skip_test!("data-reader-map": "map field named 'entries' vs 'key_value'"); golden_test!("data-reader-nested-struct", latest_snapshot_test); -skip_test!("data-reader-nullable-field-invalid-schema-key": - "list field expected name item but got name element"); -skip_test!("data-reader-partition-values": "list field expected name item but got name element"); +golden_test!( + "data-reader-nullable-field-invalid-schema-key", + latest_snapshot_test +); +skip_test!("data-reader-partition-values": "Golden data needs to have 2021-09-08T11:11:11+00:00 as expected value for as_timestamp col"); golden_test!("data-reader-primitives", latest_snapshot_test); golden_test!("data-reader-timestamp_ntz", latest_snapshot_test); skip_test!("data-reader-timestamp_ntz-id-mode": "id column mapping mode not supported"); @@ -366,8 +379,8 @@ golden_test!("multi-part-checkpoint", latest_snapshot_test); golden_test!("only-checkpoint-files", latest_snapshot_test); // TODO some of the parquet tests use projections -skip_test!("parquet-all-types": "list field expected name item but got name element"); -skip_test!("parquet-all-types-legacy-format": "list field expected name item but got name element"); +skip_test!("parquet-all-types": "schemas disagree about nullability, need to figure out which is correct and adjust"); +skip_test!("parquet-all-types-legacy-format": "legacy parquet has name `array`, we should have adjusted this to `element`"); golden_test!("parquet-decimal-dictionaries", latest_snapshot_test); golden_test!("parquet-decimal-dictionaries-v1", latest_snapshot_test); golden_test!("parquet-decimal-dictionaries-v2", latest_snapshot_test);