From 2198a11070b7241dbc239ff4087942a3cbbf544b Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 10:57:10 +0200 Subject: [PATCH 01/12] Add tests for the known failure cases --- serde_arrow/src/test/mod.rs | 1 + serde_arrow/src/test/schema_tracing.rs | 82 ++++++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 serde_arrow/src/test/schema_tracing.rs diff --git a/serde_arrow/src/test/mod.rs b/serde_arrow/src/test/mod.rs index 93168059..4ae98782 100644 --- a/serde_arrow/src/test/mod.rs +++ b/serde_arrow/src/test/mod.rs @@ -2,3 +2,4 @@ mod api_chrono; mod decimal_representations; mod error; mod schema_like; +mod schema_tracing; diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs new file mode 100644 index 00000000..0b5ff2eb --- /dev/null +++ b/serde_arrow/src/test/schema_tracing.rs @@ -0,0 +1,82 @@ +use serde_json::json; + +use crate::internal::{ + error::PanicOnError, + schema::{SchemaLike, SerdeArrowSchema, TracingOptions}, +}; + +// see https://github.com/chmp/serde_arrow/issues/216 +mod json_utf8_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + {"name": "flavor", "data_type": "LargeUtf8", "nullable": true}, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default())?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(str_null, [{"flavor": "delicious"}, {"flavor": null}]); + test!(null_str, [{"flavor": null}, {"flavor": "delicious"}]); +} + +// A mixture of negative and positive ints is traced as i64 +mod json_i64_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + {"name": "value", "data_type": "I64", "nullable": true}, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default())?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(pos_null_neg, [{"value": 42}, {"value": null}, {"value": -13}]); + test!(pos_neg_null, [{"value": 42}, {"value": -13}, {"value": null}]); + test!(null_neg_pos, [{"value": null}, {"value": -13}, {"value": 42}]); + test!(null_pos_neg, [{"value": null}, {"value": 42}, {"value": -13}]); + test!(neg_null_pos, [{"value": -13}, {"value": null}, {"value": 42}]); + test!(neg_pos, [{"value": -13}, {"value": 42}, {"value": null}]); +} + +// Positive ints are traced as u64 +mod json_u64_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + {"name": "value", "data_type": "U64", "nullable": true}, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default())?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(pos_null, [{"value": 42}, {"value": null}]); + test!(null_pos, [{"value": null}, {"value": 42}]); +} From 46edeae9bb94a3ecadbe098c503fa146b804316a Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 11:01:47 +0200 Subject: [PATCH 02/12] Add tests for bool, structs, lists --- serde_arrow/src/test/schema_tracing.rs | 83 ++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs index 0b5ff2eb..f6495371 100644 --- a/serde_arrow/src/test/schema_tracing.rs +++ b/serde_arrow/src/test/schema_tracing.rs @@ -80,3 +80,86 @@ mod json_u64_null { test!(pos_null, [{"value": 42}, {"value": null}]); test!(null_pos, [{"value": null}, {"value": 42}]); } + +mod json_bool_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + {"name": "value", "data_type": "Bool", "nullable": true}, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default())?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(pos_null, [{"value": true}, {"value": null}]); + test!(null_pos, [{"value": null}, {"value": false}]); +} + +mod json_struct_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + { + "name": "value", + "data_type": "Struct", + "nullable": true, + "children": [ + {"name": "child", "data_type": "U32"}, + ] + }, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default())?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(struct_null, [{"value": {"child": 13}}, {"value": null}]); + test!(null_struct, [{"value": null}, {"value": {"child": 13}}]); +} + +mod json_list_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + { + "name": "value", + "data_type": "LargeList", + "nullable": true, + "children": [ + {"name": "element", "data_type": "U32"}, + ] + }, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default())?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(list_null, [{"value": [13]}, {"value": null}]); + test!(null_list, [{"value": null}, {"value": [13]}]); +} From cc1e55e79d4e281c901cf543f8c9d284d35b08e5 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 11:52:42 +0200 Subject: [PATCH 03/12] Fix primitive data type coercion --- serde_arrow/src/internal/schema/tracer.rs | 162 ++++++++++------------ serde_arrow/src/test/schema_tracing.rs | 4 +- 2 files changed, 76 insertions(+), 90 deletions(-) diff --git a/serde_arrow/src/internal/schema/tracer.rs b/serde_arrow/src/internal/schema/tracer.rs index 9e4e6358..28bf71ad 100644 --- a/serde_arrow/src/internal/schema/tracer.rs +++ b/serde_arrow/src/internal/schema/tracer.rs @@ -442,112 +442,98 @@ impl Tracer { item_type: GenericDataType, strategy: Option, ) -> Result<()> { - if self.is_unknown() { - let tracer = dispatch_tracer!(self, tracer => PrimitiveTracer::new( - tracer.name.clone(), - tracer.path.clone(), - tracer.options.clone(), - item_type, - tracer.nullable, - )) - .with_strategy(strategy); - *self = Self::Primitive(tracer); - } else if let Tracer::Primitive(tracer) = self { - use { - GenericDataType::Date64, GenericDataType::LargeUtf8, Strategy::NaiveStrAsDate64, - Strategy::UtcStrAsDate64, - }; - let (item_type, strategy) = match ((&tracer.item_type), (item_type)) { - (Date64, Date64) => match (&tracer.strategy, strategy) { - (Some(NaiveStrAsDate64), Some(NaiveStrAsDate64)) => { - (Date64, Some(NaiveStrAsDate64)) - } - (Some(UtcStrAsDate64), Some(UtcStrAsDate64)) => (Date64, Some(UtcStrAsDate64)), - // incompatible strategies, coerce to string - (_, _) => (LargeUtf8, None), - }, - (LargeUtf8, _) | (_, LargeUtf8) => (LargeUtf8, None), - (prev_ty, new_ty) => { - fail!("mismatched types, previous {prev_ty}, current {new_ty}") - } - }; - tracer.item_type = item_type; - tracer.strategy = strategy; - } else { - let Some(ty) = self.get_type() else { - unreachable!("tracer cannot be unknown"); - }; - fail!("mismatched types, previous {ty}, current {item_type}"); - } - Ok(()) + self.ensure_primitive_with_strategy(item_type, strategy) } pub fn ensure_primitive(&mut self, item_type: GenericDataType) -> Result<()> { - match self { - this @ Self::Unknown(_) => { - let tracer = dispatch_tracer!(this, tracer => PrimitiveTracer::new( - tracer.name.clone(), - tracer.path.clone(), - tracer.options.clone(), - item_type, - tracer.nullable, - )); - *this = Self::Primitive(tracer); - } - Self::Primitive(tracer) if tracer.item_type == item_type => {} - _ => fail!( - "mismatched types, previous {:?}, current {:?}", - self.get_type(), - item_type - ), - } - Ok(()) + self.ensure_primitive_with_strategy(item_type, None) } pub fn ensure_number(&mut self, item_type: GenericDataType) -> Result<()> { + self.ensure_primitive_with_strategy(item_type, None) + } + + fn ensure_primitive_with_strategy( + &mut self, + item_type: GenericDataType, + strategy: Option, + ) -> Result<()> { match self { this @ Self::Unknown(_) => { - let tracer = dispatch_tracer!(this, tracer => PrimitiveTracer::new( - tracer.name.clone(), - tracer.path.clone(), - tracer.options.clone(), + let is_null_type = matches!(item_type, GenericDataType::Null); + let tracer = dispatch_tracer!(this, tracer => PrimitiveTracer { + name: tracer.name.clone(), + path: tracer.path.clone(), + options: tracer.options.clone(), + nullable: tracer.nullable || is_null_type, item_type, - tracer.nullable, - )); + strategy, + }); *this = Self::Primitive(tracer); } - Self::Primitive(tracer) if tracer.options.coerce_numbers => { - use GenericDataType::{F32, F64, I16, I32, I64, I8, U16, U32, U64, U8}; - let item_type = match (&tracer.item_type, item_type) { - // unsigned x unsigned -> u64 - (U8 | U16 | U32 | U64, U8 | U16 | U32 | U64) => U64, - // signed x signed -> i64 - (I8 | I16 | I32 | I64, I8 | I16 | I32 | I64) => I64, - // signed x unsigned -> i64 - (I8 | I16 | I32 | I64, U8 | U16 | U32 | U64) => I64, - // unsigned x signed -> i64 - (U8 | U16 | U32 | U64, I8 | I16 | I32 | I64) => I64, - // float x float -> f64 - (F32 | F64, F32 | F64) => F64, - // int x float -> f64 - (I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64, F32 | F64) => F64, - // float x int -> f64 - (F32 | F64, I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64) => F64, - (ty, ev) => fail!("Cannot accept event {ev} for tracer of primitive type {ty}"), - }; + this @ (Self::List(_) + | Self::Map(_) + | Self::Struct(_) + | Self::Tuple(_) + | Self::Union(_)) => { + if matches!(item_type, GenericDataType::Null) { + dispatch_tracer!(this, tracer => { tracer.nullable = true }); + } else { + fail!("Cannot merge {ty:?} with {item_type}", ty = this.get_type()); + } + } + Self::Primitive(tracer) => { + let (item_type, nullable, strategy) = coerce_primitive_type( + (&tracer.item_type, tracer.nullable, tracer.strategy.as_ref()), + (item_type, strategy), + tracer.options.as_ref(), + )?; + tracer.item_type = item_type; + tracer.strategy = strategy; + tracer.nullable = nullable; } - Self::Primitive(tracer) if tracer.item_type == item_type => {} - _ => fail!( - "mismatched types, previous {:?}, current {:?}", - self.get_type(), - item_type - ), } Ok(()) } } +fn coerce_primitive_type( + prev: (&GenericDataType, bool, Option<&Strategy>), + curr: (GenericDataType, Option), + options: &TracingOptions, +) -> Result<(GenericDataType, bool, Option)> { + use GenericDataType::{ + Date64, LargeUtf8, Null, F32, F64, I16, I32, I64, I8, U16, U32, U64, U8, + }; + + let res = match (prev, curr) { + ((Null, _, _), (curr_ty, curr_st)) => (curr_ty, true, curr_st), + ((prev_ty, _, prev_st), (Null, _)) => (prev_ty.clone(), true, prev_st.cloned()), + // unsigned x unsigned -> u64 + ((U8 | U16 | U32 | U64, nullable, _), (U8 | U16 | U32 | U64, _,)) if options.coerce_numbers => (U64, nullable, None), + // signed x signed -> i64 + ((I8 | I16 | I32 | I64, nullable, _), (I8 | I16 | I32 | I64, _)) if options.coerce_numbers => (I64, nullable, None), + // signed x unsigned -> i64 + ((I8 | I16 | I32 | I64, nullable, _), (U8 | U16 | U32 | U64, _)) if options.coerce_numbers => (I64, nullable, None), + // unsigned x signed -> i64 + ((U8 | U16 | U32 | U64, nullable, _), (I8 | I16 | I32 | I64, _)) if options.coerce_numbers => (I64, nullable, None), + // float x float -> f64 + ((F32 | F64, nullable, _), (F32 | F64, _)) if options.coerce_numbers=> (F64, nullable, None), + // int x float -> f64 + ((I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64, nullable, _), (F32 | F64, _)) if options.coerce_numbers=> (F64, nullable, None), + // float x int -> f64 + ((F32 | F64, nullable, _), (I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64, _)) if options.coerce_numbers=> (F64, nullable, None), + // incompatible formats, coerce to string + ((Date64, nullable, _), (LargeUtf8, _)) => (LargeUtf8, nullable, None), + ((LargeUtf8, nullable, _), (Date64, _)) => (LargeUtf8, nullable, None), + ((Date64, nullable, prev_st), (Date64, curr_st)) if prev_st != curr_st.as_ref() => (LargeUtf8, nullable, None), + ((prev_ty, nullable, prev_st), (curr_ty, curr_st)) if *prev_ty == curr_ty && prev_st == curr_st.as_ref() => (curr_ty, nullable, curr_st), + ((prev_ty, _, prev_st), (curr_ty, curr_st)) => fail!("Cannot accept event {curr_ty} with strategy {curr_st:?} for tracer of primitive type {prev_ty} with strategy {prev_st:?}"), + }; + Ok(res) +} + #[derive(Debug, PartialEq, Clone)] pub struct UnknownTracer { pub name: String, diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs index f6495371..81a894ee 100644 --- a/serde_arrow/src/test/schema_tracing.rs +++ b/serde_arrow/src/test/schema_tracing.rs @@ -42,7 +42,7 @@ mod json_i64_null { ]))?; let data = json!($($data)*); - let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default())?; + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default().coerce_numbers(true))?; assert_eq!(actual, expected); Ok(()) } @@ -54,7 +54,7 @@ mod json_i64_null { test!(null_neg_pos, [{"value": null}, {"value": -13}, {"value": 42}]); test!(null_pos_neg, [{"value": null}, {"value": 42}, {"value": -13}]); test!(neg_null_pos, [{"value": -13}, {"value": null}, {"value": 42}]); - test!(neg_pos, [{"value": -13}, {"value": 42}, {"value": null}]); + test!(neg_pos_null, [{"value": -13}, {"value": 42}, {"value": null}]); } // Positive ints are traced as u64 From 78ade7bde46f4d1e0d074598fb80325f042c4056 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 12:12:45 +0200 Subject: [PATCH 04/12] fix null followed by non-primitive --- serde_arrow/src/internal/schema/tracer.rs | 24 ++++++++++++++++------- serde_arrow/src/test/schema_tracing.rs | 5 +++-- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/serde_arrow/src/internal/schema/tracer.rs b/serde_arrow/src/internal/schema/tracer.rs index 28bf71ad..791ddb2c 100644 --- a/serde_arrow/src/internal/schema/tracer.rs +++ b/serde_arrow/src/internal/schema/tracer.rs @@ -263,7 +263,9 @@ impl Tracer { self.enforce_depth_limit()?; match self { - this @ Self::Unknown(_) => { + this if matches!(this, Self::Unknown(_)) + || matches!(this, Self::Primitive(ref tracer) if tracer.item_type == GenericDataType::Null) => + { let field_names = fields .iter() .map(|field| field.to_string()) @@ -314,7 +316,9 @@ impl Tracer { self.enforce_depth_limit()?; match self { - this @ Self::Unknown(_) => { + this if matches!(this, Self::Unknown(_)) + || matches!(this, Self::Primitive(ref tracer) if tracer.item_type == GenericDataType::Null) => + { let tracer = dispatch_tracer!(this, tracer => TupleTracer { name: tracer.name.clone(), path: tracer.path.clone(), @@ -346,7 +350,9 @@ impl Tracer { self.enforce_depth_limit()?; match self { - this @ Self::Unknown(_) => { + this if matches!(this, Self::Unknown(_)) + || matches!(this, Self::Primitive(ref tracer) if tracer.item_type == GenericDataType::Null) => + { let tracer = dispatch_tracer!(this, tracer => UnionTracer { name: tracer.name.clone(), path: tracer.path.clone(), @@ -382,7 +388,9 @@ impl Tracer { self.enforce_depth_limit()?; match self { - this @ Self::Unknown(_) => { + this if matches!(this, Self::Unknown(_)) + || matches!(this, Self::Primitive(ref tracer) if tracer.item_type == GenericDataType::Null) => + { let tracer = dispatch_tracer!(this, tracer => ListTracer { name: tracer.name.clone(), path: tracer.path.clone(), @@ -409,7 +417,9 @@ impl Tracer { self.enforce_depth_limit()?; match self { - this @ Self::Unknown(_) => { + this if matches!(this, Self::Unknown(_)) + || matches!(this, Self::Primitive(ref tracer) if tracer.item_type == GenericDataType::Null) => + { let tracer = dispatch_tracer!(this, tracer => MapTracer { name: tracer.name.clone(), path: tracer.path.clone(), @@ -521,9 +531,9 @@ fn coerce_primitive_type( // float x float -> f64 ((F32 | F64, nullable, _), (F32 | F64, _)) if options.coerce_numbers=> (F64, nullable, None), // int x float -> f64 - ((I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64, nullable, _), (F32 | F64, _)) if options.coerce_numbers=> (F64, nullable, None), + ((I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64, nullable, _), (F32 | F64, _)) if options.coerce_numbers => (F64, nullable, None), // float x int -> f64 - ((F32 | F64, nullable, _), (I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64, _)) if options.coerce_numbers=> (F64, nullable, None), + ((F32 | F64, nullable, _), (I8 | I16 | I32 | I64 | U8 | U16 | U32 | U64, _)) if options.coerce_numbers => (F64, nullable, None), // incompatible formats, coerce to string ((Date64, nullable, _), (LargeUtf8, _)) => (LargeUtf8, nullable, None), ((LargeUtf8, nullable, _), (Date64, _)) => (LargeUtf8, nullable, None), diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs index 81a894ee..7ed71daa 100644 --- a/serde_arrow/src/test/schema_tracing.rs +++ b/serde_arrow/src/test/schema_tracing.rs @@ -116,8 +116,9 @@ mod json_struct_null { "name": "value", "data_type": "Struct", "nullable": true, + "strategy": "MapAsStruct", "children": [ - {"name": "child", "data_type": "U32"}, + {"name": "child", "data_type": "U64"}, ] }, ]))?; @@ -147,7 +148,7 @@ mod json_list_null { "data_type": "LargeList", "nullable": true, "children": [ - {"name": "element", "data_type": "U32"}, + {"name": "element", "data_type": "U64"}, ] }, ]))?; From ef78816c3794a1d956233135f3831eae44da5937 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 12:25:29 +0200 Subject: [PATCH 05/12] Add Date64 -> String tests --- serde_arrow/src/test/schema_tracing.rs | 102 +++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs index 7ed71daa..4d87ea07 100644 --- a/serde_arrow/src/test/schema_tracing.rs +++ b/serde_arrow/src/test/schema_tracing.rs @@ -164,3 +164,105 @@ mod json_list_null { test!(list_null, [{"value": [13]}, {"value": null}]); test!(null_list, [{"value": null}, {"value": [13]}]); } + +mod json_date64_naive_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + { + "name": "date", + "data_type": "Date64", + "strategy": "NaiveStrAsDate64", + "nullable": true, + }, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default().guess_dates(true))?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(str_null, [{"date": "2024-08-09T12:15:00"}, {"date": null}]); + test!(null_str, [{"date": null}, {"date": "2024-08-09T12:15:00"}]); +} + +mod json_date64_utc_null { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + { + "name": "date", + "data_type": "Date64", + "strategy": "UtcStrAsDate64", + "nullable": true, + }, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default().guess_dates(true))?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(str_null, [{"date": "2024-08-09T12:15:00Z"}, {"date": null}]); + test!(null_str, [{"date": null}, {"date": "2024-08-09T12:15:00Z"}]); +} + +/// Mixing different date formats or dates and non-dates, results in Strings +mod json_date64_to_string_coercions { + use super::*; + + macro_rules! test { + ($name:ident, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + { + "name": "date", + "data_type": "LargeUtf8", + "nullable": true, + }, + ]))?; + + let data = json!($($data)*); + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default().guess_dates(true))?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + test!(utc_naive_null, [{"date": "2024-08-09T12:15:00Z"}, {"date": "2024-08-09T12:15:00"}, {"date": null}]); + test!(utc_null_naive, [{"date": "2024-08-09T12:15:00Z"}, {"date": null}, {"date": "2024-08-09T12:15:00"}]); + test!(naive_utc_null, [{"date": "2024-08-09T12:15:00"}, {"date": "2024-08-09T12:15:00Z"}, {"date": null}]); + test!(naive_null_utc, [{"date": "2024-08-09T12:15:00"}, {"date": null}, {"date": "2024-08-09T12:15:00Z"}]); + test!(null_naive_utc, [{"date": null}, {"date": "2024-08-09T12:15:00"}, {"date": "2024-08-09T12:15:00Z"}]); + test!(null_utc_naive, [{"date": null}, {"date": "2024-08-09T12:15:00Z"}, {"date": "2024-08-09T12:15:00"}]); + + test!(utc_str_null, [{"date": "2024-08-09T12:15:00Z"}, {"date": "foo"}, {"date": null}]); + test!(utc_null_str, [{"date": "2024-08-09T12:15:00Z"}, {"date": null}, {"date": "foo"}]); + test!(str_utc_null, [{"date": "bar"}, {"date": "2024-08-09T12:15:00Z"}, {"date": null}]); + test!(str_null_utc, [{"date": "bar"}, {"date": null}, {"date": "2024-08-09T12:15:00Z"}]); + test!(null_str_utc, [{"date": null}, {"date": "baz"}, {"date": "2024-08-09T12:15:00Z"}]); + test!(null_utc_str, [{"date": null}, {"date": "2024-08-09T12:15:00Z"}, {"date": "baz"}]); + + test!(naive_str_null, [{"date": "2024-08-09T12:15:00"}, {"date": "foo"}, {"date": null}]); + test!(naive_null_str, [{"date": "2024-08-09T12:15:00"}, {"date": null}, {"date": "foo"}]); + test!(str_naive_null, [{"date": "bar"}, {"date": "2024-08-09T12:15:00"}, {"date": null}]); + test!(str_null_naive, [{"date": "bar"}, {"date": null}, {"date": "2024-08-09T12:15:00"}]); + test!(null_str_naive, [{"date": null}, {"date": "baz"}, {"date": "2024-08-09T12:15:00"}]); + test!(null_naive_str, [{"date": null}, {"date": "2024-08-09T12:15:00"}, {"date": "baz"}]); +} From 10f37b5b5b976ba3f9109c12ee33ac12866b331d Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 12:33:01 +0200 Subject: [PATCH 06/12] Update changelog --- Changes.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Changes.md b/Changes.md index a33b7bb2..f83784f3 100644 --- a/Changes.md +++ b/Changes.md @@ -1,5 +1,9 @@ # Change log +## 0.11.7 + +- Fix tracing of JSON mixing nulls with non-null data + ## 0.11.6 - Add `arrow=52` support From 91a54656482f6f78a6e51b0cb6e832db65f54a3f Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 13:15:21 +0200 Subject: [PATCH 07/12] Fix number coercion (and add tests) --- serde_arrow/src/internal/schema/tracer.rs | 2 +- serde_arrow/src/test/schema_tracing.rs | 126 ++++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/serde_arrow/src/internal/schema/tracer.rs b/serde_arrow/src/internal/schema/tracer.rs index 791ddb2c..a2b50a3f 100644 --- a/serde_arrow/src/internal/schema/tracer.rs +++ b/serde_arrow/src/internal/schema/tracer.rs @@ -518,6 +518,7 @@ fn coerce_primitive_type( }; let res = match (prev, curr) { + ((prev_ty, nullable, prev_st), (curr_ty, curr_st)) if prev_ty == &curr_ty && prev_st == curr_st.as_ref() => (curr_ty, nullable, curr_st), ((Null, _, _), (curr_ty, curr_st)) => (curr_ty, true, curr_st), ((prev_ty, _, prev_st), (Null, _)) => (prev_ty.clone(), true, prev_st.cloned()), // unsigned x unsigned -> u64 @@ -538,7 +539,6 @@ fn coerce_primitive_type( ((Date64, nullable, _), (LargeUtf8, _)) => (LargeUtf8, nullable, None), ((LargeUtf8, nullable, _), (Date64, _)) => (LargeUtf8, nullable, None), ((Date64, nullable, prev_st), (Date64, curr_st)) if prev_st != curr_st.as_ref() => (LargeUtf8, nullable, None), - ((prev_ty, nullable, prev_st), (curr_ty, curr_st)) if *prev_ty == curr_ty && prev_st == curr_st.as_ref() => (curr_ty, nullable, curr_st), ((prev_ty, _, prev_st), (curr_ty, curr_st)) => fail!("Cannot accept event {curr_ty} with strategy {curr_st:?} for tracer of primitive type {prev_ty} with strategy {prev_st:?}"), }; Ok(res) diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs index 4d87ea07..e7b44946 100644 --- a/serde_arrow/src/test/schema_tracing.rs +++ b/serde_arrow/src/test/schema_tracing.rs @@ -266,3 +266,129 @@ mod json_date64_to_string_coercions { test!(null_str_naive, [{"date": null}, {"date": "baz"}, {"date": "2024-08-09T12:15:00"}]); test!(null_naive_str, [{"date": null}, {"date": "2024-08-09T12:15:00"}, {"date": "baz"}]); } + +mod untagged_enum_number_coercion { + use serde::Serialize; + + use super::*; + + #[derive(Debug, Serialize)] + #[serde(untagged)] + enum Num { + Null(()), + I8(i8), + I16(i16), + I32(i32), + I64(i64), + U8(u8), + U16(u16), + U32(u32), + U64(u64), + F32(f32), + F64(f64), + } + + macro_rules! test_impl { + ($name:ident, $data_type:expr, $nullable:expr, $($data:tt)*) => { + #[test] + fn $name() -> PanicOnError<()> { + let expected = SerdeArrowSchema::from_value(&json!([ + { + "name": "0", + "data_type": $data_type, + "nullable": $nullable, + }, + ]))?; + + let data = $($data)*; + let actual = SerdeArrowSchema::from_samples(&data, TracingOptions::default().coerce_numbers(true))?; + assert_eq!(actual, expected); + Ok(()) + } + }; + } + + macro_rules! test { + ($name:ident, $data_type:expr, $nullable:expr, [$a:expr, $b:expr]) => { + mod $name { + use super::*; + + test_impl!(ab, $data_type, $nullable, [$a, $b]); + test_impl!(ba, $data_type, $nullable, [$b, $a]); + } + }; + ($name:ident, $data_type:expr, $nullable:expr, [$a:expr, $b:expr, $c:expr]) => { + mod $name { + use super::*; + + test_impl!(abc, $data_type, $nullable, [$a, $b, $c]); + test_impl!(acb, $data_type, $nullable, [$a, $c, $b]); + test_impl!(bac, $data_type, $nullable, [$b, $a, $c]); + test_impl!(bca, $data_type, $nullable, [$b, $c, $a]); + test_impl!(cab, $data_type, $nullable, [$c, $a, $b]); + test_impl!(cba, $data_type, $nullable, [$c, $b, $a]); + } + }; + } + + test!(i32_i32_undecorated, "I32", false, [(0_i32,), (0_i32,)]); + test!(i8_i8, "I8", false, [(Num::I8(0),), (Num::I8(0),)]); + test!(i16_i16, "I16", false, [(Num::I16(0),), (Num::I16(0),)]); + test!(i32_i32, "I32", false, [(Num::I32(0),), (Num::I32(0),)]); + test!(i64_i64, "I64", false, [(Num::I64(0),), (Num::I64(0),)]); + test!(u8_u8, "U8", false, [(Num::U8(0),), (Num::U8(0),)]); + test!(u16_u16, "U16", false, [(Num::U16(0),), (Num::U16(0),)]); + test!(u32_u32, "U32", false, [(Num::U32(0),), (Num::U32(0),)]); + test!(u64_u64, "U64", false, [(Num::U64(0),), (Num::U64(0),)]); + test!(f32_f32, "F32", false, [(Num::F32(0.0),), (Num::F32(0.0),)]); + test!(f64_f64, "F64", false, [(Num::F64(0.0),), (Num::F64(0.0),)]); + + // _, null -> nullable + test!(i8_null, "I8", true, [(Num::I8(0),), (Num::Null(()),)]); + test!(i16_null, "I16", true, [(Num::I16(0),), (Num::Null(()),)]); + test!(i32_null, "I32", true, [(Num::I32(0),), (Num::Null(()),)]); + test!(i64_null, "I64", true, [(Num::I64(0),), (Num::Null(()),)]); + test!(u8_null, "U8", true, [(Num::U8(0),), (Num::Null(()),)]); + test!(u16_null, "U16", true, [(Num::U16(0),), (Num::Null(()),)]); + test!(u32_null, "U32", true, [(Num::U32(0),), (Num::Null(()),)]); + test!(u64_null, "U64", true, [(Num::U64(0),), (Num::Null(()),)]); + test!(f32_null, "F32", true, [(Num::F32(0.0),), (Num::Null(()),)]); + test!(f64_null, "F64", true, [(Num::F64(0.0),), (Num::Null(()),)]); + + // unsigned, unsigned -> u64 + test!(u8_u16, "U64", false, [(Num::U8(0),), (Num::U16(0),)]); + test!(u8_u32, "U64", false, [(Num::U8(0),), (Num::U32(0),)]); + test!(u8_u64, "U64", false, [(Num::U8(0),), (Num::U64(0),)]); + test!(u16_u32, "U64", false, [(Num::U16(0),), (Num::U32(0),)]); + test!(u16_u64, "U64", false, [(Num::U16(0),), (Num::U64(0),)]); + test!(u32_u64, "U64", false, [(Num::U32(0),), (Num::U64(0),)]); + + // signed,signed -> i64 + test!(i8_i16, "I64", false, [(Num::I8(0),), (Num::I16(0),)]); + test!(i8_i32, "I64", false, [(Num::I8(0),), (Num::I32(0),)]); + test!(i8_i64, "I64", false, [(Num::I8(0),), (Num::I64(0),)]); + test!(i16_i32, "I64", false, [(Num::I16(0),), (Num::I32(0),)]); + test!(i16_i64, "I64", false, [(Num::I16(0),), (Num::I64(0),)]); + test!(i32_i64, "I64", false, [(Num::I32(0),), (Num::I64(0),)]); + + // float, float -> f64 + test!(f32_f64, "F64", false, [(Num::F32(0.0),), (Num::F64(0.0),)]); + + // float, number -> f64 + test!(f32_i8, "F64", false, [(Num::F32(0.0),), (Num::I8(0),)]); + test!(f32_i16, "F64", false, [(Num::F32(0.0),), (Num::I16(0),)]); + test!(f32_i32, "F64", false, [(Num::F32(0.0),), (Num::I32(0),)]); + test!(f32_i64, "F64", false, [(Num::F32(0.0),), (Num::I64(0),)]); + test!(f32_u8, "F64", false, [(Num::F32(0.0),), (Num::U8(0),)]); + test!(f32_u16, "F64", false, [(Num::F32(0.0),), (Num::U16(0),)]); + test!(f32_u32, "F64", false, [(Num::F32(0.0),), (Num::U32(0),)]); + test!(f32_u64, "F64", false, [(Num::F32(0.0),), (Num::U64(0),)]); + test!(f64_i8, "F64", false, [(Num::F64(0.0),), (Num::I8(0),)]); + test!(f64_i16, "F64", false, [(Num::F64(0.0),), (Num::I16(0),)]); + test!(f64_i32, "F64", false, [(Num::F64(0.0),), (Num::I32(0),)]); + test!(f64_i64, "F64", false, [(Num::F64(0.0),), (Num::I64(0),)]); + test!(f64_u8, "F64", false, [(Num::F64(0.0),), (Num::U8(0),)]); + test!(f64_u16, "F64", false, [(Num::F64(0.0),), (Num::U16(0),)]); + test!(f64_u32, "F64", false, [(Num::F64(0.0),), (Num::U32(0),)]); + test!(f64_u64, "F64", false, [(Num::F64(0.0),), (Num::U64(0),)]); +} From 7746873aa6e3d22579c52d05719a30fa31d8810a Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Fri, 9 Aug 2024 13:19:50 +0200 Subject: [PATCH 08/12] Add some coercion + null examples --- serde_arrow/src/test/schema_tracing.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs index e7b44946..0527ae04 100644 --- a/serde_arrow/src/test/schema_tracing.rs +++ b/serde_arrow/src/test/schema_tracing.rs @@ -391,4 +391,30 @@ mod untagged_enum_number_coercion { test!(f64_u16, "F64", false, [(Num::F64(0.0),), (Num::U16(0),)]); test!(f64_u32, "F64", false, [(Num::F64(0.0),), (Num::U32(0),)]); test!(f64_u64, "F64", false, [(Num::F64(0.0),), (Num::U64(0),)]); + + // some coercion + null examples + test!( + f64_u8_null, + "F64", + true, + [(Num::F64(0.0),), (Num::U8(0),), (Num::Null(()),)] + ); + test!( + i32_i64_null, + "I64", + true, + [(Num::I32(0),), (Num::I64(0),), (Num::Null(()),)] + ); + test!( + f32_f64_null, + "F64", + true, + [(Num::F32(0.0),), (Num::F64(0.0),), (Num::Null(()),)] + ); + test!( + u8_u16_null, + "U64", + true, + [(Num::U8(0),), (Num::U16(0),), (Num::Null(()),)] + ); } From 1edd53fa7a16e5bcd1155bcecd4aac7820177578 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sun, 4 Aug 2024 10:15:40 +0200 Subject: [PATCH 09/12] Add known cfgs to Cargo.toml --- serde_arrow/Cargo.toml | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/serde_arrow/Cargo.toml b/serde_arrow/Cargo.toml index ee35bba1..5451b00c 100644 --- a/serde_arrow/Cargo.toml +++ b/serde_arrow/Cargo.toml @@ -166,4 +166,31 @@ features = [ "serde-with-float", # NOTE activating this feature breaks JSON -> float processing # "serde-with-arbitrary-precision", -] \ No newline at end of file +] + +[lints.rust.unexpected_cfgs] +level = "warn" +check-cfg = [ + 'cfg(has_arrow2)', + 'cfg(has_arrow2_0_17)', + 'cfg(has_arrow2_0_16)', + 'cfg(has_arrow)', + 'cfg(has_arrow_fixed_binary_support)', + # arrow-version:insert: 'cfg(has_arrow_{version})', + 'cfg(has_arrow_52)', + 'cfg(has_arrow_51)', + 'cfg(has_arrow_50)', + 'cfg(has_arrow_49)', + 'cfg(has_arrow_48)', + 'cfg(has_arrow_47)', + 'cfg(has_arrow_46)', + 'cfg(has_arrow_45)', + 'cfg(has_arrow_44)', + 'cfg(has_arrow_43)', + 'cfg(has_arrow_42)', + 'cfg(has_arrow_41)', + 'cfg(has_arrow_40)', + 'cfg(has_arrow_39)', + 'cfg(has_arrow_38)', + 'cfg(has_arrow_37)', +] From 2d157927e66d1c17a14132d8653113e188b49fc7 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 31 Aug 2024 16:52:56 +0200 Subject: [PATCH 10/12] Remove unused arrow_36 cfg --- serde_arrow/src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 64f6f8a9..51ed3373 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -284,7 +284,6 @@ pub mod _impl { #[cfg(has_arrow_39)] build_arrow_crate!(arrow_array_39, arrow_buffer_39, arrow_data_39, arrow_schema_39); #[cfg(has_arrow_38)] build_arrow_crate!(arrow_array_38, arrow_buffer_38, arrow_data_38, arrow_schema_38); #[cfg(has_arrow_37)] build_arrow_crate!(arrow_array_37, arrow_buffer_37, arrow_data_37, arrow_schema_37); - #[cfg(has_arrow_36)] build_arrow_crate!(arrow_array_36, arrow_buffer_36, arrow_data_36, arrow_schema_36); /// Documentation pub mod docs { From b82c134c4049b88f5120c024a40842e6fd0d4f99 Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 31 Aug 2024 16:56:54 +0200 Subject: [PATCH 11/12] Document the tracing tests --- serde_arrow/src/test/schema_tracing.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/serde_arrow/src/test/schema_tracing.rs b/serde_arrow/src/test/schema_tracing.rs index 0527ae04..8c826498 100644 --- a/serde_arrow/src/test/schema_tracing.rs +++ b/serde_arrow/src/test/schema_tracing.rs @@ -5,7 +5,9 @@ use crate::internal::{ schema::{SchemaLike, SerdeArrowSchema, TracingOptions}, }; -// see https://github.com/chmp/serde_arrow/issues/216 +/// A mixture of nulls and strings is parsed as a nullable LargeUtf8 +/// +/// see https://github.com/chmp/serde_arrow/issues/216 mod json_utf8_null { use super::*; @@ -29,7 +31,7 @@ mod json_utf8_null { test!(null_str, [{"flavor": null}, {"flavor": "delicious"}]); } -// A mixture of negative and positive ints is traced as i64 +/// A mixture of negative, positive ints and nulls is traced as a nullable i64 mod json_i64_null { use super::*; @@ -57,7 +59,7 @@ mod json_i64_null { test!(neg_pos_null, [{"value": -13}, {"value": 42}, {"value": null}]); } -// Positive ints are traced as u64 +/// Positive ints and nulls are traced as a nullable u64 mod json_u64_null { use super::*; @@ -81,6 +83,7 @@ mod json_u64_null { test!(null_pos, [{"value": null}, {"value": 42}]); } +/// Bools and nulls are traced as a nullable Bool mod json_bool_null { use super::*; @@ -104,6 +107,7 @@ mod json_bool_null { test!(null_pos, [{"value": null}, {"value": false}]); } +/// Structs and nulls are traced as nullable structs mod json_struct_null { use super::*; @@ -135,6 +139,7 @@ mod json_struct_null { test!(null_struct, [{"value": null}, {"value": {"child": 13}}]); } +/// Lists and nulls are traced as nullable lists mod json_list_null { use super::*; @@ -165,6 +170,7 @@ mod json_list_null { test!(null_list, [{"value": null}, {"value": [13]}]); } +/// Strings encoding dates and nulls are traced as nullable Date64 fields mod json_date64_naive_null { use super::*; @@ -193,6 +199,7 @@ mod json_date64_naive_null { test!(null_str, [{"date": null}, {"date": "2024-08-09T12:15:00"}]); } +/// Strings encoding dates and nulls are traced as nullable Date64 fields mod json_date64_utc_null { use super::*; @@ -267,6 +274,7 @@ mod json_date64_to_string_coercions { test!(null_naive_str, [{"date": null}, {"date": "2024-08-09T12:15:00"}, {"date": "baz"}]); } +/// Test how different number and null types are coerced (using untagged enums) mod untagged_enum_number_coercion { use serde::Serialize; From 48a12505e41b2c5d345b0ea865f52a001ec1aada Mon Sep 17 00:00:00 2001 From: Christopher Prohm Date: Sat, 31 Aug 2024 17:04:38 +0200 Subject: [PATCH 12/12] Bump version --- Cargo.lock | 2 +- serde_arrow/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e036c5c4..c98188e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2283,7 +2283,7 @@ dependencies = [ [[package]] name = "serde_arrow" -version = "0.11.6" +version = "0.11.7" dependencies = [ "anyhow", "arrow-array 37.0.0", diff --git a/serde_arrow/Cargo.toml b/serde_arrow/Cargo.toml index 5451b00c..76ffd592 100644 --- a/serde_arrow/Cargo.toml +++ b/serde_arrow/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "serde_arrow" -version = "0.11.6" +version = "0.11.7" authors = ["Christopher Prohm "] edition = "2021" description = "Convert sequences of Rust objects to Arrow arrays and back again"