From 349354b44da92d04244b0c906ee1b12aac505bc8 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Fri, 27 Jan 2023 15:37:25 +0100 Subject: [PATCH] feat(rust, python)!: error on string <-> date cmp --- .../src/logical_plan/optimizer/mod.rs | 4 +- .../logical_plan/optimizer/simplify_expr.rs | 79 +++++++------ .../optimizer/slice_pushdown_expr.rs | 9 +- .../src/logical_plan/optimizer/stack_opt.rs | 12 +- .../logical_plan/optimizer/type_coercion.rs | 110 +++++++++++------- polars/polars-lazy/src/tests/queries.rs | 9 +- py-polars/polars/internals/expr/expr.py | 2 + py-polars/tests/unit/test_errors.py | 17 +++ py-polars/tests/unit/test_rolling.py | 3 +- 9 files changed, 153 insertions(+), 92 deletions(-) diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs index 6ffe93515924..d532bad51df9 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/mod.rs @@ -152,7 +152,7 @@ pub fn optimize( // we must clean up the predicates, because the agg_scan_projection // uses them in the hashtable to determine duplicates. let simplify_bools = &mut [Box::new(SimplifyBooleanRule {}) as Box]; - lp_top = opt.optimize_loop(simplify_bools, expr_arena, lp_arena, lp_top); + lp_top = opt.optimize_loop(simplify_bools, expr_arena, lp_arena, lp_top)?; // scan the LP to aggregate all the column used in scans // these columns will be added to the state of the AggScanProjection rule @@ -176,7 +176,7 @@ pub fn optimize( rules.push(Box::new(ReplaceDropNulls {})); - lp_top = opt.optimize_loop(&mut rules, expr_arena, lp_arena, lp_top); + lp_top = opt.optimize_loop(&mut rules, expr_arena, lp_arena, lp_top)?; // during debug we check if the optimizations have not modified the final schema #[cfg(debug_assertions)] diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/simplify_expr.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/simplify_expr.rs index 616844716d7f..93b90d081434 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/simplify_expr.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/simplify_expr.rs @@ -10,7 +10,7 @@ use crate::prelude::function_expr::FunctionExpr; macro_rules! eval_binary_same_type { ($lhs:expr, $operand: tt, $rhs:expr) => {{ if let (AExpr::Literal(lit_left), AExpr::Literal(lit_right)) = ($lhs, $rhs) { - return match (lit_left, lit_right) { + match (lit_left, lit_right) { (LiteralValue::Float32(x), LiteralValue::Float32(y)) => { Some(AExpr::Literal(LiteralValue::Float32(x $operand y))) } @@ -46,9 +46,10 @@ macro_rules! eval_binary_same_type { Some(AExpr::Literal(LiteralValue::UInt64(x $operand y))) } _ => None, - }; + } + } else { + None } - None }} } @@ -56,7 +57,7 @@ macro_rules! eval_binary_same_type { macro_rules! eval_binary_bool_type { ($lhs:expr, $operand: tt, $rhs:expr) => {{ if let (AExpr::Literal(lit_left), AExpr::Literal(lit_right)) = ($lhs, $rhs) { - return match (lit_left, lit_right) { + match (lit_left, lit_right) { (LiteralValue::Float32(x), LiteralValue::Float32(y)) => { Some(AExpr::Literal(LiteralValue::Boolean(x $operand y))) } @@ -95,9 +96,10 @@ macro_rules! eval_binary_bool_type { Some(AExpr::Literal(LiteralValue::Boolean(x $operand y))) } _ => None, - }; + } + } else { + None } - None }} } @@ -111,9 +113,9 @@ impl OptimizationRule for SimplifyBooleanRule { expr_node: Node, _: &Arena, _: Node, - ) -> Option { + ) -> PolarsResult> { let expr = expr_arena.get(expr_node); - match expr { + let out = match expr { // true AND x => x AExpr::BinaryExpr { left, @@ -273,7 +275,8 @@ impl OptimizationRule for SimplifyBooleanRule { } } _ => None, - } + }; + Ok(out) } } @@ -436,10 +439,10 @@ impl OptimizationRule for SimplifyExprRule { expr_node: Node, _lp_arena: &Arena, _lp_node: Node, - ) -> Option { + ) -> PolarsResult> { let expr = expr_arena.get(expr_node); - match expr { + let out = match expr { // lit(left) + lit(right) => lit(left + right) // and null propagation AExpr::BinaryExpr { left, op, right } => { @@ -449,28 +452,30 @@ impl OptimizationRule for SimplifyExprRule { // lit(left) + lit(right) => lit(left + right) #[allow(clippy::manual_map)] let out = match op { - Operator::Plus => match eval_binary_same_type!(left_aexpr, +, right_aexpr) { - Some(new) => Some(new), - None => { - // try to replace addition of string columns with `concat_str` - #[cfg(all(feature = "strings", feature = "concat_str"))] - { - string_addition_to_linear_concat( - _lp_arena, - _lp_node, - expr_arena, - *left, - *right, - left_aexpr, - right_aexpr, - ) - } - #[cfg(not(all(feature = "strings", feature = "concat_str")))] - { - None + Operator::Plus => { + match dbg!(eval_binary_same_type!(left_aexpr, +, right_aexpr)) { + Some(new) => Some(new), + None => { + // try to replace addition of string columns with `concat_str` + #[cfg(all(feature = "strings", feature = "concat_str"))] + { + string_addition_to_linear_concat( + _lp_arena, + _lp_node, + expr_arena, + *left, + *right, + left_aexpr, + right_aexpr, + ) + } + #[cfg(not(all(feature = "strings", feature = "concat_str")))] + { + None + } } } - }, + } Operator::Minus => eval_binary_same_type!(left_aexpr, -, right_aexpr), Operator::Multiply => eval_binary_same_type!(left_aexpr, *, right_aexpr), Operator::Divide => eval_binary_same_type!(left_aexpr, /, right_aexpr), @@ -478,7 +483,7 @@ impl OptimizationRule for SimplifyExprRule { if let (AExpr::Literal(lit_left), AExpr::Literal(lit_right)) = (left_aexpr, right_aexpr) { - return match (lit_left, lit_right) { + match (lit_left, lit_right) { (LiteralValue::Float32(x), LiteralValue::Float32(y)) => { Some(AExpr::Literal(LiteralValue::Float32(x / y))) } @@ -514,9 +519,10 @@ impl OptimizationRule for SimplifyExprRule { AExpr::Literal(LiteralValue::Float64(*x as f64 / *y as f64)), ), _ => None, - }; + } + } else { + None } - None } Operator::FloorDivide => None, Operator::Modulus => eval_binary_same_type!(left_aexpr, %, right_aexpr), @@ -531,7 +537,7 @@ impl OptimizationRule for SimplifyExprRule { Operator::Xor => eval_bitwise(left_aexpr, right_aexpr, |l, r| l ^ r), }; if out.is_some() { - return out; + return Ok(out); } // Null propagation. @@ -644,7 +650,8 @@ impl OptimizationRule for SimplifyExprRule { } _ => None, - } + }; + Ok(out) } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs index bf10b9e1786b..b2ac68a32278 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/slice_pushdown_expr.rs @@ -15,7 +15,7 @@ impl OptimizationRule for SlicePushDown { expr_node: Node, _lp_arena: &Arena, _lp_node: Node, - ) -> Option { + ) -> PolarsResult> { if let AExpr::Slice { input, offset, @@ -26,7 +26,7 @@ impl OptimizationRule for SlicePushDown { let length = *length; use AExpr::*; - match expr_arena.get(*input) { + let out = match expr_arena.get(*input) { m @ Alias(..) | m @ Cast { .. } => { let m = m.clone(); let input = m.get_input(); @@ -94,9 +94,10 @@ impl OptimizationRule for SlicePushDown { } } _ => None, - } + }; + Ok(out) } else { - None + Ok(None) } } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/stack_opt.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/stack_opt.rs index 7fd892aa43a6..43f77472c45f 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/stack_opt.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/stack_opt.rs @@ -1,3 +1,5 @@ +use polars_core::prelude::PolarsResult; + use crate::logical_plan::aexpr::AExpr; use crate::logical_plan::alp::ALogicalPlan; use crate::prelude::{Arena, Node}; @@ -12,7 +14,7 @@ impl StackOptimizer { expr_arena: &mut Arena, lp_arena: &mut Arena, lp_top: Node, - ) -> Node { + ) -> PolarsResult { let mut changed = true; let mut plans = Vec::with_capacity(32); @@ -50,7 +52,7 @@ impl StackOptimizer { current_expr_node, lp_arena, current_node, - ) { + )? { expr_arena.replace(current_expr_node, x); changed = true; } @@ -62,7 +64,7 @@ impl StackOptimizer { } } } - lp_top + Ok(lp_top) } } @@ -86,7 +88,7 @@ pub trait OptimizationRule { _expr_node: Node, _lp_arena: &Arena, _lp_node: Node, - ) -> Option { - None + ) -> PolarsResult> { + Ok(None) } } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion.rs index 7b2725a016f7..29b1ffe79228 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/type_coercion.rs @@ -10,6 +10,15 @@ use crate::utils::is_scan; pub struct TypeCoercionRule {} +macro_rules! unpack { + ($packed:expr) => {{ + match $packed { + Some(payload) => payload, + None => return Ok(None), + } + }}; +} + /// determine if we use the supertype or not. For instance when we have a column Int64 and we compare with literal UInt32 /// it would be wasteful to cast the column instead of the literal. fn modify_supertype( @@ -117,9 +126,20 @@ fn get_aexpr_and_type<'a>( )) } -fn print_date_str_comparison_warning() { - eprintln!("Warning: Comparing date/datetime/time column to string value, this will lead to string comparison and is unlikely what you want.\n\ - If this is intended, consider using an explicit cast to silence this warning.") +#[cfg(feature = "python")] +fn err_date_str_compare() -> PolarsResult<()> { + Err(PolarsError::ComputeError( + "Cannot compare 'date/datetime/time' to a string value.\n\ + Create native python {{ 'date', 'datetime', 'time' }} or compare to a temporal column." + .into(), + )) +} + +#[cfg(not(feature = "python"))] +fn err_date_str_compare() -> PolarsResult<()> { + Err(PolarsError::ComputeError( + "Cannot compare 'date/datetime/time' to a string value.".into(), + )) } impl OptimizationRule for TypeCoercionRule { @@ -129,9 +149,9 @@ impl OptimizationRule for TypeCoercionRule { expr_node: Node, lp_arena: &Arena, lp_node: Node, - ) -> Option { + ) -> PolarsResult> { let expr = expr_arena.get(expr_node); - match *expr { + let out = match *expr { AExpr::Ternary { truthy: truthy_node, falsy: falsy_node, @@ -139,12 +159,12 @@ impl OptimizationRule for TypeCoercionRule { } => { let input_schema = get_schema(lp_arena, lp_node); let (truthy, type_true) = - get_aexpr_and_type(expr_arena, truthy_node, &input_schema)?; + unpack!(get_aexpr_and_type(expr_arena, truthy_node, &input_schema)); let (falsy, type_false) = - get_aexpr_and_type(expr_arena, falsy_node, &input_schema)?; + unpack!(get_aexpr_and_type(expr_arena, falsy_node, &input_schema)); - early_escape(&type_true, &type_false)?; - let st = get_supertype(&type_true, &type_false)?; + unpack!(early_escape(&type_true, &type_false)); + let st = unpack!(get_supertype(&type_true, &type_false)); let st = modify_supertype(st, truthy, falsy, &type_true, &type_false); // only cast if the type is not already the super type. @@ -183,11 +203,12 @@ impl OptimizationRule for TypeCoercionRule { right: node_right, } => { let input_schema = get_schema(lp_arena, lp_node); - let (left, type_left) = get_aexpr_and_type(expr_arena, node_left, &input_schema)?; + let (left, type_left) = + unpack!(get_aexpr_and_type(expr_arena, node_left, &input_schema)); let (right, type_right) = - get_aexpr_and_type(expr_arena, node_right, &input_schema)?; - early_escape(&type_left, &type_right)?; + unpack!(get_aexpr_and_type(expr_arena, node_right, &input_schema)); + unpack!(early_escape(&type_left, &type_right)); // don't coerce string with number comparisons. They must error match (&type_left, &type_right, op) { @@ -195,30 +216,30 @@ impl OptimizationRule for TypeCoercionRule { (DataType::Utf8, dt, op) | (dt, DataType::Utf8, op) if op.is_comparison() && dt.is_numeric() => { - return None + return Ok(None) } #[cfg(feature = "dtype-categorical")] (DataType::Utf8 | DataType::Categorical(_), dt, op) | (dt, DataType::Utf8 | DataType::Categorical(_), op) if op.is_comparison() && dt.is_numeric() => { - return None + return Ok(None) } #[cfg(feature = "dtype-date")] (DataType::Date, DataType::Utf8, op) if op.is_comparison() => { - print_date_str_comparison_warning() + err_date_str_compare()? } #[cfg(feature = "dtype-datetime")] (DataType::Datetime(_, _), DataType::Utf8, op) if op.is_comparison() => { - print_date_str_comparison_warning() + err_date_str_compare()? } #[cfg(feature = "dtype-time")] (DataType::Time, DataType::Utf8, op) if op.is_comparison() => { - print_date_str_comparison_warning() + err_date_str_compare()? } // structs can be arbitrarily nested, leave the complexity to the caller for now. #[cfg(feature = "dtype-struct")] - (DataType::Struct(_), DataType::Struct(_), _op) => return None, + (DataType::Struct(_), DataType::Struct(_), _op) => return Ok(None), _ => {} } @@ -266,13 +287,13 @@ impl OptimizationRule for TypeCoercionRule { strict: false, }); - Some(AExpr::BinaryExpr { + Ok(Some(AExpr::BinaryExpr { left: node_left, op, right: new_node_right, - }) + })) } else { - None + Ok(None) }; } (_, DataType::List(inner)) => { @@ -283,13 +304,13 @@ impl OptimizationRule for TypeCoercionRule { strict: false, }); - Some(AExpr::BinaryExpr { + Ok(Some(AExpr::BinaryExpr { left: new_node_left, op, right: node_right, - }) + })) } else { - None + Ok(None) }; } _ => unreachable!(), @@ -302,7 +323,7 @@ impl OptimizationRule for TypeCoercionRule { { None } else { - let st = get_supertype(&type_left, &type_right)?; + let st = unpack!(get_supertype(&type_left, &type_right)); let mut st = modify_supertype(st, left, right, &type_left, &type_right); #[allow(unused_mut, unused_assignments)] @@ -358,10 +379,12 @@ impl OptimizationRule for TypeCoercionRule { } => { let input_schema = get_schema(lp_arena, lp_node); let other_node = input[1]; - let (_, type_left) = get_aexpr_and_type(expr_arena, input[0], &input_schema)?; - let (_, type_other) = get_aexpr_and_type(expr_arena, other_node, &input_schema)?; + let (_, type_left) = + unpack!(get_aexpr_and_type(expr_arena, input[0], &input_schema)); + let (_, type_other) = + unpack!(get_aexpr_and_type(expr_arena, other_node, &input_schema)); - early_escape(&type_left, &type_other)?; + unpack!(early_escape(&type_left, &type_other)); let casted_expr = match (&type_left, &type_other) { // cast both local and global string cache @@ -375,9 +398,9 @@ impl OptimizationRule for TypeCoercionRule { strict: false, } } - (DataType::List(_), _) | (_, DataType::List(_)) => return None, + (DataType::List(_), _) | (_, DataType::List(_)) => return Ok(None), #[cfg(feature = "dtype-struct")] - (DataType::Struct(_), _) | (_, DataType::Struct(_)) => return None, + (DataType::Struct(_), _) | (_, DataType::Struct(_)) => return Ok(None), // if right is another type, we cast it to left // we do not use super-type as an `is_in` operation should not // cast the whole column implicitly. @@ -390,7 +413,7 @@ impl OptimizationRule for TypeCoercionRule { } } // types are equal, do nothing - _ => return None, + _ => return Ok(None), }; let mut input = input.clone(); @@ -412,11 +435,12 @@ impl OptimizationRule for TypeCoercionRule { } => { let input_schema = get_schema(lp_arena, lp_node); let other_node = input[1]; - let (left, type_left) = get_aexpr_and_type(expr_arena, input[0], &input_schema)?; + let (left, type_left) = + unpack!(get_aexpr_and_type(expr_arena, input[0], &input_schema)); let (fill_value, type_fill_value) = - get_aexpr_and_type(expr_arena, other_node, &input_schema)?; + unpack!(get_aexpr_and_type(expr_arena, other_node, &input_schema)); - let new_st = get_supertype(&type_left, &type_fill_value)?; + let new_st = unpack!(get_supertype(&type_left, &type_fill_value)); let new_st = modify_supertype(new_st, left, fill_value, &type_left, &type_fill_value); if &new_st != super_type { @@ -443,16 +467,16 @@ impl OptimizationRule for TypeCoercionRule { let input_schema = get_schema(lp_arena, lp_node); let self_node = input[0]; let (self_ae, type_self) = - get_aexpr_and_type(expr_arena, self_node, &input_schema)?; + unpack!(get_aexpr_and_type(expr_arena, self_node, &input_schema)); let mut super_type = type_self.clone(); for other in &input[1..] { let (other, type_other) = - get_aexpr_and_type(expr_arena, *other, &input_schema)?; + unpack!(get_aexpr_and_type(expr_arena, *other, &input_schema)); // early return until Unknown is set - early_escape(&super_type, &type_other)?; - let new_st = get_supertype(&super_type, &type_other)?; + unpack!(early_escape(&super_type, &type_other)); + let new_st = unpack!(get_supertype(&super_type, &type_other)); super_type = modify_supertype(new_st, self_ae, other, &type_self, &type_other) } // only cast if the type is not already the super type. @@ -472,8 +496,11 @@ impl OptimizationRule for TypeCoercionRule { new_nodes.push(new_node_self); for other_node in &input[1..] { - let (_, type_other) = - get_aexpr_and_type(expr_arena, *other_node, &input_schema)?; + let type_other = + match get_aexpr_and_type(expr_arena, *other_node, &input_schema) { + Some((_, type_other)) => type_other, + None => return Ok(None), + }; let new_node_other = if type_other != super_type { expr_arena.add(AExpr::Cast { expr: *other_node, @@ -495,7 +522,8 @@ impl OptimizationRule for TypeCoercionRule { }) } _ => None, - } + }; + Ok(out) } } diff --git a/polars/polars-lazy/src/tests/queries.rs b/polars/polars-lazy/src/tests/queries.rs index a490925b0ed2..4a2865f80eb7 100644 --- a/polars/polars-lazy/src/tests/queries.rs +++ b/polars/polars-lazy/src/tests/queries.rs @@ -539,8 +539,11 @@ fn test_simplify_expr() { let optimizer = StackOptimizer {}; let mut lp_top = to_alp(plan, &mut expr_arena, &mut lp_arena).unwrap(); - lp_top = optimizer.optimize_loop(rules, &mut expr_arena, &mut lp_arena, lp_top); + lp_top = optimizer + .optimize_loop(rules, &mut expr_arena, &mut lp_arena, lp_top) + .unwrap(); let plan = node_to_lp(lp_top, &mut expr_arena, &mut lp_arena); + dbg!(&plan); assert!( matches!(plan, LogicalPlan::Projection{ expr, ..} if matches!(&expr[0], Expr::BinaryExpr{left, ..} if **left == Expr::Literal(LiteralValue::Float32(2.0)))) ); @@ -619,7 +622,9 @@ fn test_type_coercion() { let optimizer = StackOptimizer {}; let mut lp_top = to_alp(lp, &mut expr_arena, &mut lp_arena).unwrap(); - lp_top = optimizer.optimize_loop(rules, &mut expr_arena, &mut lp_arena, lp_top); + lp_top = optimizer + .optimize_loop(rules, &mut expr_arena, &mut lp_arena, lp_top) + .unwrap(); let lp = node_to_lp(lp_top, &mut expr_arena, &mut lp_arena); if let LogicalPlan::Projection { expr, .. } = lp { diff --git a/py-polars/polars/internals/expr/expr.py b/py-polars/polars/internals/expr/expr.py index 2626e509d7aa..e380c29bb690 100644 --- a/py-polars/polars/internals/expr/expr.py +++ b/py-polars/polars/internals/expr/expr.py @@ -3462,6 +3462,8 @@ def is_between( └─────┴────────────┘ """ + start = expr_to_lit_or_expr(start, str_to_lit=False) + end = expr_to_lit_or_expr(end, str_to_lit=False) if closed == "none": return ((self > start) & (self < end)).alias("is_between") elif closed == "both": diff --git a/py-polars/tests/unit/test_errors.py b/py-polars/tests/unit/test_errors.py index 9b32c713f50f..3d1e7b179917 100644 --- a/py-polars/tests/unit/test_errors.py +++ b/py-polars/tests/unit/test_errors.py @@ -386,3 +386,20 @@ def test_err_filter_no_expansion() -> None: ): # we filter by ints df.filter(pl.col(pl.Int16).min() < 0.1) + + +def test_date_string_comparison() -> None: + df = pl.DataFrame( + { + "date": [ + "2022-11-01", + "2022-11-02", + "2022-11-05", + ], + } + ).with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d")) + + with pytest.raises( + pl.ComputeError, match=r"Cannot compare 'date/datetime/time' to a string value" + ): + df.select(pl.col("date") > "2021-11-10") diff --git a/py-polars/tests/unit/test_rolling.py b/py-polars/tests/unit/test_rolling.py index 66ad9d8c48eb..6e6b782d90e2 100644 --- a/py-polars/tests/unit/test_rolling.py +++ b/py-polars/tests/unit/test_rolling.py @@ -509,11 +509,10 @@ def test_groupby_dynamic_by_monday_and_offset_5444() -> None: # test empty result_empty = ( - df.filter(pl.col("date") == "z") + df.filter(pl.col("date") == date(1, 1, 1)) .groupby_dynamic("date", every="1w", offset="1d", by="label", start_by="monday") .agg(pl.col("value").sum()) ) - print(result_empty, result) assert result_empty.schema == result.schema