From c13f7e314cba95a59cd89e0c7b3ec03c1afe83e7 Mon Sep 17 00:00:00 2001 From: ritchie Date: Mon, 20 May 2024 09:41:56 +0200 Subject: [PATCH 1/3] feat: Raise when join on the same keys twice --- crates/polars-ops/src/frame/join/args.rs | 20 +++++++++++++++----- crates/polars-ops/src/frame/join/mod.rs | 6 +++++- py-polars/tests/unit/operations/test_join.py | 7 +++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index 148c46ce7953..987c7e8ba18c 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -185,12 +185,22 @@ impl JoinValidation { } } - pub(super) fn is_valid_join(&self, join_type: &JoinType) -> PolarsResult<()> { - if !self.needs_checks() { - return Ok(()); - } - polars_ensure!(matches!(join_type, JoinType::Inner | JoinType::Outer{..} | JoinType::Left), + pub(super) fn is_valid_join<'a, L: Iterator, R: Iterator>( + &'a self, + join_type: &JoinType, + names_left: L, + names_right: R, + ) -> PolarsResult<()> { + if self.needs_checks() { + polars_ensure!(matches!(join_type, JoinType::Inner | JoinType::Outer{..} | JoinType::Left), ComputeError: "{self} validation on a {join_type} join is not supported"); + } + + let mut joined_on = PlHashSet::new(); + for (l, r) in names_left.zip(names_right) { + polars_ensure!(joined_on.insert((l, r)), InvalidOperation: "already joined on {} and {}", l, r) + } + Ok(()) } diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 0ac9de8976c7..b0b64ee4e2e0 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -115,7 +115,11 @@ pub trait DataFrameJoinOps: IntoDf { _verbose: bool, ) -> PolarsResult { let left_df = self.to_df(); - args.validation.is_valid_join(&args.how)?; + args.validation.is_valid_join( + &args.how, + selected_left.iter().map(|s| s.name()), + selected_right.iter().map(|s| s.name()), + )?; let should_coalesce = args.coalesce.coalesce(&args.how); diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index d2f27d4eab06..3722cdc35608 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -1002,3 +1002,10 @@ def test_join_empties(how: str) -> None: df = df1.join(df2, on="col2", how=how) assert df.height == 0 + + +def test_join_raise_on_redundant_keys() -> None: + left = pl.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5], "c": [5, 6, 7]}) + right = pl.DataFrame({"a": [2, 3, 4], "c": [4, 5, 6]}) + with pytest.raises(pl.InvalidOperationError, match="already joined on"): + left.join(right, on=["a", "a"], how="outer_coalesce") From 5d5e4dbb59effb81e9dee0170aea504a1afcc796 Mon Sep 17 00:00:00 2001 From: ritchie Date: Mon, 20 May 2024 09:56:37 +0200 Subject: [PATCH 2/3] move more checks to conversion phase --- crates/polars-ops/src/frame/join/args.rs | 20 +++++-------------- crates/polars-ops/src/frame/join/mod.rs | 16 --------------- .../src/logical_plan/conversion/dsl_to_ir.rs | 17 ++++++++++++++++ 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/crates/polars-ops/src/frame/join/args.rs b/crates/polars-ops/src/frame/join/args.rs index 987c7e8ba18c..ff57c1fb140e 100644 --- a/crates/polars-ops/src/frame/join/args.rs +++ b/crates/polars-ops/src/frame/join/args.rs @@ -185,22 +185,12 @@ impl JoinValidation { } } - pub(super) fn is_valid_join<'a, L: Iterator, R: Iterator>( - &'a self, - join_type: &JoinType, - names_left: L, - names_right: R, - ) -> PolarsResult<()> { - if self.needs_checks() { - polars_ensure!(matches!(join_type, JoinType::Inner | JoinType::Outer{..} | JoinType::Left), - ComputeError: "{self} validation on a {join_type} join is not supported"); - } - - let mut joined_on = PlHashSet::new(); - for (l, r) in names_left.zip(names_right) { - polars_ensure!(joined_on.insert((l, r)), InvalidOperation: "already joined on {} and {}", l, r) + pub fn is_valid_join(&self, join_type: &JoinType) -> PolarsResult<()> { + if !self.needs_checks() { + return Ok(()); } - + polars_ensure!(matches!(join_type, JoinType::Inner | JoinType::Outer{..} | JoinType::Left), + ComputeError: "{self} validation on a {join_type} join is not supported"); Ok(()) } diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index b0b64ee4e2e0..2082ffdda2c7 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -115,12 +115,6 @@ pub trait DataFrameJoinOps: IntoDf { _verbose: bool, ) -> PolarsResult { let left_df = self.to_df(); - args.validation.is_valid_join( - &args.how, - selected_left.iter().map(|s| s.name()), - selected_right.iter().map(|s| s.name()), - )?; - let should_coalesce = args.coalesce.coalesce(&args.how); #[cfg(feature = "cross_join")] @@ -167,16 +161,6 @@ pub trait DataFrameJoinOps: IntoDf { } } - polars_ensure!( - selected_left.len() == selected_right.len(), - ComputeError: - format!( - "the number of columns given as join key (left: {}, right:{}) should be equal", - selected_left.len(), - selected_right.len() - ) - ); - if let Some((l, r)) = selected_left .iter() .zip(&selected_right) diff --git a/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs b/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs index b5ba674f45d7..72ce0d2b0f56 100644 --- a/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs +++ b/crates/polars-plan/src/logical_plan/conversion/dsl_to_ir.rs @@ -324,6 +324,23 @@ pub fn to_alp_impl( } } + let mut joined_on = PlHashSet::new(); + for (l, r) in left_on.iter().zip(right_on.iter()) { + polars_ensure!(joined_on.insert((l, r)), InvalidOperation: "joins on same keys twice; already joined on {} and {}", l, r) + } + drop(joined_on); + options.args.validation.is_valid_join(&options.args.how)?; + + polars_ensure!( + left_on.len() == right_on.len(), + ComputeError: + format!( + "the number of columns given as join key (left: {}, right:{}) should be equal", + left_on.len(), + right_on.len() + ) + ); + let input_left = to_alp_impl(owned(input_left), expr_arena, lp_arena, convert) .map_err(|e| e.context(failed_input!(join left)))?; let input_right = to_alp_impl(owned(input_right), expr_arena, lp_arena, convert) From 21dd6c948979ea1531e8abefa6552f5ea64f7ad4 Mon Sep 17 00:00:00 2001 From: ritchie Date: Mon, 20 May 2024 10:12:08 +0200 Subject: [PATCH 3/3] only assert on impl --- crates/polars-ops/src/frame/join/mod.rs | 1 + crates/polars/tests/it/core/joins.rs | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/polars-ops/src/frame/join/mod.rs b/crates/polars-ops/src/frame/join/mod.rs index 2082ffdda2c7..ffbb1858c0a4 100644 --- a/crates/polars-ops/src/frame/join/mod.rs +++ b/crates/polars-ops/src/frame/join/mod.rs @@ -116,6 +116,7 @@ pub trait DataFrameJoinOps: IntoDf { ) -> PolarsResult { let left_df = self.to_df(); let should_coalesce = args.coalesce.coalesce(&args.how); + assert_eq!(selected_left.len(), selected_right.len()); #[cfg(feature = "cross_join")] if let JoinType::Cross = args.how { diff --git a/crates/polars/tests/it/core/joins.rs b/crates/polars/tests/it/core/joins.rs index 1fd9a58303c9..d4240e5bfa3d 100644 --- a/crates/polars/tests/it/core/joins.rs +++ b/crates/polars/tests/it/core/joins.rs @@ -420,10 +420,6 @@ fn test_join_err() -> PolarsResult<()> { assert!(df1 .join(&df2, vec!["a", "b"], vec!["a", "b"], JoinType::Left.into()) .is_err()); - // length of join keys don't match error - assert!(df1 - .join(&df2, vec!["a"], vec!["a", "b"], JoinType::Left.into()) - .is_err()); Ok(()) }