From e69118eabea8a3c393b02855476e36d10c1ed87f Mon Sep 17 00:00:00 2001
From: Alexander Beedie
Date: Mon, 2 Sep 2024 17:49:39 +0400
Subject: [PATCH] fix(python): Address incorrect `align_frames` result when frames contain NULL values

---
Note: this patch passes `join_nulls=True` to the pairwise join that
`align_frames` folds over its input frames, so rows whose `align_on` key is
null are matched with each other; `test_align_frames_with_nulls` pins the
expected result. The line breaks and indentation below were reconstructed
from a copy of the patch whose newlines had been collapsed; the hunk line
counts agree with the `@@` headers, but indentation should be confirmed
against the upstream commit before applying.

 py-polars/polars/functions/eager.py           |  9 ++++++++-
 .../tests/unit/functions/test_functions.py    | 19 +++++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/py-polars/polars/functions/eager.py b/py-polars/polars/functions/eager.py
index a841520be8c5..e8cbb00e3dca 100644
--- a/py-polars/polars/functions/eager.py
+++ b/py-polars/polars/functions/eager.py
@@ -273,7 +273,14 @@ def join_func(
         idx_y: tuple[int, LazyFrame],
     ) -> tuple[int, LazyFrame]:
         (_, x), (y_idx, y) = idx_x, idx_y
-        return y_idx, x.join(y, how=how, on=align_on, suffix=f":{y_idx}", coalesce=True)
+        return y_idx, x.join(
+            y,
+            how=how,
+            on=align_on,
+            suffix=f":{y_idx}",
+            join_nulls=True,
+            coalesce=True,
+        )
 
     joined = reduce(join_func, idx_frames)[1].sort(by=align_on, descending=descending)
     if post_align_collect:
diff --git a/py-polars/tests/unit/functions/test_functions.py b/py-polars/tests/unit/functions/test_functions.py
index a5ee29ae530f..de7e49574393 100644
--- a/py-polars/tests/unit/functions/test_functions.py
+++ b/py-polars/tests/unit/functions/test_functions.py
@@ -254,7 +254,7 @@ def test_align_frames() -> None:
     assert_frame_equal(pl_dot, pl.from_pandas(pd_dot))
     pd.testing.assert_frame_equal(pd_dot, pl_dot.to_pandas())
 
-    # (also: confirm alignment function works with lazyframes)
+    # confirm alignment function works with lazy frames
     lf1, lf2 = pl.align_frames(
         pl.from_pandas(pdf1.reset_index()).lazy(),
         pl.from_pandas(pdf2.reset_index()).lazy(),
@@ -264,7 +264,7 @@ def test_align_frames() -> None:
     assert_frame_equal(lf1.collect(), pf1)
     assert_frame_equal(lf2.collect(), pf2)
 
-    # misc
+    # misc: no frames results in an empty list
     assert pl.align_frames(on="date") == []
 
     # expected error condition
@@ -275,6 +275,8 @@ def test_align_frames() -> None:
             on="date",
         )
 
+
+def test_align_frames_misc() -> None:
     # descending result
     df1 = pl.DataFrame([[3, 5, 6], [5, 8, 9]], orient="row")
     df2 = pl.DataFrame([[2, 5, 6], [3, 8, 9], [4, 2, 0]], orient="row")
@@ -290,6 +292,19 @@ def test_align_frames() -> None:
     assert pf.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]
 
 
+def test_align_frames_with_nulls() -> None:
+    df1 = pl.DataFrame({"key": ["x", "y", None], "value": [1, 2, 0]})
+    df2 = pl.DataFrame({"key": ["x", None, "z", "y"], "value": [4, 3, 6, 5]})
+
+    a1, a2 = pl.align_frames(df1, df2, on="key")
+
+    aligned_frame_data = a1.to_dict(as_series=False), a2.to_dict(as_series=False)
+    assert aligned_frame_data == (
+        {"key": [None, "x", "y", "z"], "value": [0, 1, 2, None]},
+        {"key": [None, "x", "y", "z"], "value": [3, 4, 5, 6]},
+    )
+
+
 def test_align_frames_duplicate_key() -> None:
     # setup some test frames with duplicate key/alignment values
     df1 = pl.DataFrame({"x": ["a", "a", "a", "e"], "y": [1, 2, 4, 5]})