Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(python): Address incorrect align_frames result when the alignment column contains NULL values #18521

Merged
merged 1 commit into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion py-polars/polars/functions/eager.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,14 @@ def join_func(
idx_y: tuple[int, LazyFrame],
) -> tuple[int, LazyFrame]:
(_, x), (y_idx, y) = idx_x, idx_y
return y_idx, x.join(y, how=how, on=align_on, suffix=f":{y_idx}", coalesce=True)
return y_idx, x.join(
y,
how=how,
on=align_on,
suffix=f":{y_idx}",
join_nulls=True,
coalesce=True,
)

joined = reduce(join_func, idx_frames)[1].sort(by=align_on, descending=descending)
if post_align_collect:
Expand Down
19 changes: 17 additions & 2 deletions py-polars/tests/unit/functions/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def test_align_frames() -> None:
assert_frame_equal(pl_dot, pl.from_pandas(pd_dot))
pd.testing.assert_frame_equal(pd_dot, pl_dot.to_pandas())

# (also: confirm alignment function works with lazyframes)
# confirm alignment function works with lazy frames
lf1, lf2 = pl.align_frames(
pl.from_pandas(pdf1.reset_index()).lazy(),
pl.from_pandas(pdf2.reset_index()).lazy(),
Expand All @@ -264,7 +264,7 @@ def test_align_frames() -> None:
assert_frame_equal(lf1.collect(), pf1)
assert_frame_equal(lf2.collect(), pf2)

# misc
# misc: no frames results in an empty list
assert pl.align_frames(on="date") == []

# expected error condition
Expand All @@ -275,6 +275,8 @@ def test_align_frames() -> None:
on="date",
)


def test_align_frames_misc() -> None:
# descending result
df1 = pl.DataFrame([[3, 5, 6], [5, 8, 9]], orient="row")
df2 = pl.DataFrame([[2, 5, 6], [3, 8, 9], [4, 2, 0]], orient="row")
Expand All @@ -290,6 +292,19 @@ def test_align_frames() -> None:
assert pf.rows() == [(5, None, None), (4, 2, 0), (3, 8, 9), (2, 5, 6)]


def test_align_frames_with_nulls() -> None:
    """Check `align_frames` output when the alignment column holds nulls.

    Both inputs carry a single null in "key"; the expected result has one
    shared null-key row (listed first) rather than separate unmatched rows,
    and the key missing from the first frame ("z") is null-filled there.
    """
    left = pl.DataFrame({"key": ["x", "y", None], "value": [1, 2, 0]})
    right = pl.DataFrame({"key": ["x", None, "z", "y"], "value": [4, 3, 6, 5]})

    aligned_left, aligned_right = pl.align_frames(left, right, on="key")

    # both aligned frames must expose the same key column, in the same order
    expected_keys = [None, "x", "y", "z"]
    expected_left = {"key": expected_keys, "value": [0, 1, 2, None]}
    expected_right = {"key": expected_keys, "value": [3, 4, 5, 6]}

    assert aligned_left.to_dict(as_series=False) == expected_left
    assert aligned_right.to_dict(as_series=False) == expected_right


def test_align_frames_duplicate_key() -> None:
# setup some test frames with duplicate key/alignment values
df1 = pl.DataFrame({"x": ["a", "a", "a", "e"], "y": [1, 2, 4, 5]})
Expand Down