Skip to content

Commit

Permalink
fix(python): Fix issue in DataFrame.__getitem__ with 2 column inputs
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored May 27, 2024
1 parent d856b49 commit cc2c905
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 82 deletions.
9 changes: 7 additions & 2 deletions py-polars/polars/_utils/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,14 +131,19 @@ def get_df_item_by_key(
# Two inputs, e.g. df[1, 2:5]
if isinstance(key, tuple) and len(key) == 2:
row_key, col_key = key

# Support df[True, False] and df["a", "b"] as these are not ambiguous
if isinstance(row_key, (bool, str)):
return _select_columns(df, key) # type: ignore[arg-type]

selection = _select_columns(df, col_key)

if selection.is_empty():
return selection
elif isinstance(selection, pl.Series):
return get_series_item_by_key(selection, row_key) # type: ignore[arg-type]
return get_series_item_by_key(selection, row_key)
else:
return _select_rows(selection, row_key) # type: ignore[arg-type]
return _select_rows(selection, row_key)

# Single input, e.g. df[1]
elif isinstance(key, str):
Expand Down
71 changes: 0 additions & 71 deletions py-polars/tests/unit/dataframe/test_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,77 +131,6 @@ def test_selection() -> None:
assert_series_equal(df.to_series(1), pl.Series("b", [1.0, 2.0, 3.0]))
assert_series_equal(df.to_series(-1), pl.Series("c", ["a", "b", "c"]))

# select rows by slice and columns by slice / list of names
assert df[:2, :1].rows() == [(1,), (2,)]
assert df[:2, ["a"]].rows() == [(1,), (2,)]

# column selection by string(s) in first dimension
assert df["a"].to_list() == [1, 2, 3]
assert df["b"].to_list() == [1.0, 2.0, 3.0]
assert df["c"].to_list() == ["a", "b", "c"]

# row selection by integer(s) in first dimension
assert_frame_equal(df[0], pl.DataFrame({"a": [1], "b": [1.0], "c": ["a"]}))
assert_frame_equal(df[-1], pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]}))

# row, column selection when using two dimensions
assert df[:, "a"].to_list() == [1, 2, 3]
assert df[:, 1].to_list() == [1.0, 2.0, 3.0]
assert df[:2, 2].to_list() == ["a", "b"]

assert_frame_equal(
df[[1, 2]], pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]})
)
assert_frame_equal(
df[[-1, -2]], pl.DataFrame({"a": [3, 2], "b": [3.0, 2.0], "c": ["c", "b"]})
)

assert df[["a", "b"]].columns == ["a", "b"]
assert_frame_equal(
df[[1, 2], [1, 2]], pl.DataFrame({"b": [2.0, 3.0], "c": ["b", "c"]})
)
assert typing.cast(str, df[1, 2]) == "b"
assert typing.cast(float, df[1, 1]) == 2.0
assert typing.cast(int, df[2, 0]) == 3

assert df[[2], ["a", "b"]].rows() == [(3, 3.0)]
assert df.to_series(0).name == "a"
assert (df["a"] == df["a"]).sum() == 3
assert (df["c"] == df["a"].cast(str)).sum() == 0
assert df[:, "a":"b"].rows() == [(1, 1.0), (2, 2.0), (3, 3.0)] # type: ignore[index, misc]
assert df[:, "a":"c"].columns == ["a", "b", "c"] # type: ignore[index, misc]
assert df[:, []].shape == (0, 0)
expect = pl.DataFrame({"c": ["b"]})
assert_frame_equal(df[1, [2]], expect)
expect = pl.DataFrame({"b": [1.0, 3.0]})
assert_frame_equal(df[[0, 2], [1]], expect)
assert typing.cast(str, df[0, "c"]) == "a"
assert typing.cast(str, df[1, "c"]) == "b"
assert typing.cast(str, df[2, "c"]) == "c"
assert typing.cast(int, df[0, "a"]) == 1

# more slicing
expect = pl.DataFrame({"a": [3, 2, 1], "b": [3.0, 2.0, 1.0], "c": ["c", "b", "a"]})
assert_frame_equal(df[::-1], expect)
expect = pl.DataFrame({"a": [1, 2], "b": [1.0, 2.0], "c": ["a", "b"]})
assert_frame_equal(df[:-1], expect)

expect = pl.DataFrame({"a": [1, 3], "b": [1.0, 3.0], "c": ["a", "c"]})
assert_frame_equal(df[::2], expect)

# only allow boolean values in column position
df = pl.DataFrame(
{
"a": [1, 2],
"b": [2, 3],
"c": [3, 4],
}
)

assert df[:, [False, True, True]].columns == ["b", "c"]
assert df[:, pl.Series([False, True, True])].columns == ["b", "c"]
assert df[:, pl.Series([False, False, False])].columns == []


def test_mixed_sequence_selection() -> None:
df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
Expand Down
82 changes: 82 additions & 0 deletions py-polars/tests/unit/dataframe/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,13 @@ def test_df_getitem_col_single_index() -> None:
assert_series_equal(result, expected)


def test_df_getitem_col_two_entries() -> None:
    """Two string keys or two boolean keys act as a column selection."""
    frame = pl.DataFrame({"x": [1.0], "y": [1.0]})

    # Both columns requested by name: the whole frame comes back.
    picked_by_name = frame["x", "y"]
    assert_frame_equal(picked_by_name, frame)

    # Both columns kept by a boolean pair: again the whole frame.
    picked_by_mask = frame[True, True]
    assert_frame_equal(picked_by_mask, frame)


@pytest.mark.parametrize(
("input", "expected_cols"),
[
Expand Down Expand Up @@ -381,6 +388,81 @@ def test_df_getitem() -> None:
df[:, [True, False, True]]


def test_df_getitem2() -> None:
    """Check ``DataFrame.__getitem__`` across one- and two-key selections.

    Walks through column lookup by name, row lookup by integer, list and
    slice, combined ``frame[rows, cols]`` access, and boolean column masks.
    """
    frame = pl.DataFrame(
        {"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]}
    )

    # row slice paired with a column slice / list of column names
    assert frame[:2, :1].rows() == [(1,), (2,)]
    assert frame[:2, ["a"]].rows() == [(1,), (2,)]

    # a single string key picks a column
    for col_name, col_values in (
        ("a", [1, 2, 3]),
        ("b", [1.0, 2.0, 3.0]),
        ("c", ["a", "b", "c"]),
    ):
        assert frame[col_name].to_list() == col_values

    # a single integer key picks a row (negative counts from the end)
    first_row = pl.DataFrame({"a": [1], "b": [1.0], "c": ["a"]})
    assert_frame_equal(frame[0], first_row)
    last_row = pl.DataFrame({"a": [3], "b": [3.0], "c": ["c"]})
    assert_frame_equal(frame[-1], last_row)

    # two keys: a row slice plus one column given by name or position
    assert frame[:, "a"].to_list() == [1, 2, 3]
    assert frame[:, 1].to_list() == [1.0, 2.0, 3.0]
    assert frame[:2, 2].to_list() == ["a", "b"]

    # a list of integers picks rows in the order given
    rows_one_two = pl.DataFrame({"a": [2, 3], "b": [2.0, 3.0], "c": ["b", "c"]})
    assert_frame_equal(frame[[1, 2]], rows_one_two)
    rows_from_end = pl.DataFrame({"a": [3, 2], "b": [3.0, 2.0], "c": ["c", "b"]})
    assert_frame_equal(frame[[-1, -2]], rows_from_end)

    # a list of strings picks columns; two integer lists pick rows and columns
    assert frame[["a", "b"]].columns == ["a", "b"]
    lower_right = pl.DataFrame({"b": [2.0, 3.0], "c": ["b", "c"]})
    assert_frame_equal(frame[[1, 2], [1, 2]], lower_right)

    # integer row and integer column give a single cell value
    for row_idx, col_idx, cell in ((1, 2, "b"), (1, 1, 2.0), (2, 0, 3)):
        assert frame[row_idx, col_idx] == cell

    assert frame[[2], ["a", "b"]].rows() == [(3, 3.0)]
    assert frame.to_series(0).name == "a"
    assert (frame["a"] == frame["a"]).sum() == 3
    assert (frame["c"] == frame["a"].cast(str)).sum() == 0

    # column ranges expressed as string slices
    assert frame[:, "a":"b"].rows() == [(1, 1.0), (2, 2.0), (3, 3.0)]  # type: ignore[index, misc]
    assert frame[:, "a":"c"].columns == ["a", "b", "c"]  # type: ignore[index, misc]
    assert frame[:, []].shape == (0, 0)

    # a list in the column position keeps the result a frame
    single_cell_frame = pl.DataFrame({"c": ["b"]})
    assert_frame_equal(frame[1, [2]], single_cell_frame)
    two_row_frame = pl.DataFrame({"b": [1.0, 3.0]})
    assert_frame_equal(frame[[0, 2], [1]], two_row_frame)

    # integer row and column name give a single cell value
    for row_pos, name, wanted in (
        (0, "c", "a"),
        (1, "c", "b"),
        (2, "c", "c"),
        (0, "a", 1),
    ):
        assert frame[row_pos, name] == wanted

    # more slicing
    reversed_rows = pl.DataFrame(
        {"a": [3, 2, 1], "b": [3.0, 2.0, 1.0], "c": ["c", "b", "a"]}
    )
    assert_frame_equal(frame[::-1], reversed_rows)
    without_last = pl.DataFrame({"a": [1, 2], "b": [1.0, 2.0], "c": ["a", "b"]})
    assert_frame_equal(frame[:-1], without_last)

    every_other = pl.DataFrame({"a": [1, 3], "b": [1.0, 3.0], "c": ["a", "c"]})
    assert_frame_equal(frame[::2], every_other)

    # boolean masks in the column position select columns
    int_frame = pl.DataFrame({"a": [1, 2], "b": [2, 3], "c": [3, 4]})

    assert int_frame[:, [False, True, True]].columns == ["b", "c"]
    assert int_frame[:, pl.Series([False, True, True])].columns == ["b", "c"]
    assert int_frame[:, pl.Series([False, False, False])].columns == []


def test_df_getitem_5343() -> None:
# https://github.com/pola-rs/polars/issues/5343
df = pl.DataFrame(
Expand Down
9 changes: 0 additions & 9 deletions py-polars/tests/unit/test_errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,15 +530,6 @@ def test_window_size_validation() -> None:
df.with_columns(trailing_min=pl.col("x").rolling_min(window_size=-3))


def test_invalid_getitem_key_err() -> None:
    """Expect a ``TypeError`` when a frame is indexed with two string keys.

    NOTE(review): the ``get_df_item_by_key`` change shown in this same commit
    treats ``df["a", "b"]`` as an unambiguous column selection, so this
    expectation describes the behaviour before that change -- confirm whether
    the test should still exist.
    """
    frame = pl.DataFrame({"x": [1.0], "y": [1.0]})
    expected_message = "cannot treat Series of type String as indices"

    with pytest.raises(TypeError, match=expected_message):
        frame["x", "y"]


def test_invalid_group_by_arg() -> None:
df = pl.DataFrame({"a": [1]})
with pytest.raises(
Expand Down

0 comments on commit cc2c905

Please sign in to comment.