From f4383c2b9e999757828348221dd06dbb87b0bd05 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Mon, 8 Jan 2024 06:07:16 +0100 Subject: [PATCH] Improve docstrings and handle bad offsets --- py-polars/polars/dataframe/frame.py | 29 +++++++++++++++++++---- py-polars/polars/lazyframe/frame.py | 29 +++++++++++++++++++---- py-polars/tests/unit/dataframe/test_df.py | 22 +++++++++++++++++ 3 files changed, 72 insertions(+), 8 deletions(-) diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index bdf256af1b1c..666d88e82214 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5207,14 +5207,19 @@ def pipe( def with_row_index(self, name: str = "index", offset: int = 0) -> Self: """ - Add a column at index 0 that counts the rows. + Add a row index as the first column in the DataFrame. Parameters ---------- name - Name of the column to add. + Name of the index column. offset - Start the row count at this offset. Default = 0 + Start the index at this offset. Cannot be negative. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). Examples -------- @@ -5235,8 +5240,24 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self: │ 1 ┆ 3 ┆ 4 │ │ 2 ┆ 5 ┆ 6 │ └───────┴─────┴─────┘ + >>> df.with_row_index("id", offset=1000) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ """ - return self._from_pydf(self._df.with_row_index(name, offset)) + try: + return self._from_pydf(self._df.with_row_index(name, offset)) + except OverflowError: + issue = "negative" if offset < 0 else "greater than the maximum index value" + msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}" + raise ValueError(msg) from None @deprecate_function( "Use `with_row_index` instead." diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index d0594b5f21f4..d3ec947c4b7b 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -4565,20 +4565,25 @@ def approx_n_unique(self) -> Self: def with_row_index(self, name: str = "index", offset: int = 0) -> Self: """ - Add a column at index 0 with the row number. + Add a row index as the first column in the LazyFrame. Parameters ---------- name - Name of the column to add. + Name of the index column. offset - Start the row count at this offset. + Start the index at this offset. Cannot be negative. Warnings -------- Using this function can have a negative effect on query performance. This may, for instance, block predicate pushdown optimization. + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + Examples -------- >>> lf = pl.LazyFrame( @@ -4598,8 +4603,24 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self: │ 1 ┆ 3 ┆ 4 │ │ 2 ┆ 5 ┆ 6 │ └───────┴─────┴─────┘ + >>> lf.with_row_index("id", offset=1000).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ """ - return self._from_pyldf(self._ldf.with_row_index(name, offset)) + try: + return self._from_pyldf(self._ldf.with_row_index(name, offset)) + except OverflowError: + issue = "negative" if offset < 0 else "greater than the maximum index value" + msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}" + raise ValueError(msg) from None @deprecate_function( "Use `with_row_index` instead." diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 487b6bcf78d4..a09ac3512b98 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1689,6 +1689,28 @@ def test_with_row_index() -> None: assert out["index"].to_list() == [0, 1, 2] +def test_with_row_index_bad_offset() -> None: + df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) + + with pytest.raises(ValueError, match="cannot be negative"): + df.with_row_index(offset=-1) + with pytest.raises( + ValueError, match="cannot be greater than the maximum index value" + ): + df.with_row_index(offset=2**32) + + +def test_with_row_index_bad_offset_lazy() -> None: + lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) + + with pytest.raises(ValueError, match="cannot be negative"): + lf.with_row_index(offset=-1) + with pytest.raises( + ValueError, match="cannot be greater than the maximum index value" + ): + lf.with_row_index(offset=2**32) + + def test_with_row_count_deprecated() -> None: df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})