From f4383c2b9e999757828348221dd06dbb87b0bd05 Mon Sep 17 00:00:00 2001
From: Stijn de Gooijer <stijndegooijer@gmail.com>
Date: Mon, 8 Jan 2024 06:07:16 +0100
Subject: [PATCH] Improve docstrings and handle bad offsets

---
 py-polars/polars/dataframe/frame.py       | 29 +++++++++++++++++++----
 py-polars/polars/lazyframe/frame.py       | 29 +++++++++++++++++++----
 py-polars/tests/unit/dataframe/test_df.py | 22 +++++++++++++++++
 3 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
index bdf256af1b1c..666d88e82214 100644
--- a/py-polars/polars/dataframe/frame.py
+++ b/py-polars/polars/dataframe/frame.py
@@ -5207,14 +5207,19 @@ def pipe(
 
     def with_row_index(self, name: str = "index", offset: int = 0) -> Self:
         """
-        Add a column at index 0 that counts the rows.
+        Add a row index as the first column in the DataFrame.
 
         Parameters
         ----------
         name
-            Name of the column to add.
+            Name of the index column.
         offset
-            Start the row count at this offset. Default = 0
+            Start the index at this offset. Cannot be negative.
+
+        Notes
+        -----
+        The resulting column does not have any special properties. It is a regular
+        column of type `UInt32` (or `UInt64` in `polars-u64-idx`).
 
         Examples
         --------
@@ -5235,8 +5240,24 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self:
         │ 1     ┆ 3   ┆ 4   │
         │ 2     ┆ 5   ┆ 6   │
         └───────┴─────┴─────┘
+        >>> df.with_row_index("id", offset=1000)
+        shape: (3, 3)
+        ┌──────┬─────┬─────┐
+        │ id   ┆ a   ┆ b   │
+        │ ---  ┆ --- ┆ --- │
+        │ u32  ┆ i64 ┆ i64 │
+        ╞══════╪═════╪═════╡
+        │ 1000 ┆ 1   ┆ 2   │
+        │ 1001 ┆ 3   ┆ 4   │
+        │ 1002 ┆ 5   ┆ 6   │
+        └──────┴─────┴─────┘
         """
-        return self._from_pydf(self._df.with_row_index(name, offset))
+        try:
+            return self._from_pydf(self._df.with_row_index(name, offset))
+        except OverflowError:
+            issue = "negative" if offset < 0 else "greater than the maximum index value"
+            msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}"
+            raise ValueError(msg) from None
 
     @deprecate_function(
         "Use `with_row_index` instead."
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py
index d0594b5f21f4..d3ec947c4b7b 100644
--- a/py-polars/polars/lazyframe/frame.py
+++ b/py-polars/polars/lazyframe/frame.py
@@ -4565,20 +4565,25 @@ def approx_n_unique(self) -> Self:
 
     def with_row_index(self, name: str = "index", offset: int = 0) -> Self:
         """
-        Add a column at index 0 with the row number.
+        Add a row index as the first column in the LazyFrame.
 
         Parameters
         ----------
         name
-            Name of the column to add.
+            Name of the index column.
         offset
-            Start the row count at this offset.
+            Start the index at this offset. Cannot be negative.
 
         Warnings
         --------
         Using this function can have a negative effect on query performance.
         This may, for instance, block predicate pushdown optimization.
 
+        Notes
+        -----
+        The resulting column does not have any special properties. It is a regular
+        column of type `UInt32` (or `UInt64` in `polars-u64-idx`).
+
         Examples
         --------
         >>> lf = pl.LazyFrame(
@@ -4598,8 +4603,24 @@ def with_row_index(self, name: str = "index", offset: int = 0) -> Self:
         │ 1     ┆ 3   ┆ 4   │
         │ 2     ┆ 5   ┆ 6   │
         └───────┴─────┴─────┘
+        >>> lf.with_row_index("id", offset=1000).collect()
+        shape: (3, 3)
+        ┌──────┬─────┬─────┐
+        │ id   ┆ a   ┆ b   │
+        │ ---  ┆ --- ┆ --- │
+        │ u32  ┆ i64 ┆ i64 │
+        ╞══════╪═════╪═════╡
+        │ 1000 ┆ 1   ┆ 2   │
+        │ 1001 ┆ 3   ┆ 4   │
+        │ 1002 ┆ 5   ┆ 6   │
+        └──────┴─────┴─────┘
         """
-        return self._from_pyldf(self._ldf.with_row_index(name, offset))
+        try:
+            return self._from_pyldf(self._ldf.with_row_index(name, offset))
+        except OverflowError:
+            issue = "negative" if offset < 0 else "greater than the maximum index value"
+            msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}"
+            raise ValueError(msg) from None
 
     @deprecate_function(
         "Use `with_row_index` instead."
diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py
index 487b6bcf78d4..a09ac3512b98 100644
--- a/py-polars/tests/unit/dataframe/test_df.py
+++ b/py-polars/tests/unit/dataframe/test_df.py
@@ -1689,6 +1689,28 @@ def test_with_row_index() -> None:
     assert out["index"].to_list() == [0, 1, 2]
 
 
+def test_with_row_index_bad_offset() -> None:
+    df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
+
+    with pytest.raises(ValueError, match="cannot be negative"):
+        df.with_row_index(offset=-1)
+    with pytest.raises(
+        ValueError, match="cannot be greater than the maximum index value"
+    ):
+        df.with_row_index(offset=2**64)  # too large for u32 and u64 index builds
+
+
+def test_with_row_index_bad_offset_lazy() -> None:
+    lf = pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})
+
+    with pytest.raises(ValueError, match="cannot be negative"):
+        lf.with_row_index(offset=-1)
+    with pytest.raises(
+        ValueError, match="cannot be greater than the maximum index value"
+    ):
+        lf.with_row_index(offset=2**64)  # too large for u32 and u64 index builds
+
+
 def test_with_row_count_deprecated() -> None:
     df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]})