Skip to content

Commit

Permalink
fix(python): Make to_pandas() work for Dataframe and Series with dtype `Object` (#13910)

Browse files Browse the repository at this point in the history

Co-authored-by: Itamar Turner-Trauring <itamar@pythonspeed.com>
Co-authored-by: Stijn de Gooijer <stijndegooijer@gmail.com>
  • Loading branch information
3 people committed Jan 26, 2024
1 parent f3c4cc5 commit 79a0984
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 11 deletions.
63 changes: 59 additions & 4 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2266,9 +2266,6 @@ def to_pandas(
ham large_string[pyarrow]
dtype: object
"""
if not self.width: # 0x0 dataframe, cannot infer schema from batches
return pd.DataFrame()

if use_pyarrow_extension_array:
if parse_version(pd.__version__) < parse_version("1.5"):
msg = f'pandas>=1.5.0 is required for `to_pandas("use_pyarrow_extension_array=True")`, found Pandas {pd.__version__!r}'
Expand All @@ -2281,7 +2278,65 @@ def to_pandas(
else:
raise ModuleNotFoundError(msg)

record_batches = self._df.to_pandas()
# Object columns must be handled separately as Arrow does not convert them
# correctly
if Object in self.dtypes:
return self._to_pandas_with_object_columns(
use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
)

return self._to_pandas_without_object_columns(
self, use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
)

def _to_pandas_with_object_columns(
    self,
    *,
    use_pyarrow_extension_array: bool,
    **kwargs: Any,
) -> pd.DataFrame:
    """
    Convert to a pandas DataFrame when at least one column has dtype
    ``pl.Object``.

    Object columns cannot go through the Arrow path, so the frame is split:
    non-Object columns are converted in bulk, then each Object column is
    inserted individually via ``Series.to_pandas``.
    """
    # Partition column positions by whether their dtype is pl.Object.
    object_idxs = [i for i, dt in enumerate(self.dtypes) if dt == Object]
    other_idxs = [i for i, dt in enumerate(self.dtypes) if dt != Object]

    # Bulk-convert the non-Object columns (order preserved); an all-Object
    # frame starts from an empty pandas DataFrame instead.
    if other_idxs:
        pandas_df = self._to_pandas_without_object_columns(
            self[:, other_idxs],
            use_pyarrow_extension_array=use_pyarrow_extension_array,
            **kwargs,
        )
    else:
        pandas_df = pd.DataFrame()

    # Insert the Object columns in ascending position order. Processing them
    # left-to-right means every column to the left of the insertion point is
    # already present, so the original index is also the correct index in the
    # partially constructed pandas frame.
    for idx in object_idxs:
        pandas_df.insert(idx, self.columns[idx], self.to_series(idx).to_pandas())

    return pandas_df

def _to_pandas_without_object_columns(
self,
df: DataFrame,
*,
use_pyarrow_extension_array: bool,
**kwargs: Any,
) -> pd.DataFrame:
if not df.width: # Empty dataframe, cannot infer schema from batches
return pd.DataFrame()

record_batches = df._df.to_pandas()
tbl = pa.Table.from_batches(record_batches)
if use_pyarrow_extension_array:
return tbl.to_pandas(
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4492,6 +4492,10 @@ def to_pandas(
2 <NA>
Name: b, dtype: int64[pyarrow]
"""
if self.dtype == Object:
# Can't convert via PyArrow, so do it via NumPy:
return pd.Series(self.to_numpy(), dtype=object, name=self.name)

if use_pyarrow_extension_array:
if parse_version(pd.__version__) < (1, 5):
msg = f'pandas>=1.5.0 is required for `to_pandas("use_pyarrow_extension_array=True")`, found Pandas {pd.__version__}'
Expand Down
6 changes: 5 additions & 1 deletion py-polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,11 @@ impl PyDataFrame {
})
}

/// Create a `Vec` of PyArrow RecordBatch instances.
///
/// Note this will give bad results for columns with dtype `pl.Object`,
/// since those can't be converted correctly via PyArrow. The calling Python
/// code should make sure these are not included.
pub fn to_pandas(&mut self) -> PyResult<Vec<PyObject>> {
self.df.as_single_chunk_par();
Python::with_gil(|py| {
Expand All @@ -891,7 +896,6 @@ impl PyDataFrame {
})
.map(|(i, _)| i)
.collect::<Vec<_>>();

let rbs = self
.df
.iter_chunks(false)
Expand Down
42 changes: 42 additions & 0 deletions py-polars/tests/unit/interop/test_to_pandas.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
from __future__ import annotations

from datetime import date, datetime
from typing import Literal

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from hypothesis import given
from hypothesis.strategies import just, lists, one_of

import polars as pl

Expand Down Expand Up @@ -83,6 +88,32 @@ def test_cat_to_pandas(dtype: pl.DataType) -> None:
)


@given(
    column_type_names=lists(
        one_of(just("Object"), just("Int32")), min_size=1, max_size=8
    )
)
def test_object_to_pandas(column_type_names: list[Literal["Object", "Int32"]]) -> None:
    """
    Converting ``pl.Object`` dtype columns to Pandas is handled correctly.
    This edge case is handled with a separate code path than other data types,
    so we test it more thoroughly.
    """
    dtypes = [getattr(pl, name) for name in column_type_names]

    # One row per column: an opaque object() for Object columns, a
    # distinguishable negative integer for Int32 columns.
    data: dict[str, list] = {}
    for idx, dtype in enumerate(dtypes):
        data[f"col_{idx}"] = [object()] if dtype == pl.Object else [-idx]

    schema = {f"col_{idx}": dtype for idx, dtype in enumerate(dtypes)}
    df = pl.DataFrame(data, schema=schema)

    # Both the pyarrow-extension-array and plain paths must round-trip.
    for use_pyarrow in (True, False):
        result = df.to_pandas(use_pyarrow_extension_array=use_pyarrow)
        assert isinstance(result, pd.DataFrame)
        assert result.to_dict(orient="list") == data


def test_from_empty_pandas_with_dtypes() -> None:
df = pd.DataFrame(columns=["a", "b"])
df["a"] = df["a"].astype(str)
Expand Down Expand Up @@ -137,3 +168,14 @@ def test_to_pandas_datetime() -> None:
result_df = s.to_frame().to_pandas()
expected_df = expected_series.to_frame()
pd.testing.assert_frame_equal(result_df, expected_df)


@pytest.mark.parametrize("use_pyarrow_extension_array", [True, False])
def test_object_to_pandas_series(use_pyarrow_extension_array: bool) -> None:
    # A pl.Object Series must convert to an object-dtype pandas Series
    # regardless of the pyarrow flag (the Object path bypasses Arrow).
    values = [object(), [1, 2, 3]]
    result = pl.Series("a", values, dtype=pl.Object).to_pandas(
        use_pyarrow_extension_array=use_pyarrow_extension_array
    )
    expected = pd.Series(values, dtype=object, name="a")
    pd.testing.assert_series_equal(result, expected)
6 changes: 0 additions & 6 deletions py-polars/tests/unit/namespaces/test_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pytest

import polars as pl
from polars.exceptions import PolarsPanicError

# Calling `plot` the first time is slow
# https://github.com/pola-rs/polars/issues/13500
Expand Down Expand Up @@ -39,8 +38,3 @@ def test_series_hist() -> None:

def test_empty_dataframe() -> None:
    # Plotting a 0-row frame should complete without raising.
    empty = pl.DataFrame({"a": [], "b": []})
    empty.plot.scatter(x="a", y="b")


def test_unsupported_dtype() -> None:
with pytest.raises(PolarsPanicError):
pl.DataFrame({"a": [{1, 2}], "b": [4]}).plot.scatter(x="a", y="b")

0 comments on commit 79a0984

Please sign in to comment.