From 729f25a9c2170972ea4da5847b279cf81259aede Mon Sep 17 00:00:00 2001
From: Alexander Beedie
Date: Tue, 31 Jan 2023 16:53:22 +0400
Subject: [PATCH] fix(python): don't convert "ns"-precision temporal types via `pyarrow` (#6592)

---
 py-polars/polars/internals/dataframe/frame.py | 47 +++++++++++++------
 py-polars/tests/unit/test_datelike.py         | 19 ++++++++
 2 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
index da4e31348abd1..20060c34260d7 100644
--- a/py-polars/polars/internals/dataframe/frame.py
+++ b/py-polars/polars/internals/dataframe/frame.py
@@ -1873,9 +1873,13 @@ def to_dict(
 
     def to_dicts(self) -> list[dict[str, Any]]:
         """
-        Convert every row to a dictionary.
+        Convert every row to a dictionary of python-native values.
 
-        Note that this is slow.
+        Notes
+        -----
+        If you have ``ns``-precision temporal values you should be aware that python
+        natively only supports up to ``us``-precision; if this matters you should export
+        to a different format.
 
         Examples
         --------
@@ -6737,7 +6741,7 @@ def rows(self, named: Literal[True]) -> list[dict[str, Any]]:
 
     def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[dict[str, Any]]:
         """
-        Returns all data in the DataFrame as a list of rows.
+        Returns all data in the DataFrame as a list of rows of python-native values.
 
         Parameters
         ----------
@@ -6746,15 +6750,21 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[dict[str, An
             column name to row value. This is more expensive than returning a regular
             tuple, but allows for accessing values by column name.
 
-        Returns
-        -------
-        A list of tuples (default) or dictionaries of row values.
+        Notes
+        -----
+        If you have ``ns``-precision temporal values you should be aware that python
+        natively only supports up to ``us``-precision; if this matters you should export
+        to a different format.
 
         Warnings
         --------
         Row-iteration is not optimal as the underlying data is stored in columnar form;
         where possible, prefer export via one of the dedicated export/output methods.
 
+        Returns
+        -------
+        A list of tuples (default) or dictionaries of row values.
+
         Examples
         --------
         >>> df = pl.DataFrame(
@@ -6796,7 +6806,7 @@ def iter_rows(
         self, named: bool = False, buffer_size: int = 500
     ) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
         """
-        Returns an iterator over the rows of the DataFrame.
+        Returns an iterator over the DataFrame of rows of python-native values.
 
         Parameters
         ----------
@@ -6811,19 +6821,20 @@ def iter_rows(
             the speedup from using the buffer is significant (~2-4x). Setting this
             value to zero disables row buffering.
 
-        Returns
-        -------
-        An iterator of tuples (default) or dictionaries of row values.
+        Notes
+        -----
+        If you have ``ns``-precision temporal values you should be aware that python
+        natively only supports up to ``us``-precision; if this matters you should export
+        to a different format.
 
         Warnings
         --------
         Row iteration is not optimal as the underlying data is stored in columnar form;
         where possible, prefer export via one of the dedicated export/output methods.
 
-        Notes
-        -----
-        If you are planning to materialise all frame data at once you should prefer
-        calling ``rows()``, which will be faster.
+        Returns
+        -------
+        An iterator of tuples (default) or dictionaries of python row values.
 
         Examples
         --------
@@ -6849,9 +6860,15 @@ def iter_rows(
         # note: buffering rows results in a 2-4x speedup over individual calls
         # to ".row(i)", so it should only be disabled in extremely specific cases.
         if buffer_size:
+            load_pyarrow_dicts = (
+                named
+                and _PYARROW_AVAILABLE
+                # note: 'ns' precision instantiates values as pandas types - avoid
+                and not any((getattr(tp, "tu", None) == "ns") for tp in self.dtypes)
+            )
             for offset in range(0, self.height, buffer_size):
                 zerocopy_slice = self.slice(offset, buffer_size)
-                if named and _PYARROW_AVAILABLE:
+                if load_pyarrow_dicts:
                     yield from zerocopy_slice.to_arrow().to_batches()[0].to_pylist()
                 else:
                     rows_chunk = zerocopy_slice.rows(named=False)
diff --git a/py-polars/tests/unit/test_datelike.py b/py-polars/tests/unit/test_datelike.py
index 6cfeb2f33be32..7c947cb9044af 100644
--- a/py-polars/tests/unit/test_datelike.py
+++ b/py-polars/tests/unit/test_datelike.py
@@ -359,6 +359,25 @@ def test_timezone() -> None:
     assert_series_equal(s.cast(int), tz_s.cast(int))
 
 
+def test_to_dicts() -> None:
+    now = datetime.now()
+    data = {
+        "a": now,
+        "b": now.date(),
+        "c": now.time(),
+        "d": timedelta(days=1, seconds=43200),
+    }
+    df = pl.DataFrame(
+        data, schema_overrides={"a": pl.Datetime("ns"), "d": pl.Duration("ns")}
+    )
+    assert len(df) == 1
+
+    d = df.to_dicts()[0]
+    for col in data:
+        assert d[col] == data[col]
+        assert isinstance(d[col], type(data[col]))
+
+
 def test_to_list() -> None:
     s = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date)
 