Skip to content

Commit

Permalink
fix(python): don't convert "ns"-precision temporal types via pyarrow (
Browse files Browse the repository at this point in the history
  • Loading branch information
alexander-beedie authored and vincent committed Feb 9, 2023
1 parent fb5503b commit 729f25a
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 15 deletions.
47 changes: 32 additions & 15 deletions py-polars/polars/internals/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1873,9 +1873,13 @@ def to_dict(

def to_dicts(self) -> list[dict[str, Any]]:
"""
Convert every row to a dictionary.
Convert every row to a dictionary of python-native values.
Note that this is slow.
Notes
-----
If you have ``ns``-precision temporal values you should be aware that python
natively only supports up to ``us``-precision; if this matters you should export
to a different format.
Examples
--------
Expand Down Expand Up @@ -6737,7 +6741,7 @@ def rows(self, named: Literal[True]) -> list[dict[str, Any]]:

def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[dict[str, Any]]:
"""
Returns all data in the DataFrame as a list of rows.
Returns all data in the DataFrame as a list of rows of python-native values.
Parameters
----------
Expand All @@ -6746,15 +6750,21 @@ def rows(self, named: bool = False) -> list[tuple[Any, ...]] | list[dict[str, An
column name to row value. This is more expensive than returning a regular
tuple, but allows for accessing values by column name.
Returns
-------
A list of tuples (default) or dictionaries of row values.
Notes
-----
If you have ``ns``-precision temporal values you should be aware that python
natively only supports up to ``us``-precision; if this matters you should export
to a different format.
Warnings
--------
Row-iteration is not optimal as the underlying data is stored in columnar form;
where possible, prefer export via one of the dedicated export/output methods.
Returns
-------
A list of tuples (default) or dictionaries of row values.
Examples
--------
>>> df = pl.DataFrame(
Expand Down Expand Up @@ -6796,7 +6806,7 @@ def iter_rows(
self, named: bool = False, buffer_size: int = 500
) -> Iterator[tuple[Any, ...]] | Iterator[dict[str, Any]]:
"""
Returns an iterator over the rows of the DataFrame.
Returns an iterator over the DataFrame of rows of python-native values.
Parameters
----------
Expand All @@ -6811,19 +6821,20 @@ def iter_rows(
the speedup from using the buffer is significant (~2-4x). Setting this
value to zero disables row buffering.
Returns
-------
An iterator of tuples (default) or dictionaries of row values.
Notes
-----
If you have ``ns``-precision temporal values you should be aware that python
natively only supports up to ``us``-precision; if this matters you should export
to a different format.
Warnings
--------
Row iteration is not optimal as the underlying data is stored in columnar form;
where possible, prefer export via one of the dedicated export/output methods.
Notes
-----
If you are planning to materialise all frame data at once you should prefer
calling ``rows()``, which will be faster.
Returns
-------
An iterator of tuples (default) or dictionaries of python row values.
Examples
--------
Expand All @@ -6849,9 +6860,15 @@ def iter_rows(
# note: buffering rows results in a 2-4x speedup over individual calls
# to ".row(i)", so it should only be disabled in extremely specific cases.
if buffer_size:
load_pyarrow_dicts = (
named
and _PYARROW_AVAILABLE
# note: 'ns' precision instantiates values as pandas types - avoid
and not any((getattr(tp, "tu", None) == "ns") for tp in self.dtypes)
)
for offset in range(0, self.height, buffer_size):
zerocopy_slice = self.slice(offset, buffer_size)
if named and _PYARROW_AVAILABLE:
if load_pyarrow_dicts:
yield from zerocopy_slice.to_arrow().to_batches()[0].to_pylist()
else:
rows_chunk = zerocopy_slice.rows(named=False)
Expand Down
19 changes: 19 additions & 0 deletions py-polars/tests/unit/test_datelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,25 @@ def test_timezone() -> None:
assert_series_equal(s.cast(int), tz_s.cast(int))


def test_to_dicts() -> None:
    """Round-trip python-native temporal values through ``ns``-precision dtypes.

    Verifies that ``to_dicts`` returns values equal to the originals AND of the
    original python-native types (datetime/date/time/timedelta), even when the
    frame stores them with "ns" precision.
    """
    moment = datetime.now()
    expected = {
        "a": moment,
        "b": moment.date(),
        "c": moment.time(),
        "d": timedelta(days=1, seconds=43200),
    }
    df = pl.DataFrame(
        expected, schema_overrides={"a": pl.Datetime("ns"), "d": pl.Duration("ns")}
    )
    assert len(df) == 1

    row = df.to_dicts()[0]
    for name, value in expected.items():
        # value equality alone is not enough: the exported object must also be
        # a python-native type, not a pandas/numpy temporal wrapper.
        assert row[name] == value
        assert isinstance(row[name], type(value))


def test_to_list() -> None:
s = pl.Series("date", [123543, 283478, 1243]).cast(pl.Date)

Expand Down

0 comments on commit 729f25a

Please sign in to comment.