Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Support Categorical/Enum in Series.to_numpy #14275

Merged
merged 8 commits into from
Feb 5, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4339,7 +4339,7 @@ def to_numpy(
"""

def raise_no_zero_copy() -> None:
if zero_copy_only:
if zero_copy_only and not self.is_empty():
msg = "cannot return a zero-copy array"
raise ValueError(msg)

Expand Down
11 changes: 11 additions & 0 deletions py-polars/src/series/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,17 @@ impl PySeries {
let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py)));
np_arr.into_py(py)
},
Categorical(rev_map, _) | Enum(rev_map, _) => {
let rev_map = rev_map.clone().unwrap();
let mapping = &*rev_map;
let f = |idx: u32| mapping.get(idx);
let ca = s.categorical().unwrap();
let np_arr = PyArray1::from_iter(
py,
ca.physical().into_iter().map(|s| s.map(f).into_py(py)),
stinodego marked this conversation as resolved.
Show resolved Hide resolved
);
np_arr.into_py(py)
},
#[cfg(feature = "object")]
Object(_, _) => {
let ca = s
Expand Down
6 changes: 6 additions & 0 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,3 +101,9 @@ def test__array__() -> None:
expected_array = np.array([[1, 1], [2, 2], [3, 3]], dtype=np.uint8)
assert_array_equal(out_array, expected_array)
assert out_array.flags["F_CONTIGUOUS"] is True


def test_numpy_preserve_uint64_4112() -> None:
    """Check that a hashed column keeps its ``uint64`` dtype in numpy exports."""
    hashed = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash())

    # Plain 2D export.
    plain_dtype = hashed.to_numpy().dtype
    assert plain_dtype == np.dtype("uint64")

    # Structured export keeps the column name alongside the dtype.
    structured_dtype = hashed.to_numpy(structured=True).dtype
    assert structured_dtype == np.dtype([("a", "uint64")])
215 changes: 158 additions & 57 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from datetime import datetime, time, timedelta
from decimal import Decimal as D
from pathlib import Path
from typing import TYPE_CHECKING, Any

import numpy as np
Expand All @@ -16,6 +17,162 @@
import numpy.typing as npt


def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
    """
    Assert that ``arr`` starts at the same address as the values buffer of ``s``.

    Nothing is compared when the series has no elements.
    """
    if s.len() > 0:
        # Index 0 of the buffer info is compared with the numpy data address;
        # presumably it is the buffer's start pointer (confirm against polars).
        series_address = s._get_buffers()["values"]._get_buffer_info()[0]
        array_address = arr.__array_interface__["data"][0]
        assert series_address == array_address


def assert_zero_copy_only_raises(s: pl.Series) -> None:
    """Assert that a zero-copy-only numpy conversion of ``s`` raises ``ValueError``."""
    expected_message = "cannot return a zero-copy array"
    with pytest.raises(ValueError, match=expected_message):
        s.to_numpy(use_pyarrow=False, zero_copy_only=True)


@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.int8),
        (pl.Int16, np.int16),
        (pl.Int32, np.int32),
        (pl.Int64, np.int64),
        (pl.UInt8, np.uint8),
        (pl.UInt16, np.uint16),
        (pl.UInt32, np.uint32),
        (pl.UInt64, np.uint64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_zero_copy(
    dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Convert a null-free numeric series with ``zero_copy_only=True`` and check the result."""
    series = pl.Series([1, 2, 3], dtype=dtype, strict=False)
    array = series.to_numpy(use_pyarrow=False, zero_copy_only=True)

    # Same memory, same values, matching numpy dtype.
    assert_zero_copy(series, array)
    assert array.tolist() == series.to_list()
    assert array.dtype == expected_dtype


@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.float32),
        (pl.Int16, np.float32),
        (pl.Int32, np.float64),
        (pl.Int64, np.float64),
        (pl.UInt8, np.float32),
        (pl.UInt16, np.float32),
        (pl.UInt32, np.float64),
        (pl.UInt64, np.float64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_with_nulls(
    dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Check the float dtype and NaN placeholder for numeric series holding a null."""
    series = pl.Series([1, 2, None], dtype=dtype, strict=False)
    array = series.to_numpy(use_pyarrow=False)

    # All values except the trailing null must round-trip unchanged.
    converted = array.tolist()
    original = series.to_list()
    assert converted[:-1] == original[:-1]

    # The null itself is expected to surface as NaN.
    assert np.isnan(array[-1])
    assert array.dtype == expected_dtype
    assert_zero_copy_only_raises(series)


@pytest.mark.parametrize(
    ("dtype", "values"),
    [
        (pl.Categorical, ["a", "b", "a"]),
        (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]),
        (pl.String, ["a", "bc", "def"]),
        (pl.Binary, [b"a", b"bc", b"def"]),
        (pl.Object, [Path(), Path("abc")]),
        # TODO: Implement for List types
        # (pl.List, [[1], [2, 3]]),
        # (pl.List, [["a"], ["b", "c"], []]),
    ],
)
def test_to_numpy_various_dtypes(dtype: pl.PolarsDataType, values: list[Any]) -> None:
    """
    Check that non-numeric dtypes with a trailing null export as object arrays.

    Parameters
    ----------
    dtype
        Polars data type used to build the series.
    values
        Non-null sample values; a null is added to a copy of this list.
    """
    # Build a new list rather than appending in place: pytest passes the
    # parametrized object without copying it, so mutating `values` would leak
    # an extra None into any later invocation that receives the same list.
    expected = [*values, None]
    s = pl.Series(expected, dtype=dtype)
    result = s.to_numpy(use_pyarrow=False)

    assert result.tolist() == expected
    assert result.dtype == np.object_
    assert_zero_copy_only_raises(s)


def test_series_to_numpy_bool() -> None:
    """Check that a null-free boolean series exports as a ``np.bool_`` array."""
    series = pl.Series([True, False])
    array = series.to_numpy(use_pyarrow=False)

    assert series.to_list() == array.tolist()
    assert array.dtype == np.bool_
    # The test expects a zero-copy-only request to be rejected for booleans.
    assert_zero_copy_only_raises(series)


def test_series_to_numpy_bool_with_nulls() -> None:
    """Check that a boolean series containing a null exports as an object array."""
    series = pl.Series([True, False, None])
    array = series.to_numpy(use_pyarrow=False)

    assert series.to_list() == array.tolist()
    assert array.dtype == np.object_
    assert_zero_copy_only_raises(series)


def test_series_to_numpy_array_of_int() -> None:
    """Check that a fixed-size integer Array series exports as an ``int64`` array."""
    rows = [[1, 2], [3, 4], [5, 6]]
    series = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))
    array = series.to_numpy(use_pyarrow=False)

    reference = np.array(rows)
    assert_array_equal(array, reference)
    assert array.dtype == np.int64


def test_series_to_numpy_array_of_str() -> None:
    """Check that a fixed-size string Array series exports as an object array."""
    rows = [["1", "2", "3"], ["4", "5", "10000"]]
    series = pl.Series(rows, dtype=pl.Array(pl.String, 3))
    array = series.to_numpy(use_pyarrow=False)

    assert array.tolist() == rows
    assert array.dtype == np.object_


@pytest.mark.skip(
    reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268"
)
def test_series_to_numpy_array_with_nulls() -> None:
    """Check the export of an integer Array series whose last row is null (skipped)."""
    rows = [[1, 2], [3, 4], None]
    series = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))
    array = series.to_numpy(use_pyarrow=False)

    # The null row is expected to become a row of NaN in a float64 result.
    nan = np.nan
    reference = np.array([[1.0, 2.0], [3.0, 4.0], [nan, nan]])
    assert_array_equal(array, reference)
    assert array.dtype == np.float64
    assert_zero_copy_only_raises(series)


def test_to_numpy_null() -> None:
    """Check that a Null-dtype series exports as a ``float32`` array of NaN."""
    series = pl.Series([None, None], dtype=pl.Null)
    array = series.to_numpy(use_pyarrow=False)

    reference = np.array([np.nan] * 2, dtype=np.float32)
    assert_array_equal(array, reference)
    assert array.dtype == np.float32
    assert_zero_copy_only_raises(series)


def test_to_numpy_empty() -> None:
    """Check that an empty String series converts even with ``zero_copy_only=True``."""
    empty = pl.Series(dtype=pl.String)
    array = empty.to_numpy(use_pyarrow=False, zero_copy_only=True)

    # The result is an empty 1-D object array.
    assert array.dtype == np.object_
    assert array.shape == (0,)
    assert array.size == 0


@given(
s=series(
min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct]
Expand All @@ -29,7 +186,7 @@
)
@settings(max_examples=250)
def test_series_to_numpy(s: pl.Series) -> None:
result = s.to_numpy()
result = s.to_numpy(use_pyarrow=False)

values = s.to_list()
dtype_map = {
Expand All @@ -56,14 +213,6 @@ def test_to_numpy_no_zero_copy(
series.to_numpy(zero_copy_only=True, use_pyarrow=use_pyarrow)


def test_to_numpy_empty_no_pyarrow() -> None:
    """Check the dtype and shape of an empty Null-dtype series exported to numpy."""
    empty = pl.Series([], dtype=pl.Null)
    array = empty.to_numpy()

    # NOTE(review): this compares a numpy dtype with a polars dtype, and the
    # call above does not pass use_pyarrow=False despite the test name - confirm intent.
    assert array.dtype == pl.Float32
    assert array.shape == (0,)
    assert array.size == 0


@pytest.mark.parametrize("writable", [False, True])
@pytest.mark.parametrize("pyarrow_available", [False, True])
def test_to_numpy2(
Expand Down Expand Up @@ -146,31 +295,6 @@ def test_numpy_disambiguation() -> None:
assert result == expected


def test_series_to_numpy_bool() -> None:
    """Check that a null-free boolean series exports as a ``np.bool_`` array."""
    series = pl.Series([True, False])
    array = series.to_numpy(use_pyarrow=False)

    assert series.to_list() == array.tolist()
    assert array.dtype == np.bool_


def test_series_to_numpy_bool_with_nulls() -> None:
    """Check that a boolean series containing a null exports as an object array."""
    series = pl.Series([True, False, None])
    array = series.to_numpy(use_pyarrow=False)

    assert series.to_list() == array.tolist()
    assert array.dtype == np.object_


def test_array_to_numpy() -> None:
    """Check that a fixed-size integer Array series matches the equivalent numpy array."""
    series = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(pl.Int64, 2))
    exported = series.to_numpy()
    reference = np.array([[1, 2], [3, 4], [5, 6]])

    # Element-wise comparison, reduced to a single truth value.
    matches = exported == reference
    assert matches.all()


def test_numpy_preserve_uint64_4112() -> None:
    """Check that a hashed column keeps its ``uint64`` dtype in numpy exports."""
    hashed = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash())

    # Plain 2D export.
    plain_dtype = hashed.to_numpy().dtype
    assert plain_dtype == np.dtype("uint64")

    # Structured export keeps the column name alongside the dtype.
    structured_dtype = hashed.to_numpy(structured=True).dtype
    assert structured_dtype == np.dtype([("a", "uint64")])


def test_to_numpy_datelike() -> None:
s = pl.Series(
"dt",
Expand Down Expand Up @@ -239,26 +363,3 @@ def test_decimal_numpy_export(use_pyarrow: bool) -> None:
np.array(decimal_data).reshape((-1, 1)),
df.to_numpy(use_pyarrow=use_pyarrow),
)


@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.float32),
        (pl.Int16, np.float32),
        (pl.Int32, np.float64),
        (pl.Int64, np.float64),
        (pl.UInt8, np.float32),
        (pl.UInt16, np.float32),
        (pl.UInt32, np.float64),
        (pl.UInt64, np.float64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_with_nulls(
    dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Check the numpy dtype chosen for numeric series that contain a null."""
    series = pl.Series([1, 2, None], dtype=dtype, strict=False)
    array = series.to_numpy(use_pyarrow=False)

    assert array.dtype == expected_dtype
Loading