pola-rs · stinodego · Feb 5, 2024 · Feb 4, 2024 · Feb 4, 2024 · Feb 5, 2024
@@ -4339,7 +4339,7 @@ def to_numpy(
         """
 
         def raise_no_zero_copy() -> None:
-            if zero_copy_only:
+            if zero_copy_only and not self.is_empty():
                 msg = "cannot return a zero-copy array"
                 raise ValueError(msg)
 

@@ -195,6 +195,17 @@ impl PySeries {
                 let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py)));
                 np_arr.into_py(py)
             },
+            Categorical(rev_map, _) | Enum(rev_map, _) => {
+                let rev_map = rev_map.clone().unwrap();
+                let mapping = &*rev_map;
+                let f = |idx: u32| mapping.get(idx);
+                let ca = s.categorical().unwrap();
+                let np_arr = PyArray1::from_iter(
+                    py,
+                    ca.physical().into_iter().map(|s| s.map(f).into_py(py)),
+                );
+                np_arr.into_py(py)
+            },
             #[cfg(feature = "object")]
             Object(_, _) => {
                 let ca = s

@@ -101,3 +101,9 @@ def test__array__() -> None:
     expected_array = np.array([[1, 1], [2, 2], [3, 3]], dtype=np.uint8)
     assert_array_equal(out_array, expected_array)
     assert out_array.flags["F_CONTIGUOUS"] is True
+
+
+def test_numpy_preserve_uint64_4112() -> None:
+    df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash())
+    assert df.to_numpy().dtype == np.dtype("uint64")  # plain 2D export
+    assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")])  # record
@@ -2,6 +2,7 @@
 
 from datetime import datetime, time, timedelta
 from decimal import Decimal as D
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
@@ -16,6 +17,162 @@
     import numpy.typing as npt
 
 
+def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
+    """Assert `arr` points at the values buffer of `s`; empty Series are skipped."""
+    if s.len() > 0:
+        series_addr = s._get_buffers()["values"]._get_buffer_info()[0]
+        array_addr = arr.__array_interface__["data"][0]
+        assert series_addr == array_addr
+
+
+def assert_zero_copy_only_raises(s: pl.Series) -> None:
+    with pytest.raises(ValueError, match="cannot return a zero-copy array"):
+        s.to_numpy(use_pyarrow=False, zero_copy_only=True)  # must refuse to copy
+
+
+@pytest.mark.parametrize(
+    ("dtype", "expected_dtype"),
+    [
+        (pl.Int8, np.int8),
+        (pl.Int16, np.int16),
+        (pl.Int32, np.int32),
+        (pl.Int64, np.int64),
+        (pl.UInt8, np.uint8),
+        (pl.UInt16, np.uint16),
+        (pl.UInt32, np.uint32),
+        (pl.UInt64, np.uint64),
+        (pl.Float32, np.float32),
+        (pl.Float64, np.float64),
+    ],
+)
+def test_series_to_numpy_numeric_zero_copy(
+    dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
+) -> None:
+    """Null-free numeric Series convert zero-copy to the matching NumPy dtype."""
+    series = pl.Series([1, 2, 3], dtype=dtype, strict=False)
+    array = series.to_numpy(use_pyarrow=False, zero_copy_only=True)
+    assert_zero_copy(series, array)
+    assert array.tolist() == series.to_list()
+    assert array.dtype == expected_dtype
+
+
+@pytest.mark.parametrize(
+    ("dtype", "expected_dtype"),
+    [
+        (pl.Int8, np.float32),
+        (pl.Int16, np.float32),
+        (pl.Int32, np.float64),
+        (pl.Int64, np.float64),
+        (pl.UInt8, np.float32),
+        (pl.UInt16, np.float32),
+        (pl.UInt32, np.float64),
+        (pl.UInt64, np.float64),
+        (pl.Float32, np.float32),
+        (pl.Float64, np.float64),
+    ],
+)
+def test_series_to_numpy_numeric_with_nulls(
+    dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
+) -> None:
+    """Nulls in numeric Series become NaN in a float array and force a copy."""
+    series = pl.Series([1, 2, None], dtype=dtype, strict=False)
+    array = series.to_numpy(use_pyarrow=False)
+    assert array[:-1].tolist() == series.to_list()[:-1]
+    assert np.isnan(array[-1])
+    assert array.dtype == expected_dtype
+    assert_zero_copy_only_raises(series)
+
+
+@pytest.mark.parametrize(
+    ("dtype", "values"),
+    [
+        (pl.Categorical, ["a", "b", "a"]),
+        (pl.Enum(["a", "b", "c"]), ["a", "b", "a"]),
+        (pl.String, ["a", "bc", "def"]),
+        (pl.Binary, [b"a", b"bc", b"def"]),
+        (pl.Object, [Path(), Path("abc")]),
+        # TODO: Implement for List types
+        # (pl.List, [[1], [2, 3]]),
+        # (pl.List, [["a"], ["b", "c"], []]),
+    ],
+)
+def test_to_numpy_various_dtypes(dtype: pl.PolarsDataType, values: list[Any]) -> None:
+    """Non-numeric dtypes export as object arrays (null -> None), never zero-copy."""
+    values = [*values, None]  # copy: parametrize args are shared, don't mutate
+    s = pl.Series(values, dtype=dtype)
+    result = s.to_numpy(use_pyarrow=False)
+    assert result.tolist() == values
+    assert result.dtype == np.object_
+    assert_zero_copy_only_raises(s)
+
+
+def test_series_to_numpy_bool() -> None:
+    """A null-free boolean Series exports as a `bool_` array, but not zero-copy."""
+    series = pl.Series([True, False])
+    array = series.to_numpy(use_pyarrow=False)
+    assert series.to_list() == array.tolist()
+    assert array.dtype == np.bool_
+    assert_zero_copy_only_raises(series)
+
+
+def test_series_to_numpy_bool_with_nulls() -> None:
+    """A boolean Series with nulls exports as an object array, never zero-copy."""
+    series = pl.Series([True, False, None])
+    array = series.to_numpy(use_pyarrow=False)
+    assert series.to_list() == array.tolist()
+    assert array.dtype == np.object_
+    assert_zero_copy_only_raises(series)
+
+
+def test_series_to_numpy_array_of_int() -> None:
+    """A fixed-width integer Array Series exports as a 2D int64 array."""
+    rows = [[1, 2], [3, 4], [5, 6]]
+    series = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))
+    array = series.to_numpy(use_pyarrow=False)
+
+    assert_array_equal(array, np.array(rows))
+    assert array.dtype == np.int64
+
+
+def test_series_to_numpy_array_of_str() -> None:
+    """A String Array Series exports as an object array that round-trips its rows."""
+    rows = [["1", "2", "3"], ["4", "5", "10000"]]
+    array = pl.Series(rows, dtype=pl.Array(pl.String, 3)).to_numpy(use_pyarrow=False)
+    assert array.tolist() == rows
+    assert array.dtype == np.object_
+
+
+@pytest.mark.skip(
+    reason="Currently bugged, see: https://github.com/pola-rs/polars/issues/14268"
+)
+def test_series_to_numpy_array_with_nulls() -> None:
+    """Null rows of an Array Series should become NaN rows in a float64 array."""
+    rows = [[1, 2], [3, 4], None]
+    series = pl.Series(rows, dtype=pl.Array(pl.Int64, 2))
+    array = series.to_numpy(use_pyarrow=False)
+    expected = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, np.nan]])
+    assert_array_equal(array, expected)
+    assert array.dtype == np.float64
+    assert_zero_copy_only_raises(series)
+
+
+def test_to_numpy_null() -> None:
+    """A Null-dtype Series exports as a float32 array of NaN, never zero-copy."""
+    series = pl.Series([None, None], dtype=pl.Null)
+    array = series.to_numpy(use_pyarrow=False)
+    assert_array_equal(array, np.array([np.nan, np.nan], dtype=np.float32))
+    assert array.dtype == np.float32
+    assert_zero_copy_only_raises(series)
+
+
+def test_to_numpy_empty() -> None:
+    """An empty String Series converts even when `zero_copy_only=True` is set."""
+    array = pl.Series(dtype=pl.String).to_numpy(use_pyarrow=False, zero_copy_only=True)
+    assert array.dtype == np.object_
+    assert array.shape == (0,)
+    assert array.size == 0
+
+
 @given(
     s=series(
         min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct]
@@ -29,7 +186,7 @@
 )
 @settings(max_examples=250)
 def test_series_to_numpy(s: pl.Series) -> None:
-    result = s.to_numpy()
+    result = s.to_numpy(use_pyarrow=False)
 
     values = s.to_list()
     dtype_map = {
@@ -56,14 +213,6 @@ def test_to_numpy_no_zero_copy(
         series.to_numpy(zero_copy_only=True, use_pyarrow=use_pyarrow)
 
 
-def test_to_numpy_empty_no_pyarrow() -> None:
-    series = pl.Series([], dtype=pl.Null)
-    result = series.to_numpy()
-    assert result.dtype == pl.Float32
-    assert result.shape == (0,)
-    assert result.size == 0
-
-
 @pytest.mark.parametrize("writable", [False, True])
 @pytest.mark.parametrize("pyarrow_available", [False, True])
 def test_to_numpy2(
@@ -146,31 +295,6 @@ def test_numpy_disambiguation() -> None:
     assert result == expected
 
 
-def test_series_to_numpy_bool() -> None:
-    s = pl.Series([True, False])
-    result = s.to_numpy(use_pyarrow=False)
-    assert s.to_list() == result.tolist()
-    assert result.dtype == np.bool_
-
-
-def test_series_to_numpy_bool_with_nulls() -> None:
-    s = pl.Series([True, False, None])
-    result = s.to_numpy(use_pyarrow=False)
-    assert s.to_list() == result.tolist()
-    assert result.dtype == np.object_
-
-
-def test_array_to_numpy() -> None:
-    s = pl.Series([[1, 2], [3, 4], [5, 6]], dtype=pl.Array(pl.Int64, 2))
-    assert (s.to_numpy() == np.array([[1, 2], [3, 4], [5, 6]])).all()
-
-
-def test_numpy_preserve_uint64_4112() -> None:
-    df = pl.DataFrame({"a": [1, 2, 3]}).with_columns(pl.col("a").hash())
-    assert df.to_numpy().dtype == np.dtype("uint64")
-    assert df.to_numpy(structured=True).dtype == np.dtype([("a", "uint64")])
-
-
 def test_to_numpy_datelike() -> None:
     s = pl.Series(
         "dt",
@@ -239,26 +363,3 @@ def test_decimal_numpy_export(use_pyarrow: bool) -> None:
         np.array(decimal_data).reshape((-1, 1)),
         df.to_numpy(use_pyarrow=use_pyarrow),
     )
-
-
-@pytest.mark.parametrize(
-    ("dtype", "expected_dtype"),
-    [
-        (pl.Int8, np.float32),
-        (pl.Int16, np.float32),
-        (pl.Int32, np.float64),
-        (pl.Int64, np.float64),
-        (pl.UInt8, np.float32),
-        (pl.UInt16, np.float32),
-        (pl.UInt32, np.float64),
-        (pl.UInt64, np.float64),
-        (pl.Float32, np.float32),
-        (pl.Float64, np.float64),
-    ],
-)
-def test_series_to_numpy_numeric_with_nulls(
-    dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
-) -> None:
-    s = pl.Series([1, 2, None], dtype=dtype, strict=False)
-    result = s.to_numpy(use_pyarrow=False)
-    assert result.dtype == expected_dtype