From e51660ed53978aea74d45a32fca20159a956ed59 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sat, 3 Feb 2024 20:16:50 +0100 Subject: [PATCH 1/3] feat(python!): Change `Series.to_numpy` to return `f64` for `Int32/UInt32` Series instead of `f32` --- py-polars/src/series/export.rs | 44 +++++++++++----------- py-polars/tests/unit/interop/test_numpy.py | 24 ++++++++++++ 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index 63f72460881a..0cc19054c5db 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -23,40 +23,40 @@ impl PySeries { /// Non-nullable types are handled with `view()`. /// This will cast to floats so that `None = np.nan`. fn to_numpy(&self, py: Python) -> PyResult { + use DataType::*; let s = &self.series; match s.dtype() { - dt if dt.is_numeric() => { - if s.bit_repr_is_large() { - let s = s.cast(&DataType::Float64).unwrap(); - let ca = s.f64().unwrap(); - let np_arr = - PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f64::NAN))); - Ok(np_arr.into_py(py)) - } else { - let s = s.cast(&DataType::Float32).unwrap(); - let ca = s.f32().unwrap(); - let np_arr = - PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f32::NAN))); - Ok(np_arr.into_py(py)) - } + Int32 | UInt32 | Int64 | UInt64 | Float64 => { + let s = s.cast(&DataType::Float64).unwrap(); + let ca = s.f64().unwrap(); + let np_arr = + PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f64::NAN))); + Ok(np_arr.into_py(py)) }, - DataType::String => { - let ca = s.str().unwrap(); + Int8 | UInt8 | Int16 | UInt16 | Float32 => { + let s = s.cast(&DataType::Float32).unwrap(); + let ca = s.f32().unwrap(); + let np_arr = + PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f32::NAN))); + Ok(np_arr.into_py(py)) + }, + Boolean => { + let ca = s.bool().unwrap(); let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); Ok(np_arr.into_py(py)) }, - DataType::Binary => { - let ca = s.binary().unwrap(); + String => { + let ca = s.str().unwrap(); let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); Ok(np_arr.into_py(py)) }, - DataType::Boolean => { - let ca = s.bool().unwrap(); + Binary => { + let ca = s.binary().unwrap(); let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); Ok(np_arr.into_py(py)) }, #[cfg(feature = "object")] - DataType::Object(_, _) => { + Object(_, _) => { let ca = s .as_any() .downcast_ref::>() @@ -65,7 +65,7 @@ impl PySeries { PyArray1::from_iter(py, ca.into_iter().map(|opt_v| opt_v.to_object(py))); Ok(np_arr.into_py(py)) }, - DataType::Null => { + Null => { let n = s.len(); let np_arr = PyArray1::from_iter(py, std::iter::repeat(f32::NAN).take(n)); Ok(np_arr.into_py(py)) diff --git a/py-polars/tests/unit/interop/test_numpy.py b/py-polars/tests/unit/interop/test_numpy.py index b97635c20c82..70a47361c20c 100644 --- a/py-polars/tests/unit/interop/test_numpy.py +++ b/py-polars/tests/unit/interop/test_numpy.py @@ -1,4 +1,5 @@ import numpy as np +import numpy.typing as npt import pytest import polars as pl @@ -63,3 +64,26 @@ def test_series_to_numpy_bool_with_nulls() -> None: result = s.to_numpy(use_pyarrow=False) assert s.to_list() == result.tolist() assert result.dtype == np.object_ + + +@pytest.mark.parametrize( + ("dtype", "expected_dtype"), + [ + (pl.Int8, np.float32), + (pl.Int16, np.float32), + (pl.Int32, np.float64), + (pl.Int64, np.float64), + (pl.UInt8, np.float32), + (pl.UInt16, np.float32), + (pl.UInt32, np.float64), + (pl.UInt64, np.float64), + (pl.Float32, np.float32), + (pl.Float64, np.float64), + ], +) +def test_series_to_numpy_numeric_with_nulls( + dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike +) -> None: + s = pl.Series([1, 2, None], dtype=dtype, strict=False) + result = s.to_numpy(use_pyarrow=False) + assert result.dtype == expected_dtype From 269b4bb0ae49b9c59dc1ca79d4f2306652ee7fe5 Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 4 Feb 2024 01:34:22 +0100 Subject: [PATCH 2/3] Avoid additional copy for integers --- Cargo.lock | 1 + py-polars/Cargo.toml | 1 + py-polars/src/series/export.rs | 56 +++++++++++++++++++++------------- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3277337e26f6..02e0a3742e44 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3065,6 +3065,7 @@ dependencies = [ "libc", "mimalloc", "ndarray", + "num-traits", "numpy", "once_cell", "polars", diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index 8a5f05b79499..01ad293dfe62 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -22,6 +22,7 @@ either = { workspace = true } itoa = { workspace = true } libc = "0.2" ndarray = { workspace = true } +num-traits = { workspace = true } numpy = { version = "0.20", default-features = false } once_cell = { workspace = true } pyo3 = { workspace = true, features = ["abi3-py38", "extension-module", "multiple-pymethods"] } diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index 0cc19054c5db..de366285deab 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -1,3 +1,4 @@ +use num_traits::{Float, NumCast}; use numpy::PyArray1; use polars_core::prelude::*; use pyo3::prelude::*; @@ -19,41 +20,37 @@ impl PySeries { }) } - /// For numeric types, this should only be called for Series with null types. + /// For numeric types, this should only be called for Series with null values. /// Non-nullable types are handled with `view()`. /// This will cast to floats so that `None = np.nan`. fn to_numpy(&self, py: Python) -> PyResult { use DataType::*; let s = &self.series; - match s.dtype() { - Int32 | UInt32 | Int64 | UInt64 | Float64 => { - let s = s.cast(&DataType::Float64).unwrap(); - let ca = s.f64().unwrap(); - let np_arr = - PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f64::NAN))); - Ok(np_arr.into_py(py)) - }, - Int8 | UInt8 | Int16 | UInt16 | Float32 => { - let s = s.cast(&DataType::Float32).unwrap(); - let ca = s.f32().unwrap(); - let np_arr = - PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f32::NAN))); - Ok(np_arr.into_py(py)) - }, + let out = match s.dtype() { + Int8 => numeric_series_to_numpy::(py, s), + Int16 => numeric_series_to_numpy::(py, s), + Int32 => numeric_series_to_numpy::(py, s), + Int64 => numeric_series_to_numpy::(py, s), + UInt8 => numeric_series_to_numpy::(py, s), + UInt16 => numeric_series_to_numpy::(py, s), + UInt32 => numeric_series_to_numpy::(py, s), + UInt64 => numeric_series_to_numpy::(py, s), + Float32 => numeric_series_to_numpy::(py, s), + Float64 => numeric_series_to_numpy::(py, s), Boolean => { let ca = s.bool().unwrap(); let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); - Ok(np_arr.into_py(py)) + np_arr.into_py(py) }, String => { let ca = s.str().unwrap(); let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); - Ok(np_arr.into_py(py)) + np_arr.into_py(py) }, Binary => { let ca = s.binary().unwrap(); let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); - Ok(np_arr.into_py(py)) + np_arr.into_py(py) }, #[cfg(feature = "object")] Object(_, _) => { @@ -63,12 +60,12 @@ impl PySeries { .unwrap(); let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|opt_v| opt_v.to_object(py))); - Ok(np_arr.into_py(py)) + np_arr.into_py(py) }, Null => { let n = s.len(); let np_arr = PyArray1::from_iter(py, std::iter::repeat(f32::NAN).take(n)); - Ok(np_arr.into_py(py)) + np_arr.into_py(py) }, dt => { raise_err!( @@ -76,7 +73,8 @@ impl PySeries { ComputeError ); }, - } + }; + Ok(out) } pub fn to_list(&self) -> PyObject { @@ -214,3 +212,17 @@ impl PySeries { }) } } + +fn numeric_series_to_numpy(py: Python, s: &Series) -> PyObject +where + T: PolarsNumericType, + U: Float + numpy::Element, +{ + let ca: &ChunkedArray = s.as_ref().as_ref(); + let mapper = |opt_v: Option| match opt_v { + Some(v) => NumCast::from(v).unwrap(), + None => U::nan(), + }; + let np_arr = PyArray1::from_iter(py, ca.iter().map(mapper)); + np_arr.into_py(py) +} From 772bfddf08bad82967207d8474bacc1467319fba Mon Sep 17 00:00:00 2001 From: Stijn de Gooijer Date: Sun, 4 Feb 2024 02:11:48 +0100 Subject: [PATCH 3/3] Drive-by cleanup docs --- py-polars/polars/series/series.py | 35 ++++---- py-polars/src/series/export.rs | 140 ++++++++++++++++-------------- 2 files changed, 93 insertions(+), 82 deletions(-) diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index cb75302e826b..7f71fa9a52cb 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4145,12 +4145,18 @@ def to_physical(self) -> Series: def to_list(self, *, use_pyarrow: bool | None = None) -> list[Any]: """ - Convert this Series to a Python List. This operation clones data. + Convert this Series to a Python list. + + This operation copies data. Parameters ---------- use_pyarrow - Use pyarrow for the conversion. + Use PyArrow to perform the conversion. + + .. deprecated:: 0.19.9 + This parameter will be removed. The function can safely be called + without the parameter - it should give the exact same result. Examples -------- @@ -4283,32 +4289,31 @@ def to_numpy( use_pyarrow: bool = True, ) -> np.ndarray[Any, Any]: """ - Convert this Series to numpy. + Convert this Series to a NumPy ndarray. - This operation may clone data but is completely safe. Note that: + This operation may copy data, but is completely safe. Note that: - - data which is purely numeric AND without null values is not cloned; - - floating point `nan` values can be zero-copied; - - booleans can't be zero-copied. + - Data which is purely numeric AND without null values is not cloned + - Floating point `nan` values can be zero-copied + - Booleans cannot be zero-copied - To ensure that no data is cloned, set `zero_copy_only=True`. + To ensure that no data is copied, set `zero_copy_only=True`. Parameters ---------- zero_copy_only - If True, an exception will be raised if the conversion to a numpy - array would require copying the underlying data (e.g. in presence - of nulls, or for non-primitive types). + Raise an exception if the conversion to a NumPy would require copying + the underlying data. Data copy occurs, for example, when the Series contains + nulls or non-numeric types. writable - For numpy arrays created with zero copy (view on the Arrow data), + For NumPy arrays created with zero copy (view on the Arrow data), the resulting array is not writable (Arrow data is immutable). By setting this to True, a copy of the array is made to ensure it is writable. use_pyarrow Use `pyarrow.Array.to_numpy `_ - - for the conversion to numpy. + for the conversion to NumPy. Examples -------- @@ -4417,7 +4422,7 @@ def _view(self, *, ignore_nulls: bool = False) -> SeriesView: def to_arrow(self) -> pa.Array: """ - Get the underlying Arrow Array. + Return the underlying Arrow array. If the Series contains only a single chunk this operation is zero copy. diff --git a/py-polars/src/series/export.rs b/py-polars/src/series/export.rs index de366285deab..b0e08ee99140 100644 --- a/py-polars/src/series/export.rs +++ b/py-polars/src/series/export.rs @@ -10,73 +10,8 @@ use crate::{arrow_interop, raise_err, PySeries}; #[pymethods] impl PySeries { - #[allow(clippy::wrong_self_convention)] - fn to_arrow(&mut self) -> PyResult { - self.rechunk(true); - Python::with_gil(|py| { - let pyarrow = py.import("pyarrow")?; - - arrow_interop::to_py::to_py_array(self.series.to_arrow(0, false), py, pyarrow) - }) - } - - /// For numeric types, this should only be called for Series with null values. - /// Non-nullable types are handled with `view()`. - /// This will cast to floats so that `None = np.nan`. - fn to_numpy(&self, py: Python) -> PyResult { - use DataType::*; - let s = &self.series; - let out = match s.dtype() { - Int8 => numeric_series_to_numpy::(py, s), - Int16 => numeric_series_to_numpy::(py, s), - Int32 => numeric_series_to_numpy::(py, s), - Int64 => numeric_series_to_numpy::(py, s), - UInt8 => numeric_series_to_numpy::(py, s), - UInt16 => numeric_series_to_numpy::(py, s), - UInt32 => numeric_series_to_numpy::(py, s), - UInt64 => numeric_series_to_numpy::(py, s), - Float32 => numeric_series_to_numpy::(py, s), - Float64 => numeric_series_to_numpy::(py, s), - Boolean => { - let ca = s.bool().unwrap(); - let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); - np_arr.into_py(py) - }, - String => { - let ca = s.str().unwrap(); - let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); - np_arr.into_py(py) - }, - Binary => { - let ca = s.binary().unwrap(); - let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); - np_arr.into_py(py) - }, - #[cfg(feature = "object")] - Object(_, _) => { - let ca = s - .as_any() - .downcast_ref::>() - .unwrap(); - let np_arr = - PyArray1::from_iter(py, ca.into_iter().map(|opt_v| opt_v.to_object(py))); - np_arr.into_py(py) - }, - Null => { - let n = s.len(); - let np_arr = PyArray1::from_iter(py, std::iter::repeat(f32::NAN).take(n)); - np_arr.into_py(py) - }, - dt => { - raise_err!( - format!("'to_numpy' not supported for dtype: {dt:?}"), - ComputeError - ); - }, - }; - Ok(out) - } - + /// Convert this Series to a Python list. + /// This operation copies data. pub fn to_list(&self) -> PyObject { Python::with_gil(|py| { let series = &self.series; @@ -211,6 +146,77 @@ impl PySeries { pylist.to_object(py) }) } + + /// Return the underlying Arrow array. + #[allow(clippy::wrong_self_convention)] + fn to_arrow(&mut self) -> PyResult { + self.rechunk(true); + Python::with_gil(|py| { + let pyarrow = py.import("pyarrow")?; + + arrow_interop::to_py::to_py_array(self.series.to_arrow(0, false), py, pyarrow) + }) + } + + /// Convert this Series to a NumPy ndarray. + /// + /// This method will copy data - numeric types without null values should + /// be handled on the Python side in a zero-copy manner. + /// + /// This method will cast integers to floats so that `null = np.nan`. + fn to_numpy(&self, py: Python) -> PyResult { + use DataType::*; + let s = &self.series; + let out = match s.dtype() { + Int8 => numeric_series_to_numpy::(py, s), + Int16 => numeric_series_to_numpy::(py, s), + Int32 => numeric_series_to_numpy::(py, s), + Int64 => numeric_series_to_numpy::(py, s), + UInt8 => numeric_series_to_numpy::(py, s), + UInt16 => numeric_series_to_numpy::(py, s), + UInt32 => numeric_series_to_numpy::(py, s), + UInt64 => numeric_series_to_numpy::(py, s), + Float32 => numeric_series_to_numpy::(py, s), + Float64 => numeric_series_to_numpy::(py, s), + Boolean => { + let ca = s.bool().unwrap(); + let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); + np_arr.into_py(py) + }, + String => { + let ca = s.str().unwrap(); + let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); + np_arr.into_py(py) + }, + Binary => { + let ca = s.binary().unwrap(); + let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py))); + np_arr.into_py(py) + }, + #[cfg(feature = "object")] + Object(_, _) => { + let ca = s + .as_any() + .downcast_ref::>() + .unwrap(); + let np_arr = + PyArray1::from_iter(py, ca.into_iter().map(|opt_v| opt_v.to_object(py))); + np_arr.into_py(py) + }, + Null => { + let n = s.len(); + let np_arr = PyArray1::from_iter(py, std::iter::repeat(f32::NAN).take(n)); + np_arr.into_py(py) + }, + dt => { + raise_err!( + format!("`to_numpy` not supported for dtype {dt:?}"), + ComputeError + ); + }, + }; + Ok(out) + } } fn numeric_series_to_numpy(py: Python, s: &Series) -> PyObject