Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Change Series.to_numpy to return f64 for Int32/UInt32 Series with nulls instead of f32 #14240

Merged
merged 3 commits into from
Feb 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions py-polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ either = { workspace = true }
itoa = { workspace = true }
libc = "0.2"
ndarray = { workspace = true }
num-traits = { workspace = true }
numpy = { version = "0.20", default-features = false }
once_cell = { workspace = true }
pyo3 = { workspace = true, features = ["abi3-py38", "extension-module", "multiple-pymethods"] }
Expand Down
35 changes: 20 additions & 15 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4145,12 +4145,18 @@ def to_physical(self) -> Series:

def to_list(self, *, use_pyarrow: bool | None = None) -> list[Any]:
"""
Convert this Series to a Python List. This operation clones data.
Convert this Series to a Python list.

This operation copies data.

Parameters
----------
use_pyarrow
Use pyarrow for the conversion.
Use PyArrow to perform the conversion.

.. deprecated:: 0.19.9
This parameter will be removed. The function can safely be called
without the parameter - it should give the exact same result.

Examples
--------
Expand Down Expand Up @@ -4283,32 +4289,31 @@ def to_numpy(
use_pyarrow: bool = True,
) -> np.ndarray[Any, Any]:
"""
Convert this Series to numpy.
Convert this Series to a NumPy ndarray.

This operation may clone data but is completely safe. Note that:
This operation may copy data, but is completely safe. Note that:

- data which is purely numeric AND without null values is not cloned;
- floating point `nan` values can be zero-copied;
- booleans can't be zero-copied.
- Data which is purely numeric AND without null values is not copied
- Floating point `nan` values can be zero-copied
- Booleans cannot be zero-copied

To ensure that no data is cloned, set `zero_copy_only=True`.
To ensure that no data is copied, set `zero_copy_only=True`.

Parameters
----------
zero_copy_only
If True, an exception will be raised if the conversion to a numpy
array would require copying the underlying data (e.g. in presence
of nulls, or for non-primitive types).
Raise an exception if the conversion to a NumPy array would require copying
the underlying data. Data copy occurs, for example, when the Series contains
nulls or non-numeric types.
writable
For numpy arrays created with zero copy (view on the Arrow data),
For NumPy arrays created with zero copy (view on the Arrow data),
the resulting array is not writable (Arrow data is immutable).
By setting this to True, a copy of the array is made to ensure
it is writable.
use_pyarrow
Use `pyarrow.Array.to_numpy
<https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_

for the conversion to numpy.
for the conversion to NumPy.

Examples
--------
Expand Down Expand Up @@ -4417,7 +4422,7 @@ def _view(self, *, ignore_nulls: bool = False) -> SeriesView:

def to_arrow(self) -> pa.Array:
"""
Get the underlying Arrow Array.
Return the underlying Arrow array.

If the Series contains only a single chunk this operation is zero copy.

Expand Down
158 changes: 88 additions & 70 deletions py-polars/src/series/export.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use num_traits::{Float, NumCast};
use numpy::PyArray1;
use polars_core::prelude::*;
use pyo3::prelude::*;
Expand All @@ -9,76 +10,8 @@ use crate::{arrow_interop, raise_err, PySeries};

#[pymethods]
impl PySeries {
/// Export this Series to Python through the `pyarrow` module.
///
/// The Series is rechunked first, then the array returned by
/// `self.series.to_arrow(0, false)` is handed to `arrow_interop::to_py::to_py_array`.
///
/// # Errors
/// Returns an error if the `pyarrow` module cannot be imported, or if
/// `to_py_array` itself fails.
#[allow(clippy::wrong_self_convention)]
fn to_arrow(&mut self) -> PyResult<PyObject> {
// NOTE(review): presumably merges all chunks so that index 0 below covers the
// whole Series - confirm against `rechunk`.
self.rechunk(true);
Python::with_gil(|py| {
let pyarrow = py.import("pyarrow")?;

arrow_interop::to_py::to_py_array(self.series.to_arrow(0, false), py, pyarrow)
})
}

/// Convert this Series to a NumPy array by copying its values.
///
/// For numeric dtypes this is intended for Series that contain nulls;
/// numeric Series without nulls are handled with `view()`.
/// Numeric values are cast to floats so that `None = np.nan`.
///
/// Dtype handling:
/// - numeric: cast to `Float64` when `bit_repr_is_large()` is true, otherwise
///   to `Float32`; nulls are replaced by NaN of the same width.
/// - `String`, `Binary`, `Boolean` (and `Object` when the `object` feature is
///   enabled): every element is converted to a Python object.
/// - `Null`: one `f32` NaN per row.
///
/// # Errors
/// Any other dtype is reported through `raise_err!` as a `ComputeError`.
fn to_numpy(&self, py: Python) -> PyResult<PyObject> {
let s = &self.series;
match s.dtype() {
dt if dt.is_numeric() => {
// NOTE(review): `bit_repr_is_large` presumably means a 64-bit physical
// representation - confirm; everything else is widened to f32 only.
if s.bit_repr_is_large() {
let s = s.cast(&DataType::Float64).unwrap();
let ca = s.f64().unwrap();
// Nulls become NaN so the result is a plain float array.
let np_arr =
PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f64::NAN)));
Ok(np_arr.into_py(py))
} else {
let s = s.cast(&DataType::Float32).unwrap();
let ca = s.f32().unwrap();
// Nulls become NaN so the result is a plain float array.
let np_arr =
PyArray1::from_iter(py, ca.iter().map(|opt_v| opt_v.unwrap_or(f32::NAN)));
Ok(np_arr.into_py(py))
}
},
DataType::String => {
let ca = s.str().unwrap();
// Each optional value is converted to a Python object individually.
let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py)));
Ok(np_arr.into_py(py))
},
DataType::Binary => {
let ca = s.binary().unwrap();
// Each optional value is converted to a Python object individually.
let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py)));
Ok(np_arr.into_py(py))
},
DataType::Boolean => {
let ca = s.bool().unwrap();
// Each optional value is converted to a Python object individually.
let np_arr = PyArray1::from_iter(py, ca.into_iter().map(|s| s.into_py(py)));
Ok(np_arr.into_py(py))
},
#[cfg(feature = "object")]
DataType::Object(_, _) => {
let ca = s
.as_any()
.downcast_ref::<ObjectChunked<ObjectValue>>()
.unwrap();
let np_arr =
PyArray1::from_iter(py, ca.into_iter().map(|opt_v| opt_v.to_object(py)));
Ok(np_arr.into_py(py))
},
DataType::Null => {
// An all-null Series is exported as `len` copies of f32 NaN.
let n = s.len();
let np_arr = PyArray1::from_iter(py, std::iter::repeat(f32::NAN).take(n));
Ok(np_arr.into_py(py))
},
dt => {
raise_err!(
format!("'to_numpy' not supported for dtype: {dt:?}"),
ComputeError
);
},
}
}

/// Convert this Series to a Python list.
/// This operation copies data.
pub fn to_list(&self) -> PyObject {
Python::with_gil(|py| {
let series = &self.series;
Expand Down Expand Up @@ -213,4 +146,89 @@ impl PySeries {
pylist.to_object(py)
})
}

/// Hand the Series' Arrow data over to Python through the `pyarrow` module.
///
/// The Series is rechunked before the export, and the array returned by
/// `self.series.to_arrow(0, false)` is what gets converted.
///
/// # Errors
/// Fails if `pyarrow` cannot be imported or if the interop conversion fails.
#[allow(clippy::wrong_self_convention)]
fn to_arrow(&mut self) -> PyResult<PyObject> {
    // NOTE(review): presumably collapses the Series into a single chunk so that
    // index 0 below covers all of the data - confirm against `rechunk`.
    self.rechunk(true);

    Python::with_gil(|py| {
        let pyarrow = py.import("pyarrow")?;
        let arrow_array = self.series.to_arrow(0, false);
        arrow_interop::to_py::to_py_array(arrow_array, py, pyarrow)
    })
}

/// Export this Series as a one-dimensional NumPy array, always by copying.
///
/// Numeric Series without nulls are expected to be converted on the Python
/// side in a zero-copy manner rather than through this method.
///
/// Dtype handling:
/// - `Int8`, `Int16`, `UInt8`, `UInt16` and `Float32` yield `f32` elements.
/// - `Int32`, `Int64`, `UInt32`, `UInt64` and `Float64` yield `f64` elements.
/// - For all numeric dtypes a null becomes NaN (`null = np.nan`).
/// - `Boolean`, `String` and `Binary` (plus `Object` when the `object`
///   feature is enabled) have every element converted to a Python object.
/// - `Null` yields one `f32` NaN per row.
///
/// # Errors
/// Any other dtype is reported through `raise_err!` as a `ComputeError`.
fn to_numpy(&self, py: Python) -> PyResult<PyObject> {
    use DataType::*;

    let series = &self.series;
    let array = match series.dtype() {
        // Exported as f32.
        Int8 => numeric_series_to_numpy::<Int8Type, f32>(py, series),
        UInt8 => numeric_series_to_numpy::<UInt8Type, f32>(py, series),
        Int16 => numeric_series_to_numpy::<Int16Type, f32>(py, series),
        UInt16 => numeric_series_to_numpy::<UInt16Type, f32>(py, series),
        Float32 => numeric_series_to_numpy::<Float32Type, f32>(py, series),

        // Exported as f64.
        Int32 => numeric_series_to_numpy::<Int32Type, f64>(py, series),
        UInt32 => numeric_series_to_numpy::<UInt32Type, f64>(py, series),
        Int64 => numeric_series_to_numpy::<Int64Type, f64>(py, series),
        UInt64 => numeric_series_to_numpy::<UInt64Type, f64>(py, series),
        Float64 => numeric_series_to_numpy::<Float64Type, f64>(py, series),

        // Element-wise conversion to Python objects.
        Boolean => {
            let ca = series.bool().unwrap();
            PyArray1::from_iter(py, ca.into_iter().map(|value| value.into_py(py))).into_py(py)
        },
        String => {
            let ca = series.str().unwrap();
            PyArray1::from_iter(py, ca.into_iter().map(|value| value.into_py(py))).into_py(py)
        },
        Binary => {
            let ca = series.binary().unwrap();
            PyArray1::from_iter(py, ca.into_iter().map(|value| value.into_py(py))).into_py(py)
        },
        #[cfg(feature = "object")]
        Object(_, _) => {
            let ca = series
                .as_any()
                .downcast_ref::<ObjectChunked<ObjectValue>>()
                .unwrap();
            PyArray1::from_iter(py, ca.into_iter().map(|value| value.to_object(py))).into_py(py)
        },

        // An all-null Series is exported as `len` copies of f32 NaN.
        Null => {
            let nans = std::iter::repeat(f32::NAN).take(series.len());
            PyArray1::from_iter(py, nans).into_py(py)
        },

        dt => {
            raise_err!(
                format!("`to_numpy` not supported for dtype {dt:?}"),
                ComputeError
            );
        },
    };

    Ok(array)
}
}

/// Copy a numeric Series into a NumPy array whose elements have float type `U`.
///
/// The Series is viewed as a `ChunkedArray<T>`; every non-null value is
/// converted with `NumCast::from` and every null becomes `U::nan()`.
///
/// NOTE(review): `T` presumably has to match the dtype of `s`; what happens
/// on a mismatch is not visible from this function - confirm at the call sites.
///
/// # Panics
/// Panics if `NumCast::from` returns `None` for a value.
fn numeric_series_to_numpy<T, U>(py: Python, s: &Series) -> PyObject
where
    T: PolarsNumericType,
    U: Float + numpy::Element,
{
    let ca: &ChunkedArray<T> = s.as_ref().as_ref();
    let values = ca.iter().map(|opt_v: Option<T::Native>| -> U {
        opt_v.map_or_else(U::nan, |v| NumCast::from(v).unwrap())
    });
    PyArray1::from_iter(py, values).into_py(py)
}
24 changes: 24 additions & 0 deletions py-polars/tests/unit/interop/test_numpy.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import numpy.typing as npt
import pytest

import polars as pl
Expand Down Expand Up @@ -63,3 +64,26 @@ def test_series_to_numpy_bool_with_nulls() -> None:
result = s.to_numpy(use_pyarrow=False)
assert s.to_list() == result.tolist()
assert result.dtype == np.object_


@pytest.mark.parametrize(
    ("dtype", "expected_dtype"),
    [
        (pl.Int8, np.float32),
        (pl.Int16, np.float32),
        (pl.Int32, np.float64),
        (pl.Int64, np.float64),
        (pl.UInt8, np.float32),
        (pl.UInt16, np.float32),
        (pl.UInt32, np.float64),
        (pl.UInt64, np.float64),
        (pl.Float32, np.float32),
        (pl.Float64, np.float64),
    ],
)
def test_series_to_numpy_numeric_with_nulls(
    dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
    """Check the NumPy dtype produced for a numeric Series that contains a null."""
    # Go through the non-PyArrow conversion path explicitly.
    series_with_null = pl.Series([1, 2, None], dtype=dtype, strict=False)
    converted = series_with_null.to_numpy(use_pyarrow=False)
    assert converted.dtype == expected_dtype
Loading