Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf(python): Improve Series.to_numpy performance for chunked Series that would otherwise be zero-copy #16301

Merged
merged 8 commits into from
May 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions crates/polars-core/src/datatypes/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -236,17 +236,17 @@ impl DataType {
self.is_float() || self.is_integer()
}

/// Check if this [`DataType`] is a boolean.
///
/// Returns `true` only for [`DataType::Boolean`].
pub fn is_bool(&self) -> bool {
matches!(self, DataType::Boolean)
}

/// Check if this [`DataType`] is a list.
///
/// Returns `true` for [`DataType::List`] regardless of the inner type.
pub fn is_list(&self) -> bool {
matches!(self, DataType::List(_))
}

/// Check if this [`DataType`] is a array
/// Check if this [`DataType`] is an array.
pub fn is_array(&self) -> bool {
#[cfg(feature = "dtype-array")]
{
Expand Down
8 changes: 5 additions & 3 deletions py-polars/src/series/export.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use pyo3::types::PyList;
use crate::conversion::chunked_array::{decimal_to_pyobject_iter, time_to_pyobject_iter};
use crate::error::PyPolarsErr;
use crate::prelude::*;
use crate::to_numpy::{reshape_numpy_array, series_to_numpy_view};
use crate::to_numpy::{reshape_numpy_array, try_series_to_numpy_view};
use crate::{arrow_interop, raise_err, PySeries};

#[pymethods]
Expand Down Expand Up @@ -174,8 +174,10 @@ impl PySeries {
return series_to_numpy_with_copy(py, &self.series);
}

if let Some(mut arr) = series_to_numpy_view(py, &self.series, false) {
if writable {
if let Some((mut arr, writable_flag)) =
try_series_to_numpy_view(py, &self.series, false, allow_copy)
{
if writable && !writable_flag {
if !allow_copy {
return Err(PyValueError::new_err(
"cannot return a zero-copy writable array",
Expand Down
113 changes: 87 additions & 26 deletions py-polars/src/to_numpy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,65 +58,126 @@ impl PySeries {
/// WARNING: The resulting view will show the underlying value for nulls,
/// which may be any value. The caller is responsible for handling nulls
/// appropriately.
#[allow(clippy::wrong_self_convention)]
pub fn to_numpy_view(&self, py: Python) -> Option<PyObject> {
series_to_numpy_view(py, &self.series, true)
let (view, _) = try_series_to_numpy_view(py, &self.series, true, false)?;
Some(view)
}
}

pub(crate) fn series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
// NumPy arrays are always contiguous
if s.n_chunks() > 1 {
/// Create a NumPy view of the given Series.
pub(crate) fn try_series_to_numpy_view(
py: Python,
s: &Series,
allow_nulls: bool,
allow_rechunk: bool,
) -> Option<(PyObject, bool)> {
if !supports_view(s.dtype()) {
return None;
}
if !allow_nulls && s.null_count() > 0 {
if !allow_nulls && has_nulls(s) {
return None;
}
let view = match s.dtype() {
dt if dt.is_numeric() => numeric_series_to_numpy_view(py, s),
DataType::Datetime(_, _) | DataType::Duration(_) => temporal_series_to_numpy_view(py, s),
DataType::Array(_, _) => array_series_to_numpy_view(py, s, allow_nulls)?,
_ => return None,
};
Some(view)
let (s_owned, writable_flag) = handle_chunks(s, allow_rechunk)?;

let array = series_to_numpy_view_recursive(py, s_owned, writable_flag);
Some((array, writable_flag))
}
/// Returns whether a NumPy view can be created for the given data type.
///
/// Numeric, Datetime, and Duration types qualify directly. An Array type
/// qualifies when its inner type does. Every other type is rejected.
fn supports_view(dtype: &DataType) -> bool {
    if let DataType::Array(inner, _) = dtype {
        // Nested arrays are decided by the type they wrap.
        return supports_view(inner.as_ref());
    }
    dtype.is_numeric() || matches!(dtype, DataType::Datetime(_, _) | DataType::Duration(_))
}
fn numeric_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
/// Returns whether the Series contains nulls at any level of nesting.
///
/// Only Array types are descended into, since those are the only nested
/// types relevant for NumPy views.
fn has_nulls(s: &Series) -> bool {
    if s.null_count() > 0 {
        return true;
    }
    if !s.dtype().is_array() {
        return false;
    }
    // The outer level has no nulls, but the inner values of the array may.
    let inner = s.array().unwrap().get_inner();
    has_nulls(&inner)
}
/// Rechunk the Series if required.
///
/// NumPy arrays are always contiguous, so a Series with more than one chunk
/// must be rechunked before a view can be created.
///
/// Returns `None` when the Series is chunked and `allow_rechunk` is `false`.
/// Otherwise returns the Series to build the view from, together with a flag
/// that is `true` only when a rechunk took place; in that case the resulting
/// array can be flagged as writable.
fn handle_chunks(s: &Series, allow_rechunk: bool) -> Option<(Series, bool)> {
    if s.n_chunks() <= 1 {
        // Already contiguous: reuse the Series as-is, not writable.
        return Some((s.clone(), false));
    }
    allow_rechunk.then(|| (s.rechunk(), true))
}

/// Create a NumPy view of the given Series without checking for data types, chunks, or nulls.
///
/// Dispatches on the data type of the Series to the numeric, temporal, or
/// Array view constructor.
///
/// # Panics
///
/// Panics with `"invalid data type"` if the data type is not numeric,
/// Datetime, Duration, or Array.
fn series_to_numpy_view_recursive(py: Python, s: Series, writable: bool) -> PyObject {
    debug_assert!(s.n_chunks() == 1);
    if s.dtype().is_numeric() {
        numeric_series_to_numpy_view(py, s, writable)
    } else if matches!(s.dtype(), DataType::Datetime(_, _) | DataType::Duration(_)) {
        temporal_series_to_numpy_view(py, s, writable)
    } else if matches!(s.dtype(), DataType::Array(_, _)) {
        array_series_to_numpy_view(py, &s, writable)
    } else {
        panic!("invalid data type")
    }
}
/// Create a NumPy view of a numeric Series.
fn numeric_series_to_numpy_view(py: Python, s: Series, writable: bool) -> PyObject {
let dims = [s.len()].into_dimension();
let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.
with_match_physical_numeric_polars_type!(s.dtype(), |$T| {
let np_dtype = <$T as PolarsNumericType>::Native::get_dtype_bound(py);
let ca: &ChunkedArray<$T> = s.unpack::<$T>().unwrap();
let flags = if writable {
flags::NPY_ARRAY_FARRAY
} else {
flags::NPY_ARRAY_FARRAY_RO
};

let slice = ca.data_views().next().unwrap();

unsafe {
create_borrowed_np_array::<_>(
py,
np_dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
flags,
slice.as_ptr() as _,
owner,
PySeries::from(s).into_py(py), // Keep the Series memory alive.,
)
}
})
}
fn temporal_series_to_numpy_view(py: Python, s: &Series) -> PyObject {
/// Create a NumPy view of a Datetime or Duration Series.
fn temporal_series_to_numpy_view(py: Python, s: Series, writable: bool) -> PyObject {
let np_dtype = polars_dtype_to_np_temporal_dtype(py, s.dtype());

let phys = s.to_physical_repr();
let ca = phys.i64().unwrap();
let slice = ca.data_views().next().unwrap();
let dims = [s.len()].into_dimension();
let owner = PySeries::from(s.clone()).into_py(py); // Keep the Series memory alive.
let flags = if writable {
flags::NPY_ARRAY_FARRAY
} else {
flags::NPY_ARRAY_FARRAY_RO
};

unsafe {
create_borrowed_np_array::<_>(
py,
np_dtype,
dims,
flags::NPY_ARRAY_FARRAY_RO,
flags,
slice.as_ptr() as _,
owner,
PySeries::from(s).into_py(py), // Keep the Series memory alive.,
)
}
}
Expand Down Expand Up @@ -148,17 +209,17 @@ fn polars_dtype_to_np_temporal_dtype<'a>(
_ => panic!("only Datetime/Duration inputs supported, got {}", dtype),
}
}
fn array_series_to_numpy_view(py: Python, s: &Series, allow_nulls: bool) -> Option<PyObject> {
/// Create a NumPy view of an Array Series.
fn array_series_to_numpy_view(py: Python, s: &Series, writable: bool) -> PyObject {
let ca = s.array().unwrap();
let s_inner = ca.get_inner();
let np_array_flat = series_to_numpy_view(py, &s_inner, allow_nulls)?;
let np_array_flat = series_to_numpy_view_recursive(py, s_inner, writable);

// Reshape to the original shape.
let DataType::Array(_, width) = s.dtype() else {
unreachable!()
};
let view = reshape_numpy_array(py, np_array_flat, ca.len(), *width);
Some(view)
reshape_numpy_array(py, np_array_flat, ca.len(), *width)
}
/// Reshape the first dimension of a NumPy array to the given height and width.
pub(crate) fn reshape_numpy_array(
Expand All @@ -174,7 +235,7 @@ pub(crate) fn reshape_numpy_array(
.unwrap();

if shape.len() == 1 {
// In this case we can avoid allocating a Vec.
// In this case, we can avoid allocating a Vec.
let new_shape = (height, width);
arr.call_method1(py, intern!(py, "reshape"), new_shape)
.unwrap()
Expand Down
1 change: 1 addition & 0 deletions py-polars/tests/benchmark/interop/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Benchmark tests for conversions from/to other data formats."""
53 changes: 53 additions & 0 deletions py-polars/tests/benchmark/interop/test_numpy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Benchmark tests for conversions from/to NumPy."""

from __future__ import annotations

from typing import Any

import numpy as np
import pytest

import polars as pl

pytestmark = pytest.mark.benchmark()


@pytest.fixture(scope="module")
def floats_array() -> np.ndarray[Any, Any]:
    """Return a module-scoped array of 10_000 normally distributed floats."""
    size = 10_000
    return np.random.randn(size)


@pytest.fixture()
def floats(floats_array: np.ndarray[Any, Any]) -> pl.Series:
    """Return the float data as a Series built directly from the NumPy array."""
    series = pl.Series(floats_array)
    return series


@pytest.fixture()
def floats_with_nulls(floats: pl.Series) -> pl.Series:
    """Return the float Series with values masked out where a random draw is too low."""
    null_probability = 0.1
    draws = pl.Series(np.random.uniform(size=floats.len()))
    # Keep a value only where its uniform draw exceeds the null probability.
    keep = draws > null_probability
    masked = pl.when(keep).then(floats)
    return pl.select(masked).to_series()


@pytest.fixture()
def floats_chunked(floats_array: np.ndarray[Any, Any]) -> pl.Series:
    """Return the float data as a Series made up of several chunks.

    The array is split into `n_chunks` pieces that are concatenated without
    rechunking, so the resulting Series holds one chunk per piece.
    """
    n_chunks = 5
    # `array_split` covers every element even when the length is not an exact
    # multiple of `n_chunks`; fixed-size slicing would drop the remainder.
    chunks = np.array_split(floats_array, n_chunks)
    # Copy each piece before wrapping it, presumably so that the chunks do not
    # share the source array's buffer.
    chunks_copy = [pl.Series(c.copy()) for c in chunks]
    return pl.concat(chunks_copy, rechunk=False)


def test_to_numpy_series_zero_copy(floats: pl.Series) -> None:
    """Benchmark converting a single-chunk float Series without nulls."""
    _ = floats.to_numpy(use_pyarrow=False)


def test_to_numpy_series_with_nulls(floats_with_nulls: pl.Series) -> None:
    """Benchmark converting a float Series that contains nulls."""
    _ = floats_with_nulls.to_numpy(use_pyarrow=False)


def test_to_numpy_series_chunked(floats_chunked: pl.Series) -> None:
    """Benchmark converting a float Series that consists of multiple chunks."""
    _ = floats_chunked.to_numpy(use_pyarrow=False)
21 changes: 21 additions & 0 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,27 @@ def test_to_numpy_chunked() -> None:

assert result.tolist() == s.to_list()
assert result.dtype == np.int64
assert result.flags.writeable is True
assert_allow_copy_false_raises(s)

# Check that writing to the array doesn't change the original data
result[0] = 10
assert result.tolist() == [10, 2, 3, 4]
assert s.to_list() == [1, 2, 3, 4]


def test_to_numpy_chunked_temporal_nested() -> None:
    """Check conversion of a chunked Array-of-Datetime Series.

    The result must match the Series values, have dtype `datetime64[us]`,
    keep the (4, 1) shape, and be writable; `allow_copy=False` must raise.
    """
    dtype = pl.Array(pl.Datetime("us"), 1)
    first = pl.Series([[datetime(2020, 1, 1)], [datetime(2021, 1, 1)]], dtype=dtype)
    second = pl.Series([[datetime(2022, 1, 1)], [datetime(2023, 1, 1)]], dtype=dtype)
    # Concatenate without rechunking so the Series keeps two chunks.
    s = pl.concat([first, second], rechunk=False)

    result = s.to_numpy(use_pyarrow=False)

    expected_values = s.to_list()
    assert result.tolist() == expected_values
    assert result.dtype == np.dtype("datetime64[us]")
    assert result.shape == (4, 1)
    assert result.flags.writeable is True
    assert_allow_copy_false_raises(s)


Expand Down