Skip to content

Commit

Permalink
feat(python)!: Properly apply strict parameter in Series constructor (
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Jun 15, 2024
1 parent 0d82664 commit 6b36e9b
Show file tree
Hide file tree
Showing 22 changed files with 81 additions and 105 deletions.
7 changes: 4 additions & 3 deletions crates/polars-core/src/datatypes/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ pub enum DataType {
Int64,
Float32,
Float64,
#[cfg(feature = "dtype-decimal")]
/// Fixed point decimal type optional precision and non-negative scale.
/// This is backed by a signed 128-bit integer which allows for up to 38 significant digits.
#[cfg(feature = "dtype-decimal")]
Decimal(Option<usize>, Option<usize>), // precision/scale; scale being None means "infer"
/// String data
String,
Expand All @@ -76,14 +76,14 @@ pub enum DataType {
Array(Box<DataType>, usize),
/// A nested list with a variable size in each row
List(Box<DataType>),
#[cfg(feature = "object")]
/// A generic type that can be used in a `Series`
/// &'static str can be used to determine/set inner type
#[cfg(feature = "object")]
Object(&'static str, Option<Arc<ObjectRegistry>>),
Null,
#[cfg(feature = "dtype-categorical")]
// The RevMapping has the internal state.
// This is ignored with comparisons, hashing etc.
#[cfg(feature = "dtype-categorical")]
Categorical(Option<Arc<RevMapping>>, CategoricalOrdering),
#[cfg(feature = "dtype-categorical")]
Enum(Option<Arc<RevMapping>>, CategoricalOrdering),
Expand Down Expand Up @@ -140,6 +140,7 @@ impl PartialEq for DataType {
(UnknownKind::Int(_), UnknownKind::Int(_)) => true,
_ => l == r,
},
// TODO: Add Decimal equality
_ => std::mem::discriminant(self) == std::mem::discriminant(other),
}
}
Expand Down
9 changes: 9 additions & 0 deletions crates/polars-core/src/series/any_value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,17 @@ impl Series {
let dtype = if strict {
get_first_non_null_dtype(values)
} else {
// Currently does not work correctly for Decimal because equality is not implemented.
any_values_to_supertype(values)?
};

// TODO: Remove this when Decimal data type equality is implemented.
#[cfg(feature = "dtype-decimal")]
if !strict && dtype.is_decimal() {
let dtype = DataType::Decimal(None, None);
return Self::from_any_values_and_dtype(name, values, &dtype, strict);
}

Self::from_any_values_and_dtype(name, values, &dtype, strict)
}

Expand Down
2 changes: 1 addition & 1 deletion docs/src/python/user-guide/getting-started/expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
datetime(2025, 12, 4),
datetime(2025, 12, 5),
],
"d": [1, 2.0, float("nan"), -42, None],
"d": [1.0, 2.0, float("nan"), -42.0, None],
}
)
# --8<-- [end:setup]
Expand Down
2 changes: 1 addition & 1 deletion docs/src/python/user-guide/getting-started/joins.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
{
"a": range(8),
"b": np.random.rand(8),
"d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
"d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
}
)

Expand Down
47 changes: 10 additions & 37 deletions py-polars/polars/_utils/construction/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import contextlib
from datetime import date, datetime, time, timedelta
from decimal import Decimal as PyDecimal
from itertools import islice
from typing import (
TYPE_CHECKING,
Expand All @@ -27,8 +26,6 @@
)
from polars._utils.wrap import wrap_s
from polars.datatypes import (
INTEGER_DTYPES,
TEMPORAL_DTYPES,
Array,
Boolean,
Categorical,
Expand Down Expand Up @@ -293,44 +290,20 @@ def _construct_series_with_fallbacks(
constructor: Callable[[str, Sequence[Any], bool], PySeries],
name: str,
values: Sequence[Any],
target_dtype: PolarsDataType | None,
dtype: PolarsDataType | None,
*,
strict: bool,
) -> PySeries:
"""Construct Series, with fallbacks for basic type mismatch (eg: bool/int)."""
while True:
try:
return constructor(name, values, strict)
except TypeError as exc:
str_exc = str(exc)

# from x to float
# error message can be:
# - integers: "'float' object cannot be interpreted as an integer"
if "'float'" in str_exc and (
# we do not accept float values as int/temporal, as it causes silent
# information loss; the caller should explicitly cast in this case.
target_dtype not in (INTEGER_DTYPES | TEMPORAL_DTYPES)
):
constructor = py_type_to_constructor(float)

# from x to string
# error message can be:
# - integers: "'str' object cannot be interpreted as an integer"
# - floats: "must be real number, not str"
elif "'str'" in str_exc or str_exc == "must be real number, not str":
constructor = py_type_to_constructor(str)

# from x to int
# error message can be:
# - bools: "'int' object cannot be converted to 'PyBool'"
elif str_exc == "'int' object cannot be converted to 'PyBool'":
constructor = py_type_to_constructor(int)

elif "decimal.Decimal" in str_exc:
constructor = py_type_to_constructor(PyDecimal)
else:
raise
try:
return constructor(name, values, strict)
except TypeError:
if dtype is None:
return PySeries.new_from_any_values(name, values, strict=strict)
else:
return PySeries.new_from_any_values_and_dtype(
name, values, dtype, strict=strict
)


def iterable_to_pyseries(
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9487,7 +9487,7 @@ def fold(self, operation: Callable[[Series, Series], Series]) -> Series:
>>> df = pl.DataFrame(
... {
... "a": ["foo", "bar", 2],
... "a": ["foo", "bar", None],
... "b": [1, 2, 3],
... "c": [1.0, 2.0, 3.0],
... }
Expand Down
8 changes: 4 additions & 4 deletions py-polars/polars/expr/expr.py
Original file line number Diff line number Diff line change
Expand Up @@ -2964,7 +2964,7 @@ def max(self) -> Self:
Examples
--------
>>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
>>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]})
>>> df.select(pl.col("a").max())
shape: (1, 1)
┌─────┐
Expand All @@ -2983,7 +2983,7 @@ def min(self) -> Self:
Examples
--------
>>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
>>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]})
>>> df.select(pl.col("a").min())
shape: (1, 1)
┌──────┐
Expand All @@ -3005,7 +3005,7 @@ def nan_max(self) -> Self:
Examples
--------
>>> df = pl.DataFrame({"a": [0, float("nan")]})
>>> df = pl.DataFrame({"a": [0.0, float("nan")]})
>>> df.select(pl.col("a").nan_max())
shape: (1, 1)
┌─────┐
Expand All @@ -3027,7 +3027,7 @@ def nan_min(self) -> Self:
Examples
--------
>>> df = pl.DataFrame({"a": [0, float("nan")]})
>>> df = pl.DataFrame({"a": [0.0, float("nan")]})
>>> df.select(pl.col("a").nan_min())
shape: (1, 1)
┌─────┐
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/io/spreadsheet/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,7 @@ def _read_spreadsheet_openpyxl(
# the non-strings will become null, so we handle the cast here
values = [str(v) if (v is not None) else v for v in values]

s = pl.Series(name, values, dtype=dtype)
s = pl.Series(name, values, dtype=dtype, strict=False)
series_data.append(s)

df = pl.DataFrame(
Expand Down
6 changes: 3 additions & 3 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1958,7 +1958,7 @@ def nan_max(self) -> int | float | date | datetime | timedelta | str:
>>> s.nan_max()
4
>>> s = pl.Series("a", [1, float("nan"), 4])
>>> s = pl.Series("a", [1.0, float("nan"), 4.0])
>>> s.nan_max()
nan
"""
Expand All @@ -1977,7 +1977,7 @@ def nan_min(self) -> int | float | date | datetime | timedelta | str:
>>> s.nan_min()
1
>>> s = pl.Series("a", [1, float("nan"), 4])
>>> s = pl.Series("a", [1.0, float("nan"), 4.0])
>>> s.nan_min()
nan
"""
Expand Down Expand Up @@ -4730,7 +4730,7 @@ def fill_nan(self, value: int | float | Expr | None) -> Series:
Examples
--------
>>> s = pl.Series("a", [1, 2, 3, float("nan")])
>>> s = pl.Series("a", [1.0, 2.0, 3.0, float("nan")])
>>> s.fill_nan(0)
shape: (4,)
Series: 'a' [f64]
Expand Down
50 changes: 19 additions & 31 deletions py-polars/src/series/construction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ impl PySeries {
#[pymethods]
impl PySeries {
#[staticmethod]
fn new_opt_bool(name: &str, values: &Bound<PyAny>, strict: bool) -> PyResult<Self> {
fn new_opt_bool(name: &str, values: &Bound<PyAny>, _strict: bool) -> PyResult<Self> {
let len = values.len()?;
let mut builder = BooleanChunkedBuilder::new(name, len);

Expand All @@ -107,25 +107,18 @@ impl PySeries {
if value.is_none() {
builder.append_null()
} else {
match value.extract::<bool>() {
Ok(v) => builder.append_value(v),
Err(e) => {
if strict {
return Err(e);
}
builder.append_null()
},
}
let v = value.extract::<bool>()?;
builder.append_value(v)
}
}
let ca = builder.finish();

let ca = builder.finish();
let s = ca.into_series();
Ok(s.into())
}
}

fn new_primitive<'a, T>(name: &str, values: &'a Bound<PyAny>, strict: bool) -> PyResult<PySeries>
fn new_primitive<'a, T>(name: &str, values: &'a Bound<PyAny>, _strict: bool) -> PyResult<PySeries>
where
T: PolarsNumericType,
ChunkedArray<T>: IntoSeries,
Expand All @@ -139,19 +132,12 @@ where
if value.is_none() {
builder.append_null()
} else {
match value.extract::<T::Native>() {
Ok(v) => builder.append_value(v),
Err(e) => {
if strict {
return Err(e);
}
builder.append_null()
},
}
let v = value.extract::<T::Native>()?;
builder.append_value(v)
}
}
let ca = builder.finish();

let ca = builder.finish();
let s = ca.into_series();
Ok(s.into())
}
Expand Down Expand Up @@ -243,9 +229,11 @@ impl PySeries {

for res in values.iter()? {
let value = res?;
match value.extract::<Cow<str>>() {
Ok(v) => builder.append_value(v),
Err(_) => builder.append_null(),
if value.is_none() {
builder.append_null()
} else {
let v = value.extract::<Cow<str>>()?;
builder.append_value(v)
}
}

Expand All @@ -261,9 +249,11 @@ impl PySeries {

for res in values.iter()? {
let value = res?;
match value.extract::<&[u8]>() {
Ok(v) => builder.append_value(v),
Err(_) => builder.append_null(),
if value.is_none() {
builder.append_null()
} else {
let v = value.extract::<&[u8]>()?;
builder.append_value(v)
}
}

Expand All @@ -274,9 +264,7 @@ impl PySeries {

#[staticmethod]
fn new_decimal(name: &str, values: &Bound<PyAny>, strict: bool) -> PyResult<Self> {
// Create a fake dtype with a placeholder "none" scale, to be inferred later.
let dtype = DataType::Decimal(None, None);
Self::new_from_any_values_and_dtype(name, values, Wrap(dtype), strict)
Self::new_from_any_values(name, values, strict)
}

#[staticmethod]
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/constructors/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,7 +1113,7 @@ def test_from_dicts_list_struct_without_inner_dtype_5611() -> None:


def test_from_dict_upcast_primitive() -> None:
df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]})
df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]}, strict=False)
assert df.dtypes == [pl.Float64, pl.Float64]


Expand Down
5 changes: 1 addition & 4 deletions py-polars/tests/unit/constructors/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,7 @@ def test_df_init_strict() -> None:

df = pl.DataFrame(data, schema=schema, strict=False)

# TODO: This should result in a Float Series without nulls
# https://github.com/pola-rs/polars/issues/14427
assert df["a"].to_list() == [1, 2, None]

assert df["a"].to_list() == [1, 2, 3]
assert df["a"].dtype == pl.Int8


Expand Down
5 changes: 4 additions & 1 deletion py-polars/tests/unit/constructors/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,10 @@ def test_sequence_of_series_with_dtype(dtype: pl.PolarsDataType | None) -> None:
def test_upcast_primitive_and_strings(
values: list[Any], dtype: pl.PolarsDataType, expected_dtype: pl.PolarsDataType
) -> None:
assert pl.Series(values, dtype=dtype).dtype == expected_dtype
with pytest.raises(TypeError):
pl.Series(values, dtype=dtype, strict=True)

assert pl.Series(values, dtype=dtype, strict=False).dtype == expected_dtype


def test_preserve_decimal_precision() -> None:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/dataframe/test_getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def test_df_getitem_col_mixed_inputs(input: list[Any], match: str) -> None:
@pytest.mark.parametrize(
("input", "match"),
[
([0.0, 1.0], "'float' object cannot be interpreted as an integer"),
([0.0, 1.0], "unexpected value while building Series of type Int64"),
(
pl.Series([[1, 2], [3, 4]]),
"cannot treat Series of type List\\(Int64\\) as indices",
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/dataframe/test_serde.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_df_serde_enum() -> None:
[
([[1, 2, 3], [None, None, None], [1, None, 3]], pl.Array(pl.Int32(), shape=3)),
([["a", "b"], [None, None]], pl.Array(pl.Utf8, shape=2)),
([[True, False, None], [None, None, None]], pl.Array(pl.Utf8, shape=3)),
([[True, False, None], [None, None, None]], pl.Array(pl.Boolean, shape=3)),
(
[[[1, 2, 3], [4, None, 5]], None, [[None, None, 2]]],
pl.List(pl.Array(pl.Int32(), shape=3)),
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/datatypes/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def permutations_int_dec_none() -> list[tuple[D | int | None, ...]]:
D("-0.01"),
D("1.2345678"),
D("500"),
# -1, # TODO: Address in https://github.com/pola-rs/polars/issues/14427
-1,
None,
]
)
Expand Down
Loading

0 comments on commit 6b36e9b

Please sign in to comment.