feat(python)!: Properly apply strict parameter in Series constructor (pola-rs#16939)
Wouittone · Jun 22, 2024 · 6ac90b5 · 6ac90b5
1 parent 32c8516
commit 6ac90b5
Show file tree

Hide file tree

Showing 22 changed files with 81 additions and 105 deletions.
diff --git a/crates/polars-core/src/datatypes/dtype.rs b/crates/polars-core/src/datatypes/dtype.rs
@@ -53,9 +53,9 @@ pub enum DataType {
     Int64,
     Float32,
     Float64,
-    #[cfg(feature = "dtype-decimal")]
     /// Fixed point decimal type optional precision and non-negative scale.
     /// This is backed by a signed 128-bit integer which allows for up to 38 significant digits.
+    #[cfg(feature = "dtype-decimal")]
     Decimal(Option<usize>, Option<usize>), // precision/scale; scale being None means "infer"
     /// String data
     String,
@@ -76,14 +76,14 @@ pub enum DataType {
     Array(Box<DataType>, usize),
     /// A nested list with a variable size in each row
     List(Box<DataType>),
-    #[cfg(feature = "object")]
     /// A generic type that can be used in a `Series`
     /// &'static str can be used to determine/set inner type
+    #[cfg(feature = "object")]
     Object(&'static str, Option<Arc<ObjectRegistry>>),
     Null,
-    #[cfg(feature = "dtype-categorical")]
     // The RevMapping has the internal state.
     // This is ignored with comparisons, hashing etc.
+    #[cfg(feature = "dtype-categorical")]
     Categorical(Option<Arc<RevMapping>>, CategoricalOrdering),
     #[cfg(feature = "dtype-categorical")]
     Enum(Option<Arc<RevMapping>>, CategoricalOrdering),
@@ -140,6 +140,7 @@ impl PartialEq for DataType {
                     (UnknownKind::Int(_), UnknownKind::Int(_)) => true,
                     _ => l == r,
                 },
+                // TODO: Add Decimal equality
                 _ => std::mem::discriminant(self) == std::mem::discriminant(other),
             }
         }

diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs
@@ -60,8 +60,17 @@ impl Series {
         let dtype = if strict {
             get_first_non_null_dtype(values)
         } else {
+            // Currently does not work correctly for Decimal because equality is not implemented.
             any_values_to_supertype(values)?
         };
+
+        // TODO: Remove this when Decimal data type equality is implemented.
+        #[cfg(feature = "dtype-decimal")]
+        if !strict && dtype.is_decimal() {
+            let dtype = DataType::Decimal(None, None);
+            return Self::from_any_values_and_dtype(name, values, &dtype, strict);
+        }
+
         Self::from_any_values_and_dtype(name, values, &dtype, strict)
     }
 

diff --git a/docs/src/python/user-guide/getting-started/expressions.py b/docs/src/python/user-guide/getting-started/expressions.py
@@ -15,7 +15,7 @@
             datetime(2025, 12, 4),
             datetime(2025, 12, 5),
         ],
-        "d": [1, 2.0, float("nan"), -42, None],
+        "d": [1.0, 2.0, float("nan"), -42.0, None],
     }
 )
 # --8<-- [end:setup]

diff --git a/docs/src/python/user-guide/getting-started/joins.py b/docs/src/python/user-guide/getting-started/joins.py
@@ -9,7 +9,7 @@
     {
         "a": range(8),
         "b": np.random.rand(8),
-        "d": [1, 2.0, float("nan"), float("nan"), 0, -5, -42, None],
+        "d": [1.0, 2.0, float("nan"), float("nan"), 0.0, -5.0, -42.0, None],
     }
 )
 

diff --git a/py-polars/polars/_utils/construction/series.py b/py-polars/polars/_utils/construction/series.py
@@ -2,7 +2,6 @@
 
 import contextlib
 from datetime import date, datetime, time, timedelta
-from decimal import Decimal as PyDecimal
 from itertools import islice
 from typing import (
     TYPE_CHECKING,
@@ -27,8 +26,6 @@
 )
 from polars._utils.wrap import wrap_s
 from polars.datatypes import (
-    INTEGER_DTYPES,
-    TEMPORAL_DTYPES,
     Array,
     Boolean,
     Categorical,
@@ -293,44 +290,20 @@ def _construct_series_with_fallbacks(
     constructor: Callable[[str, Sequence[Any], bool], PySeries],
     name: str,
     values: Sequence[Any],
-    target_dtype: PolarsDataType | None,
+    dtype: PolarsDataType | None,
     *,
     strict: bool,
 ) -> PySeries:
     """Construct Series, with fallbacks for basic type mismatch (eg: bool/int)."""
-    while True:
-        try:
-            return constructor(name, values, strict)
-        except TypeError as exc:
-            str_exc = str(exc)
-
-            # from x to float
-            # error message can be:
-            #   - integers: "'float' object cannot be interpreted as an integer"
-            if "'float'" in str_exc and (
-                # we do not accept float values as int/temporal, as it causes silent
-                # information loss; the caller should explicitly cast in this case.
-                target_dtype not in (INTEGER_DTYPES | TEMPORAL_DTYPES)
-            ):
-                constructor = py_type_to_constructor(float)
-
-            # from x to string
-            # error message can be:
-            #   - integers: "'str' object cannot be interpreted as an integer"
-            #   - floats: "must be real number, not str"
-            elif "'str'" in str_exc or str_exc == "must be real number, not str":
-                constructor = py_type_to_constructor(str)
-
-            # from x to int
-            # error message can be:
-            #   - bools: "'int' object cannot be converted to 'PyBool'"
-            elif str_exc == "'int' object cannot be converted to 'PyBool'":
-                constructor = py_type_to_constructor(int)
-
-            elif "decimal.Decimal" in str_exc:
-                constructor = py_type_to_constructor(PyDecimal)
-            else:
-                raise
+    try:
+        return constructor(name, values, strict)
+    except TypeError:
+        if dtype is None:
+            return PySeries.new_from_any_values(name, values, strict=strict)
+        else:
+            return PySeries.new_from_any_values_and_dtype(
+                name, values, dtype, strict=strict
+            )
 
 
 def iterable_to_pyseries(

diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py
@@ -9487,7 +9487,7 @@ def fold(self, operation: Callable[[Series, Series], Series]) -> Series:
 
         >>> df = pl.DataFrame(
         ...     {
-        ...         "a": ["foo", "bar", 2],
+        ...         "a": ["foo", "bar", None],
         ...         "b": [1, 2, 3],
         ...         "c": [1.0, 2.0, 3.0],
         ...     }

diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py
@@ -2964,7 +2964,7 @@ def max(self) -> Self:
 
         Examples
         --------
-        >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
+        >>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]})
         >>> df.select(pl.col("a").max())
         shape: (1, 1)
         ┌─────┐
@@ -2983,7 +2983,7 @@ def min(self) -> Self:
 
         Examples
         --------
-        >>> df = pl.DataFrame({"a": [-1, float("nan"), 1]})
+        >>> df = pl.DataFrame({"a": [-1.0, float("nan"), 1.0]})
         >>> df.select(pl.col("a").min())
         shape: (1, 1)
         ┌──────┐
@@ -3005,7 +3005,7 @@ def nan_max(self) -> Self:
 
         Examples
         --------
-        >>> df = pl.DataFrame({"a": [0, float("nan")]})
+        >>> df = pl.DataFrame({"a": [0.0, float("nan")]})
         >>> df.select(pl.col("a").nan_max())
         shape: (1, 1)
         ┌─────┐
@@ -3027,7 +3027,7 @@ def nan_min(self) -> Self:
 
         Examples
         --------
-        >>> df = pl.DataFrame({"a": [0, float("nan")]})
+        >>> df = pl.DataFrame({"a": [0.0, float("nan")]})
         >>> df.select(pl.col("a").nan_min())
         shape: (1, 1)
         ┌─────┐

diff --git a/py-polars/polars/io/spreadsheet/functions.py b/py-polars/polars/io/spreadsheet/functions.py
@@ -777,7 +777,7 @@ def _read_spreadsheet_openpyxl(
                 # the non-strings will become null, so we handle the cast here
                 values = [str(v) if (v is not None) else v for v in values]
 
-            s = pl.Series(name, values, dtype=dtype)
+            s = pl.Series(name, values, dtype=dtype, strict=False)
             series_data.append(s)
 
     df = pl.DataFrame(

diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py
@@ -1958,7 +1958,7 @@ def nan_max(self) -> int | float | date | datetime | timedelta | str:
         >>> s.nan_max()
         4
 
-        >>> s = pl.Series("a", [1, float("nan"), 4])
+        >>> s = pl.Series("a", [1.0, float("nan"), 4.0])
         >>> s.nan_max()
         nan
         """
@@ -1977,7 +1977,7 @@ def nan_min(self) -> int | float | date | datetime | timedelta | str:
         >>> s.nan_min()
         1
 
-        >>> s = pl.Series("a", [1, float("nan"), 4])
+        >>> s = pl.Series("a", [1.0, float("nan"), 4.0])
         >>> s.nan_min()
         nan
         """
@@ -4730,7 +4730,7 @@ def fill_nan(self, value: int | float | Expr | None) -> Series:
 
         Examples
         --------
-        >>> s = pl.Series("a", [1, 2, 3, float("nan")])
+        >>> s = pl.Series("a", [1.0, 2.0, 3.0, float("nan")])
         >>> s.fill_nan(0)
         shape: (4,)
         Series: 'a' [f64]

diff --git a/py-polars/src/series/construction.rs b/py-polars/src/series/construction.rs
@@ -98,7 +98,7 @@ impl PySeries {
 #[pymethods]
 impl PySeries {
     #[staticmethod]
-    fn new_opt_bool(name: &str, values: &Bound<PyAny>, strict: bool) -> PyResult<Self> {
+    fn new_opt_bool(name: &str, values: &Bound<PyAny>, _strict: bool) -> PyResult<Self> {
         let len = values.len()?;
         let mut builder = BooleanChunkedBuilder::new(name, len);
 
@@ -107,25 +107,18 @@ impl PySeries {
             if value.is_none() {
                 builder.append_null()
             } else {
-                match value.extract::<bool>() {
-                    Ok(v) => builder.append_value(v),
-                    Err(e) => {
-                        if strict {
-                            return Err(e);
-                        }
-                        builder.append_null()
-                    },
-                }
+                let v = value.extract::<bool>()?;
+                builder.append_value(v)
             }
         }
-        let ca = builder.finish();
 
+        let ca = builder.finish();
         let s = ca.into_series();
         Ok(s.into())
     }
 }
 
-fn new_primitive<'a, T>(name: &str, values: &'a Bound<PyAny>, strict: bool) -> PyResult<PySeries>
+fn new_primitive<'a, T>(name: &str, values: &'a Bound<PyAny>, _strict: bool) -> PyResult<PySeries>
 where
     T: PolarsNumericType,
     ChunkedArray<T>: IntoSeries,
@@ -139,19 +132,12 @@ where
         if value.is_none() {
             builder.append_null()
         } else {
-            match value.extract::<T::Native>() {
-                Ok(v) => builder.append_value(v),
-                Err(e) => {
-                    if strict {
-                        return Err(e);
-                    }
-                    builder.append_null()
-                },
-            }
+            let v = value.extract::<T::Native>()?;
+            builder.append_value(v)
         }
     }
-    let ca = builder.finish();
 
+    let ca = builder.finish();
     let s = ca.into_series();
     Ok(s.into())
 }
@@ -243,9 +229,11 @@ impl PySeries {
 
         for res in values.iter()? {
             let value = res?;
-            match value.extract::<Cow<str>>() {
-                Ok(v) => builder.append_value(v),
-                Err(_) => builder.append_null(),
+            if value.is_none() {
+                builder.append_null()
+            } else {
+                let v = value.extract::<Cow<str>>()?;
+                builder.append_value(v)
             }
         }
 
@@ -261,9 +249,11 @@ impl PySeries {
 
         for res in values.iter()? {
             let value = res?;
-            match value.extract::<&[u8]>() {
-                Ok(v) => builder.append_value(v),
-                Err(_) => builder.append_null(),
+            if value.is_none() {
+                builder.append_null()
+            } else {
+                let v = value.extract::<&[u8]>()?;
+                builder.append_value(v)
             }
         }
 
@@ -274,9 +264,7 @@ impl PySeries {
 
     #[staticmethod]
     fn new_decimal(name: &str, values: &Bound<PyAny>, strict: bool) -> PyResult<Self> {
-        // Create a fake dtype with a placeholder "none" scale, to be inferred later.
-        let dtype = DataType::Decimal(None, None);
-        Self::new_from_any_values_and_dtype(name, values, Wrap(dtype), strict)
+        Self::new_from_any_values(name, values, strict)
     }
 
     #[staticmethod]

diff --git a/py-polars/tests/unit/constructors/test_constructors.py b/py-polars/tests/unit/constructors/test_constructors.py
@@ -1113,7 +1113,7 @@ def test_from_dicts_list_struct_without_inner_dtype_5611() -> None:
 
 
 def test_from_dict_upcast_primitive() -> None:
-    df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]})
+    df = pl.from_dict({"a": [1, 2.1, 3], "b": [4, 5, 6.4]}, strict=False)
     assert df.dtypes == [pl.Float64, pl.Float64]
 
 

diff --git a/py-polars/tests/unit/constructors/test_dataframe.py b/py-polars/tests/unit/constructors/test_dataframe.py
@@ -105,10 +105,7 @@ def test_df_init_strict() -> None:
 
     df = pl.DataFrame(data, schema=schema, strict=False)
 
-    # TODO: This should result in a Float Series without nulls
-    # https://github.com/pola-rs/polars/issues/14427
-    assert df["a"].to_list() == [1, 2, None]
-
+    assert df["a"].to_list() == [1, 2, 3]
     assert df["a"].dtype == pl.Int8
 
 

diff --git a/py-polars/tests/unit/constructors/test_series.py b/py-polars/tests/unit/constructors/test_series.py
@@ -73,7 +73,10 @@ def test_sequence_of_series_with_dtype(dtype: pl.PolarsDataType | None) -> None:
 def test_upcast_primitive_and_strings(
     values: list[Any], dtype: pl.PolarsDataType, expected_dtype: pl.PolarsDataType
 ) -> None:
-    assert pl.Series(values, dtype=dtype).dtype == expected_dtype
+    with pytest.raises(TypeError):
+        pl.Series(values, dtype=dtype, strict=True)
+
+    assert pl.Series(values, dtype=dtype, strict=False).dtype == expected_dtype
 
 
 def test_preserve_decimal_precision() -> None:

diff --git a/py-polars/tests/unit/dataframe/test_getitem.py b/py-polars/tests/unit/dataframe/test_getitem.py
@@ -205,7 +205,7 @@ def test_df_getitem_col_mixed_inputs(input: list[Any], match: str) -> None:
 @pytest.mark.parametrize(
     ("input", "match"),
     [
-        ([0.0, 1.0], "'float' object cannot be interpreted as an integer"),
+        ([0.0, 1.0], "unexpected value while building Series of type Int64"),
         (
             pl.Series([[1, 2], [3, 4]]),
             "cannot treat Series of type List\\(Int64\\) as indices",

diff --git a/py-polars/tests/unit/dataframe/test_serde.py b/py-polars/tests/unit/dataframe/test_serde.py
@@ -78,7 +78,7 @@ def test_df_serde_enum() -> None:
     [
         ([[1, 2, 3], [None, None, None], [1, None, 3]], pl.Array(pl.Int32(), shape=3)),
         ([["a", "b"], [None, None]], pl.Array(pl.Utf8, shape=2)),
-        ([[True, False, None], [None, None, None]], pl.Array(pl.Utf8, shape=3)),
+        ([[True, False, None], [None, None, None]], pl.Array(pl.Boolean, shape=3)),
         (
             [[[1, 2, 3], [4, None, 5]], None, [[None, None, 2]]],
             pl.List(pl.Array(pl.Int32(), shape=3)),

diff --git a/py-polars/tests/unit/datatypes/test_decimal.py b/py-polars/tests/unit/datatypes/test_decimal.py
@@ -22,7 +22,7 @@ def permutations_int_dec_none() -> list[tuple[D | int | None, ...]]:
                 D("-0.01"),
                 D("1.2345678"),
                 D("500"),
-                # -1,  # TODO: Address in https://github.com/pola-rs/polars/issues/14427
+                -1,
                 None,
             ]
         )