From c40c6ec4667069d5ac0da328e2f569f2fb9d39ea Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Thu, 7 Mar 2024 17:19:36 -0500 Subject: [PATCH 1/3] Init working impl --- .../chunked_array/logical/categorical/mod.rs | 52 +++++++------------ py-polars/tests/unit/datatypes/test_enum.py | 28 +++++++++- 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 15fa1e994453..dbd13c7d29e5 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -130,18 +130,18 @@ impl CategoricalChunked { } } - // Convert to fixed enum. In case a value is not in the categories return Error - pub fn to_enum(&self, categories: &Utf8ViewArray, hash: u128) -> PolarsResult { + // Convert to fixed enum. Values not in categories are mapped to None. + pub fn to_enum(&self, categories: &Utf8ViewArray, hash: u128) -> Self { // Fast paths match self.get_rev_map().as_ref() { RevMapping::Local(_, cur_hash) if hash == *cur_hash => { return unsafe { - Ok(CategoricalChunked::from_cats_and_rev_map_unchecked( + CategoricalChunked::from_cats_and_rev_map_unchecked( self.physical().clone(), self.get_rev_map().clone(), true, self.get_ordering(), - )) + ) }; }, _ => (), @@ -159,34 +159,22 @@ impl CategoricalChunked { let new_phys: UInt32Chunked = self .physical() .into_iter() - .map(|opt_v: Option| { - let Some(v) = opt_v else { - return Ok(None); - }; - - let Some(idx) = idx_map.get(&v) else { - polars_bail!( - not_in_enum, - value = old_rev_map.get(v), - categories = &categories - ); - }; - - Ok(Some(*idx)) + .map(|opt_v: Option| match opt_v { + Some(v) => Ok(idx_map.get(&v).copied()), + None => Ok(None), }) - .collect::>()?; - - Ok( - // SAFETY: we created the physical from the enum categories - unsafe { - CategoricalChunked::from_cats_and_rev_map_unchecked( - new_phys, - Arc::new(RevMapping::Local(categories.clone(), hash)), - true, - self.get_ordering(), - ) - }, - ) + .collect::>() + .unwrap(); + + // SAFETY: we created the physical from the enum categories + unsafe { + CategoricalChunked::from_cats_and_rev_map_unchecked( + new_phys, + Arc::new(RevMapping::Local(categories.clone(), hash)), + true, + self.get_ordering(), + ) + } } pub(crate) fn get_flags(&self) -> Settings { @@ -373,7 +361,7 @@ impl LogicalType for CategoricalChunked { polars_bail!(ComputeError: "can not cast to enum with global mapping") }; Ok(self - .to_enum(categories, *hash)? + .to_enum(categories, *hash) .set_ordering(*ordering, true) .into_series() .with_name(self.name())) diff --git a/py-polars/tests/unit/datatypes/test_enum.py b/py-polars/tests/unit/datatypes/test_enum.py index 07cd6e7c437b..b60a15de293f 100644 --- a/py-polars/tests/unit/datatypes/test_enum.py +++ b/py-polars/tests/unit/datatypes/test_enum.py @@ -117,6 +117,26 @@ def test_casting_to_an_enum_from_categorical() -> None: assert_series_equal(s2, expected) +def test_casting_to_an_enum_from_categorical_nonstrict() -> None: + dtype = pl.Enum(["a", "b"]) + s = pl.Series([None, "a", "b", "c"], dtype=pl.Categorical) + s2 = s.cast(dtype, strict=False) + assert s2.dtype == dtype + assert s2.null_count() == 2 # "c" mapped to null + expected = pl.Series([None, "a", "b", None], dtype=dtype) + assert_series_equal(s2, expected) + + +def test_casting_to_an_enum_from_enum_nonstrict() -> None: + dtype = pl.Enum(["a", "b"]) + s = pl.Series([None, "a", "b", "c"], dtype=pl.Enum(["a", "b", "c"])) + s2 = s.cast(dtype, strict=False) + assert s2.dtype == dtype + assert s2.null_count() == 2 # "c" mapped to null + expected = pl.Series([None, "a", "b", None], dtype=dtype) + assert_series_equal(s2, expected) + + def test_casting_to_an_enum_from_integer() -> None: dtype = pl.Enum(["a", "b", "c"]) expected = pl.Series([None, "b", "a", "c"], dtype=dtype) @@ -139,7 +159,9 @@ def test_casting_to_an_enum_oob_from_integer() -> None: def test_casting_to_an_enum_from_categorical_nonexistent() -> None: with pytest.raises( pl.ComputeError, - match=("value 'c' is not present in Enum"), + match=( + r"conversion from `cat` to `enum` failed in column '' for 1 out of 4 values: \[\"c\"\]" + ), ): pl.Series([None, "a", "b", "c"], dtype=pl.Categorical).cast(pl.Enum(["a", "b"])) @@ -159,7 +181,9 @@ def test_casting_to_an_enum_from_global_categorical() -> None: def test_casting_to_an_enum_from_global_categorical_nonexistent() -> None: with pytest.raises( pl.ComputeError, - match=("value 'c' is not present in Enum"), + match=( + r"conversion from `cat` to `enum` failed in column '' for 1 out of 4 values: \[\"c\"\]" + ), ): pl.Series([None, "a", "b", "c"], dtype=pl.Categorical).cast(pl.Enum(["a", "b"])) From 42253cd1f19d8a5528e064e57296b54ae48bbb0b Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Fri, 8 Mar 2024 09:39:41 -0500 Subject: [PATCH 2/3] Remove result indirection --- .../src/chunked_array/logical/categorical/mod.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index dbd13c7d29e5..423fc2b04628 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -160,11 +160,10 @@ impl CategoricalChunked { .physical() .into_iter() .map(|opt_v: Option| match opt_v { - Some(v) => Ok(idx_map.get(&v).copied()), - None => Ok(None), + Some(v) => idx_map.get(&v).copied(), + None => None, }) - .collect::>() - .unwrap(); + .collect(); // SAFETY: we created the physical from the enum categories unsafe { From 3270c584b5b57b9d22481ffba01591bafe471929 Mon Sep 17 00:00:00 2001 From: Marshall Crumiller Date: Fri, 8 Mar 2024 09:45:33 -0500 Subject: [PATCH 3/3] Simplify --- .../polars-core/src/chunked_array/logical/categorical/mod.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs index 423fc2b04628..959c1f5ec666 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/mod.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/mod.rs @@ -159,10 +159,7 @@ impl CategoricalChunked { let new_phys: UInt32Chunked = self .physical() .into_iter() - .map(|opt_v: Option| match opt_v { - Some(v) => idx_map.get(&v).copied(), - None => None, - }) + .map(|opt_v: Option| opt_v.and_then(|v| idx_map.get(&v).copied())) .collect(); // SAFETY: we created the physical from the enum categories