diff --git a/crates/polars-core/src/chunked_array/metadata.rs b/crates/polars-core/src/chunked_array/metadata.rs deleted file mode 100644 index 194a7fb57043..000000000000 --- a/crates/polars-core/src/chunked_array/metadata.rs +++ /dev/null @@ -1,188 +0,0 @@ -use std::fmt; - -use bitflags::bitflags; -use polars_utils::IdxSize; -#[cfg(feature = "serde")] -use serde::{Deserialize, Serialize}; - -use super::PolarsDataType; -use crate::series::IsSorted; - -bitflags! { - #[derive(Default, Debug, Clone, Copy, PartialEq)] - pub struct MetadataProperties: u32 { - const SORTED = 0x01; - const FAST_EXPLODE_LIST = 0x02; - const MIN_VALUE = 0x04; - const MAX_VALUE = 0x08; - const DISTINCT_COUNT = 0x10; - } -} - -pub struct Metadata { - flags: MetadataFlags, - - min_value: Option, - max_value: Option, - - /// Number of unique non-null values - distinct_count: Option, -} - -bitflags! { - #[derive(Default, Debug, Clone, Copy, PartialEq)] - #[cfg_attr(feature = "serde", derive(Serialize, Deserialize), serde(transparent))] - pub struct MetadataFlags: u8 { - const SORTED_ASC = 0x01; - const SORTED_DSC = 0x02; - const FAST_EXPLODE_LIST = 0x04; - } -} - -impl MetadataFlags { - pub fn set_sorted_flag(&mut self, sorted: IsSorted) { - match sorted { - IsSorted::Not => { - self.remove(MetadataFlags::SORTED_ASC | MetadataFlags::SORTED_DSC); - }, - IsSorted::Ascending => { - self.remove(MetadataFlags::SORTED_DSC); - self.insert(MetadataFlags::SORTED_ASC) - }, - IsSorted::Descending => { - self.remove(MetadataFlags::SORTED_ASC); - self.insert(MetadataFlags::SORTED_DSC) - }, - } - } - - pub fn get_sorted_flag(&self) -> IsSorted { - if self.contains(MetadataFlags::SORTED_ASC) { - IsSorted::Ascending - } else if self.contains(MetadataFlags::SORTED_DSC) { - IsSorted::Descending - } else { - IsSorted::Not - } - } - - pub fn get_fast_explode_list(&self) -> bool { - self.contains(MetadataFlags::FAST_EXPLODE_LIST) - } -} - -impl Default for Metadata { - fn default() -> Self { - Self::DEFAULT - } -} - 
-impl Clone for Metadata { - fn clone(&self) -> Self { - Self { - flags: self.flags, - min_value: self.min_value.clone(), - max_value: self.max_value.clone(), - distinct_count: self.distinct_count, - } - } -} - -impl fmt::Debug for Metadata { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Metadata") - .field("flags", &self.flags) - .field("min_value", &self.min_value) - .field("max_value", &self.max_value) - .field("distinct_count", &self.distinct_count) - .finish() - } -} - -impl Metadata { - pub const DEFAULT: Metadata = Self { - flags: MetadataFlags::empty(), - - min_value: None, - max_value: None, - - distinct_count: None, - }; - - pub fn is_sorted_ascending(&self) -> bool { - self.flags.contains(MetadataFlags::SORTED_ASC) - } - - pub fn set_sorted_ascending(&mut self, value: bool) { - self.flags.set(MetadataFlags::SORTED_ASC, value); - } - - pub fn is_sorted_descending(&self) -> bool { - self.flags.contains(MetadataFlags::SORTED_DSC) - } - - pub fn set_sorted_descending(&mut self, value: bool) { - self.flags.set(MetadataFlags::SORTED_DSC, value); - } - - pub fn get_fast_explode_list(&self) -> bool { - self.flags.contains(MetadataFlags::FAST_EXPLODE_LIST) - } - - pub fn set_fast_explode_list(&mut self, value: bool) { - self.flags.set(MetadataFlags::FAST_EXPLODE_LIST, value); - } - - pub fn is_sorted(&self) -> IsSorted { - let ascending = self.is_sorted_ascending(); - let descending = self.is_sorted_descending(); - - match (ascending, descending) { - (true, false) => IsSorted::Ascending, - (false, true) => IsSorted::Descending, - (false, false) => IsSorted::Not, - (true, true) => unreachable!(), - } - } - - pub fn set_sorted_flag(&mut self, is_sorted: IsSorted) { - let (ascending, descending) = match is_sorted { - IsSorted::Ascending => (true, false), - IsSorted::Descending => (false, true), - IsSorted::Not => (false, false), - }; - - self.set_sorted_ascending(ascending); - self.set_sorted_descending(descending); - } - - pub fn 
set_distinct_count(&mut self, distinct_count: Option) { - self.distinct_count = distinct_count; - } - pub fn set_min_value(&mut self, min_value: Option) { - self.min_value = min_value; - } - pub fn set_max_value(&mut self, max_value: Option) { - self.max_value = max_value; - } - - pub fn set_flags(&mut self, flags: MetadataFlags) { - self.flags = flags; - } - - pub fn get_distinct_count(&self) -> Option { - self.distinct_count - } - - pub fn get_min_value(&self) -> Option<&T::OwnedPhysical> { - self.min_value.as_ref() - } - - pub fn get_max_value(&self) -> Option<&T::OwnedPhysical> { - self.max_value.as_ref() - } - - pub fn get_flags(&self) -> MetadataFlags { - self.flags - } -} diff --git a/crates/polars-core/src/chunked_array/metadata/collect.rs b/crates/polars-core/src/chunked_array/metadata/collect.rs new file mode 100644 index 000000000000..1d88ab87f18a --- /dev/null +++ b/crates/polars-core/src/chunked_array/metadata/collect.rs @@ -0,0 +1,36 @@ +use super::{Metadata, MetadataCollectable, MetadataEnv}; +use crate::chunked_array::{ChunkAgg, ChunkedArray, PolarsDataType, PolarsNumericType}; +use crate::series::IsSorted; + +impl MetadataCollectable for ChunkedArray +where + T: PolarsDataType, + T: PolarsNumericType, + ChunkedArray: ChunkAgg, +{ + fn collect_cheap_metadata(&mut self) { + if !MetadataEnv::extensive_use() { + return; + } + + if self.len() < 32 { + let (min, max) = self + .min_max() + .map_or((None, None), |(l, r)| (Some(l), Some(r))); + + let has_one_value = self.len() - self.null_count() == 1; + + let md = Metadata::DEFAULT + .sorted_opt(has_one_value.then_some(IsSorted::Ascending)) + .min_value_opt(min) + .max_value_opt(max) + .distinct_count_opt(has_one_value.then_some(1)); + + if !md.is_empty() { + mdlog!("Initializing cheap metadata"); + } + + self.merge_metadata(md); + } + } +} diff --git a/crates/polars-core/src/chunked_array/metadata/env.rs b/crates/polars-core/src/chunked_array/metadata/env.rs new file mode 100644 index 
000000000000..605ff5dd00df
--- /dev/null
+++ b/crates/polars-core/src/chunked_array/metadata/env.rs
@@ -0,0 +1,151 @@
+/// Process-wide switches that control whether metadata is used and logged.
+///
+/// Debug builds read the switches from the `POLARS_METADATA_FLAGS` environment variable; release
+/// builds answer every query with a constant.
+#[derive(Debug, Clone, Copy)]
+pub struct MetadataEnv(#[cfg(debug_assertions)] u32);
+
+#[cfg(debug_assertions)]
+impl MetadataEnv {
+    pub const ENABLED: u32 = 0x1;
+    pub const EXTENSIVE_USE: u32 = 0x2;
+    pub const LOG: u32 = 0x4;
+
+    /// Returns the flags as parsed by the first call; later changes to the variable are ignored.
+    #[inline(always)]
+    fn get_cached() -> Self {
+        static CACHED: std::sync::OnceLock<MetadataEnv> = std::sync::OnceLock::new();
+        *CACHED.get_or_init(Self::get)
+    }
+
+    /// Parses `POLARS_METADATA_FLAGS`.
+    ///
+    /// - unset: only [`Self::ENABLED`] is set
+    /// - `0`: nothing is set
+    /// - otherwise: a comma-separated list of `extensive` and `log` (case-insensitive, surrounding
+    ///   whitespace ignored) that is added on top of [`Self::ENABLED`]
+    ///
+    /// # Panics
+    ///
+    /// Panics when the list contains any other entry.
+    #[inline(always)]
+    fn get() -> Self {
+        let Ok(env) = std::env::var("POLARS_METADATA_FLAGS") else {
+            return Self(Self::ENABLED);
+        };
+
+        if env == "0" {
+            return Self(0);
+        }
+
+        // @NOTE
+        // We use a RwLock here so that we can mutate it for specific runs or sections of runs when
+        // we perform A/B tests.
+        static CACHED: std::sync::RwLock<Option<(String, MetadataEnv)>> =
+            std::sync::RwLock::new(None);
+
+        if let Some((cached_str, cached_value)) = CACHED.read().unwrap().as_ref() {
+            if cached_str == &env {
+                return *cached_value;
+            }
+        };
+
+        let mut mdenv = Self(Self::ENABLED);
+        for arg in env.split(',') {
+            match &arg.trim().to_lowercase()[..] {
+                "extensive" => mdenv.0 |= Self::EXTENSIVE_USE,
+                "log" => mdenv.0 |= Self::LOG,
+                _ => panic!("Invalid `POLARS_METADATA_FLAGS` environment variable"),
+            }
+        }
+
+        // Remember the result so that the next call with the same value skips the parsing above.
+        // The read guard is gone by now, so taking the write lock cannot deadlock.
+        *CACHED.write().unwrap() = Some((env, mdenv));
+
+        mdenv
+    }
+
+    /// `true` when metadata must neither be read nor written.
+    #[inline(always)]
+    pub fn disabled() -> bool {
+        !Self::enabled()
+    }
+
+    /// `true` unless `POLARS_METADATA_FLAGS` is `0`.
+    #[inline(always)]
+    pub fn enabled() -> bool {
+        Self::get().0 & Self::ENABLED != 0
+    }
+
+    /// `true` when the `log` flag was present the first time the flags were needed here.
+    #[inline(always)]
+    pub fn log() -> bool {
+        Self::get_cached().0 & Self::LOG != 0
+    }
+
+    /// `true` when the `extensive` flag is present.
+    #[inline(always)]
+    pub fn extensive_use() -> bool {
+        Self::get().0 & Self::EXTENSIVE_USE != 0
+    }
+
+    /// Returns the shared log file, creating `.polars-metadata.log` in the current directory on
+    /// first use.
+    ///
+    /// # Panics
+    ///
+    /// Panics when the file cannot be created.
+    pub fn logfile() -> &'static std::sync::Mutex<std::fs::File> {
+        static CACHED: std::sync::OnceLock<std::sync::Mutex<std::fs::File>> =
+            std::sync::OnceLock::new();
+        CACHED.get_or_init(|| {
+            std::sync::Mutex::new(std::fs::File::create(".polars-metadata.log").unwrap())
+        })
+    }
+}
+
+// Release builds: metadata is always enabled, logging and extensive use are compiled out.
+#[cfg(not(debug_assertions))]
+impl MetadataEnv {
+    #[inline(always)]
+    pub const fn disabled() -> bool {
+        false
+    }
+
+    #[inline(always)]
+    pub const fn enabled() -> bool {
+        true
+    }
+
+    #[inline(always)]
+    pub const fn log() -> bool {
+        false
+    }
+
+    #[inline(always)]
+    pub const fn extensive_use() -> bool {
+        false
+    }
+}
+
+/// Writes one formatted line to [`MetadataEnv::logfile`] in debug builds; in release builds the
+/// arguments are evaluated and discarded.
+macro_rules! mdlog {
+    ($s:literal$(, $arg:expr)* $(,)?) => {
+        #[cfg(debug_assertions)]
+        {
+            use std::io::Write;
+            let file = MetadataEnv::logfile();
+            writeln!(file.lock().unwrap(), $s$(, $arg)*).unwrap();
+        }
+
+        #[cfg(not(debug_assertions))]
+        {
+            _ = $s;
+            $(
+                _ = $arg;
+            )*
+        }
+    };
+}
diff --git a/crates/polars-core/src/chunked_array/metadata/mod.rs b/crates/polars-core/src/chunked_array/metadata/mod.rs
new file mode 100644
index 000000000000..6c3c174fe08f
--- /dev/null
+++ b/crates/polars-core/src/chunked_array/metadata/mod.rs
@@ -0,0 +1,443 @@
+use std::fmt;
+
+use bitflags::bitflags;
+use polars_utils::IdxSize;
+#[cfg(feature = "serde")]
+use serde::{Deserialize, Serialize};
+
+pub use self::env::MetadataEnv;
+use super::PolarsDataType;
+use crate::series::IsSorted;
+
+#[macro_use]
+mod env;
+mod collect;
+
+macro_rules!
mdenv_may_bail { + (get: $field:literal, $value:expr $(=> $default:expr)?) => {{ + if MetadataEnv::disabled() { + return $($default)?; + } + if MetadataEnv::log() { + mdlog!("Get: '{}' <- {:?}", $field, $value); + } + $value + }}; + (set: $field:literal, $value:expr) => { + if MetadataEnv::disabled() { + return; + } + if MetadataEnv::log() { + mdlog!("Set: '{}' <- {:?}", $field, $value); + } + }; + (init: $field:literal, $value:expr ; $default:expr) => {{ + if MetadataEnv::enabled() { + if MetadataEnv::log() { + mdlog!("Ini: '{}' <- {:?}", $field, $value); + } + $value + } else { + $default + } + }}; +} + +bitflags! { + #[derive(Default, Debug, Clone, Copy, PartialEq)] + pub struct MetadataProperties: u32 { + const SORTED = 0x01; + const FAST_EXPLODE_LIST = 0x02; + const MIN_VALUE = 0x04; + const MAX_VALUE = 0x08; + const DISTINCT_COUNT = 0x10; + } +} + +pub struct Metadata { + flags: MetadataFlags, + + min_value: Option, + max_value: Option, + + /// Number of unique non-null values + distinct_count: Option, +} + +pub trait MetadataCollectable: Sized { + fn collect_cheap_metadata(&mut self) {} + + #[inline(always)] + fn with_cheap_metadata(mut self) -> Self { + self.collect_cheap_metadata(); + self + } +} + +bitflags! 
{
+    #[derive(Default, Debug, Clone, Copy, PartialEq)]
+    #[cfg_attr(feature = "serde", derive(Serialize, Deserialize), serde(transparent))]
+    pub struct MetadataFlags: u8 {
+        const SORTED_ASC = 0x01;
+        const SORTED_DSC = 0x02;
+        const FAST_EXPLODE_LIST = 0x04;
+    }
+}
+
+impl MetadataFlags {
+    pub fn set_sorted_flag(&mut self, sorted: IsSorted) {
+        mdenv_may_bail!(set: "sorted", sorted); // returns early when metadata is disabled
+        match sorted {
+            IsSorted::Not => {
+                self.remove(MetadataFlags::SORTED_ASC | MetadataFlags::SORTED_DSC);
+            },
+            IsSorted::Ascending => {
+                self.remove(MetadataFlags::SORTED_DSC); // never leave both direction bits set
+                self.insert(MetadataFlags::SORTED_ASC)
+            },
+            IsSorted::Descending => {
+                self.remove(MetadataFlags::SORTED_ASC); // never leave both direction bits set
+                self.insert(MetadataFlags::SORTED_DSC)
+            },
+        }
+    }
+
+    pub fn get_sorted_flag(&self) -> IsSorted {
+        let sorted = if self.contains(MetadataFlags::SORTED_ASC) {
+            IsSorted::Ascending
+        } else if self.contains(MetadataFlags::SORTED_DSC) {
+            IsSorted::Descending
+        } else {
+            IsSorted::Not
+        };
+
+        mdenv_may_bail!(get: "sorted", sorted => IsSorted::Not) // `Not` while metadata is disabled
+    }
+
+    pub fn set_fast_explode_list(&mut self, fast_explode_list: bool) {
+        mdenv_may_bail!(set: "fast_explode_list", fast_explode_list);
+        self.set(Self::FAST_EXPLODE_LIST, fast_explode_list) // `false` must clear the bit, not set it
+    }
+
+    pub fn get_fast_explode_list(&self) -> bool {
+        let value = self.contains(MetadataFlags::FAST_EXPLODE_LIST);
+        mdenv_may_bail!(get: "fast_explode_list", value => false) // `false` while metadata is disabled
+    }
+}
+
+impl Default for Metadata {
+    fn default() -> Self {
+        Self::DEFAULT
+    }
+}
+
+impl Clone for Metadata {
+    fn clone(&self) -> Self {
+        Self {
+            flags: self.flags,
+            min_value: self.min_value.clone(),
+            max_value: self.max_value.clone(),
+            distinct_count: self.distinct_count,
+        }
+    }
+}
+
+impl fmt::Debug for Metadata {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("Metadata")
+            .field("flags", &self.flags)
+            .field("min_value", &self.min_value)
+            .field("max_value", &self.max_value)
+            .field("distinct_count", &self.distinct_count)
+            .finish()
+    }
+}
+
+pub enum MetadataMerge { + Keep, + Conflict, + New(Metadata), +} + +impl Metadata { + pub const DEFAULT: Metadata = Self { + flags: MetadataFlags::empty(), + + min_value: None, + max_value: None, + + distinct_count: None, + }; + + // Builder Pattern Methods + pub fn sorted(mut self, is_sorted: IsSorted) -> Self { + self.flags.set_sorted_flag(is_sorted); + self + } + pub fn fast_explode_list(mut self, fast_explode_list: bool) -> Self { + self.flags.set_fast_explode_list(fast_explode_list); + self + } + pub fn flags(mut self, flags: MetadataFlags) -> Self { + self.set_flags(flags); + self + } + pub fn min_value(mut self, min_value: T::OwnedPhysical) -> Self { + self.set_min_value(Some(min_value)); + self + } + pub fn max_value(mut self, max_value: T::OwnedPhysical) -> Self { + self.set_max_value(Some(max_value)); + self + } + pub fn distinct_count(mut self, distinct_count: IdxSize) -> Self { + self.set_distinct_count(Some(distinct_count)); + self + } + pub fn sorted_opt(self, is_sorted: Option) -> Self { + if let Some(is_sorted) = is_sorted { + self.sorted(is_sorted) + } else { + self + } + } + pub fn fast_explode_list_opt(self, fast_explode_list: Option) -> Self { + if let Some(fast_explode_list) = fast_explode_list { + self.fast_explode_list(fast_explode_list) + } else { + self + } + } + pub fn flags_opt(mut self, flags: Option) -> Self { + self.set_flags(flags.unwrap_or(MetadataFlags::empty())); + self + } + pub fn min_value_opt(mut self, min_value: Option) -> Self { + self.set_min_value(min_value); + self + } + pub fn max_value_opt(mut self, max_value: Option) -> Self { + self.set_max_value(max_value); + self + } + pub fn distinct_count_opt(mut self, distinct_count: Option) -> Self { + self.set_distinct_count(distinct_count); + self + } + + /// Create a [`Metadata`] with only the properties set in `props`. 
+ pub fn filter_props_cast(&self, props: MetadataProperties) -> Metadata { + if props.is_empty() { + return Metadata::DEFAULT; + } + + debug_assert!(!props.contains(P::MIN_VALUE)); + debug_assert!(!props.contains(P::MAX_VALUE)); + + use {MetadataFlags as F, MetadataProperties as P}; + + let sorted = if props.contains(P::SORTED) { + self.flags & (F::SORTED_ASC | F::SORTED_DSC) + } else { + F::empty() + }; + let fast_explode_list = if props.contains(P::FAST_EXPLODE_LIST) { + self.flags & F::FAST_EXPLODE_LIST + } else { + F::empty() + }; + + Metadata { + flags: sorted | fast_explode_list, + min_value: None, + max_value: None, + distinct_count: self + .distinct_count + .as_ref() + .cloned() + .filter(|_| props.contains(P::DISTINCT_COUNT)), + } + } + + /// Create a [`Metadata`] with only the properties set in `props`. + pub fn filter_props(&self, props: MetadataProperties) -> Self { + if props.is_empty() { + return Metadata::DEFAULT; + } + + use {MetadataFlags as F, MetadataProperties as P}; + + let sorted = if props.contains(P::SORTED) { + self.flags & (F::SORTED_ASC | F::SORTED_DSC) + } else { + F::empty() + }; + let fast_explode_list = if props.contains(P::FAST_EXPLODE_LIST) { + self.flags & F::FAST_EXPLODE_LIST + } else { + F::empty() + }; + + let min_value = self + .min_value + .as_ref() + .cloned() + .filter(|_| props.contains(P::MIN_VALUE)); + let max_value = self + .max_value + .as_ref() + .cloned() + .filter(|_| props.contains(P::MAX_VALUE)); + let distinct_count = self + .distinct_count + .as_ref() + .cloned() + .filter(|_| props.contains(P::DISTINCT_COUNT)); + + Self { + flags: mdenv_may_bail!(init: "flags", sorted | fast_explode_list ; MetadataFlags::empty()), + min_value: mdenv_may_bail!(init: "min_value", min_value ; None), + max_value: mdenv_may_bail!(init: "max_value", max_value ; None), + distinct_count: mdenv_may_bail!(init: "distinct_count", distinct_count ; None), + } + } + + /// Merge the maximum information from both [`Metadata`]s into one 
[`Metadata`]. + /// + /// It returns + /// - [`MetadataMerge::Keep`] if the `self` already contains all the information + /// - [`MetadataMerge::New(md)`][MetadataMerge::New] if we have learned new information + /// - [`MetadataMerge::Conflict`] if the two structures contain conflicting metadata + pub fn merge(&self, other: Self) -> MetadataMerge { + if MetadataEnv::disabled() || other.is_empty() { + return MetadataMerge::Keep; + } + + let sorted_conflicts = matches!( + (self.is_sorted(), other.is_sorted()), + (IsSorted::Ascending, IsSorted::Descending) + | (IsSorted::Descending, IsSorted::Ascending) + ); + + let is_conflict = sorted_conflicts + || matches!((self.get_min_value(), other.get_min_value()), (Some(x), Some(y)) if x != y) + || matches!((self.get_max_value(), other.get_max_value()), (Some(x), Some(y)) if x != y) + || matches!((self.get_distinct_count(), other.get_distinct_count()), (Some(x), Some(y)) if x != y); + + if is_conflict { + return MetadataMerge::Conflict; + } + + let is_new = (!self.get_fast_explode_list() && other.get_fast_explode_list()) + || (self.is_sorted() == IsSorted::Not && other.is_sorted() != IsSorted::Not) + || matches!( + (self.get_min_value(), other.get_min_value()), + (None, Some(_)) + ) + || matches!( + (self.get_max_value(), other.get_max_value()), + (None, Some(_)) + ) + || matches!( + (self.get_distinct_count(), other.get_distinct_count()), + (None, Some(_)) + ); + + if !is_new { + return MetadataMerge::Keep; + } + + let min_value = self.min_value.as_ref().cloned().or(other.min_value); + let max_value = self.max_value.as_ref().cloned().or(other.max_value); + let distinct_count = self.distinct_count.or(other.distinct_count); + + MetadataMerge::New(Metadata { + flags: mdenv_may_bail!(init: "flags", self.flags | other.flags ; MetadataFlags::empty()), + min_value: mdenv_may_bail!(init: "min_value", min_value ; None), + max_value: mdenv_may_bail!(init: "max_value", max_value ; None), + distinct_count: mdenv_may_bail!(init: 
"distinct_count", distinct_count ; None), + }) + } + + pub fn is_empty(&self) -> bool { + self.flags.is_empty() + && self.min_value.is_none() + && self.max_value.is_none() + && self.distinct_count.is_none() + } + + pub fn is_sorted_ascending(&self) -> bool { + self.flags.get_sorted_flag() == IsSorted::Ascending + } + + pub fn set_sorted_ascending(&mut self, value: bool) { + self.flags.set_sorted_flag(if value { + IsSorted::Ascending + } else { + IsSorted::Not + }); + } + + pub fn is_sorted_descending(&self) -> bool { + self.flags.get_sorted_flag() == IsSorted::Descending + } + + pub fn set_sorted_descending(&mut self, value: bool) { + self.flags.set_sorted_flag(if value { + IsSorted::Descending + } else { + IsSorted::Not + }); + } + + pub fn get_fast_explode_list(&self) -> bool { + self.flags.get_fast_explode_list() + } + + pub fn set_fast_explode_list(&mut self, value: bool) { + self.flags.set_fast_explode_list(value); + } + + pub fn is_sorted(&self) -> IsSorted { + self.flags.get_sorted_flag() + } + + pub fn set_sorted_flag(&mut self, is_sorted: IsSorted) { + self.flags.set_sorted_flag(is_sorted) + } + + pub fn set_min_value(&mut self, min_value: Option) { + mdenv_may_bail!(set: "min_value", min_value); + self.min_value = min_value; + } + pub fn set_max_value(&mut self, max_value: Option) { + mdenv_may_bail!(set: "max_value", max_value); + self.max_value = max_value; + } + pub fn set_distinct_count(&mut self, distinct_count: Option) { + mdenv_may_bail!(set: "distinct_count", distinct_count); + self.distinct_count = distinct_count; + } + + pub fn set_flags(&mut self, flags: MetadataFlags) { + mdenv_may_bail!(set: "flags", flags); + self.flags = flags; + } + + pub fn get_min_value(&self) -> Option<&T::OwnedPhysical> { + let min_value = self.min_value.as_ref(); + mdenv_may_bail!(get: "min_value", min_value => None) + } + pub fn get_max_value(&self) -> Option<&T::OwnedPhysical> { + let max_value = self.max_value.as_ref(); + mdenv_may_bail!(get: "max_value", max_value 
=> None) + } + pub fn get_distinct_count(&self) -> Option { + let distinct_count = self.distinct_count; + mdenv_may_bail!(get: "distinct_count", distinct_count => None) + } + pub fn get_flags(&self) -> MetadataFlags { + let flags = self.flags; + mdenv_may_bail!(get: "flags", flags => MetadataFlags::empty()) + } +} diff --git a/crates/polars-core/src/chunked_array/mod.rs b/crates/polars-core/src/chunked_array/mod.rs index d73d1ec61150..1654a6e2d42a 100644 --- a/crates/polars-core/src/chunked_array/mod.rs +++ b/crates/polars-core/src/chunked_array/mod.rs @@ -50,7 +50,7 @@ use std::slice::Iter; use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked; use arrow::legacy::prelude::*; -use self::metadata::{Metadata, MetadataFlags, MetadataProperties}; +use self::metadata::{Metadata, MetadataFlags, MetadataMerge, MetadataProperties}; use crate::series::IsSorted; use crate::utils::{first_non_null, last_non_null}; @@ -267,6 +267,21 @@ impl ChunkedArray { self.md.as_ref()?.get_distinct_count() } + pub fn merge_metadata(&mut self, md: Metadata) { + let Some(self_md) = self.metadata() else { + self.md = Some(Arc::new(md)); + return; + }; + + match self_md.merge(md) { + MetadataMerge::Keep => {}, + MetadataMerge::New(md) => self.md = Some(Arc::new(md)), + MetadataMerge::Conflict => { + panic!("Trying to merge metadata, but got conflicting information") + }, + } + } + /// Copies [`Metadata`] properties specified by `props` from `other` with different underlying [`PolarsDataType`] into /// `self`. /// @@ -295,26 +310,24 @@ impl ChunkedArray { "A MetadataProperty was not added to the copy_metadata_cast check" ); + debug_assert!(!props.contains(P::MIN_VALUE)); + debug_assert!(!props.contains(P::MAX_VALUE)); + // We add a fast path here for if both metadatas are empty, as this is quite a common case. 
- if props.is_empty() || (self.md.is_none() && other.md.is_none()) { + if props.is_empty() { return; } - debug_assert!(!props.contains(P::MIN_VALUE)); - debug_assert!(!props.contains(P::MAX_VALUE)); - - let md = Arc::make_mut(self.metadata_mut()); - let other_md = other.effective_metadata(); + let Some(other_md) = other.metadata() else { + return; + }; - if props.contains(P::SORTED) { - md.set_sorted_flag(other_md.is_sorted()); - } - if props.contains(P::FAST_EXPLODE_LIST) { - md.set_fast_explode_list(other_md.get_fast_explode_list()); - } - if props.contains(P::DISTINCT_COUNT) { - md.set_distinct_count(other_md.get_distinct_count()); + if other.is_empty() { + return; } + + let other_md = other_md.filter_props_cast(props); + self.merge_metadata(other_md); } /// Copies [`Metadata`] properties specified by `props` from `other` into `self`. @@ -337,41 +350,20 @@ impl ChunkedArray { ); // We add a fast path here for if both metadatas are empty, as this is quite a common case. - if props.is_empty() || (self.md.is_none() && other.md.is_none()) { + if props.is_empty() { return; } - // This checks whether we are okay to just clone the Arc. 
- if props.is_all() - || ((props.contains(P::SORTED) || self.is_sorted_flag() == other.is_sorted_flag()) - && (props.contains(P::FAST_EXPLODE_LIST) - || self.get_fast_explode_list() == other.get_fast_explode_list()) - && (props.contains(P::MIN_VALUE) || self.get_min_value() == other.get_min_value()) - && (props.contains(P::MAX_VALUE) || self.get_max_value() == other.get_max_value()) - && (props.contains(P::DISTINCT_COUNT) - || self.get_distinct_count() == other.get_distinct_count())) - { - self.md.clone_from(&other.md) - } - - let md = Arc::make_mut(self.metadata_mut()); - let other_md = other.effective_metadata(); + let Some(other_md) = other.metadata() else { + return; + }; - if props.contains(P::SORTED) { - md.set_sorted_flag(other_md.is_sorted()); - } - if props.contains(P::FAST_EXPLODE_LIST) { - md.set_fast_explode_list(other_md.get_fast_explode_list()); - } - if props.contains(P::MIN_VALUE) { - md.set_min_value(other_md.get_max_value().cloned()); - } - if props.contains(P::MAX_VALUE) { - md.set_max_value(other_md.get_min_value().cloned()); - } - if props.contains(P::DISTINCT_COUNT) { - md.set_distinct_count(other_md.get_distinct_count()); + if other.is_empty() { + return; } + + let other_md = other_md.filter_props(props); + self.merge_metadata(other_md); } /// Get the index of the first non null value in this [`ChunkedArray`]. diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index a3e721c3d281..f64ab7df3c7e 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -74,25 +74,47 @@ impl Series { dtype: &DataType, strict: bool, ) -> PolarsResult { + use crate::chunked_array::metadata::MetadataCollectable; + if values.is_empty() { return Ok(Self::new_empty(name, dtype)); } let mut s = match dtype { #[cfg(feature = "dtype-i8")] - DataType::Int8 => any_values_to_integer::(values, strict)?.into_series(), + DataType::Int8 => any_values_to_integer::(values, strict)? 
+ .with_cheap_metadata() + .into_series(), #[cfg(feature = "dtype-i16")] - DataType::Int16 => any_values_to_integer::(values, strict)?.into_series(), - DataType::Int32 => any_values_to_integer::(values, strict)?.into_series(), - DataType::Int64 => any_values_to_integer::(values, strict)?.into_series(), + DataType::Int16 => any_values_to_integer::(values, strict)? + .with_cheap_metadata() + .into_series(), + DataType::Int32 => any_values_to_integer::(values, strict)? + .with_cheap_metadata() + .into_series(), + DataType::Int64 => any_values_to_integer::(values, strict)? + .with_cheap_metadata() + .into_series(), #[cfg(feature = "dtype-u8")] - DataType::UInt8 => any_values_to_integer::(values, strict)?.into_series(), + DataType::UInt8 => any_values_to_integer::(values, strict)? + .with_cheap_metadata() + .into_series(), #[cfg(feature = "dtype-u16")] - DataType::UInt16 => any_values_to_integer::(values, strict)?.into_series(), - DataType::UInt32 => any_values_to_integer::(values, strict)?.into_series(), - DataType::UInt64 => any_values_to_integer::(values, strict)?.into_series(), - DataType::Float32 => any_values_to_f32(values, strict)?.into_series(), - DataType::Float64 => any_values_to_f64(values, strict)?.into_series(), + DataType::UInt16 => any_values_to_integer::(values, strict)? + .with_cheap_metadata() + .into_series(), + DataType::UInt32 => any_values_to_integer::(values, strict)? + .with_cheap_metadata() + .into_series(), + DataType::UInt64 => any_values_to_integer::(values, strict)? + .with_cheap_metadata() + .into_series(), + DataType::Float32 => any_values_to_f32(values, strict)? + .with_cheap_metadata() + .into_series(), + DataType::Float64 => any_values_to_f64(values, strict)? 
+ .with_cheap_metadata() + .into_series(), DataType::Boolean => any_values_to_bool(values, strict)?.into_series(), DataType::String => any_values_to_string(values, strict)?.into_series(), DataType::Binary => any_values_to_binary(values, strict)?.into_series(), @@ -167,6 +189,7 @@ fn any_values_to_integer( } Ok(builder.finish()) } + if strict { any_values_to_integer_strict::(values) } else {