From c5412eedce14fee5bae756b753b77a9bba6970d5 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 6 Nov 2024 21:22:45 +0400 Subject: [PATCH 1/5] feat: Support use of Duration dtype in `to_string` --- Cargo.lock | 1 + crates/polars-core/Cargo.toml | 1 + .../src/chunked_array/temporal/conversion.rs | 38 ++++- .../src/chunked_array/temporal/date.rs | 1 + .../src/chunked_array/temporal/datetime.rs | 6 +- .../src/chunked_array/temporal/duration.rs | 19 +++ .../src/chunked_array/temporal/time.rs | 1 + crates/polars-core/src/fmt.rs | 149 +++++++++++------- crates/polars-core/src/frame/mod.rs | 2 +- .../src/series/implementations/datetime.rs | 12 +- crates/polars-time/src/series/mod.rs | 26 ++- py-polars/polars/dataframe/frame.py | 2 +- py-polars/polars/expr/datetime.py | 144 ++++++++++++----- py-polars/polars/series/datetime.py | 44 ++++-- .../tests/unit/datatypes/test_duration.py | 64 ++++++++ .../tests/unit/datatypes/test_temporal.py | 64 ++++++++ py-polars/tests/unit/interop/test_interop.py | 102 ++++++------ 17 files changed, 494 insertions(+), 182 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d00cfa7ff0a6..1f322c4be6cf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2974,6 +2974,7 @@ dependencies = [ "hashbrown 0.14.5", "hashbrown 0.15.1", "indexmap", + "itoa", "ndarray", "num-traits", "once_cell", diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index bb5cdc85cdac..05723922e17e 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -26,6 +26,7 @@ either = { workspace = true } hashbrown = { workspace = true } hashbrown_old_nightly_hack = { workspace = true } indexmap = { workspace = true } +itoa = { workspace = true } ndarray = { workspace = true, optional = true } num-traits = { workspace = true } once_cell = { workspace = true } diff --git a/crates/polars-core/src/chunked_array/temporal/conversion.rs b/crates/polars-core/src/chunked_array/temporal/conversion.rs index f54c17d4081e..7fa4640a4469 100644 --- a/crates/polars-core/src/chunked_array/temporal/conversion.rs +++ b/crates/polars-core/src/chunked_array/temporal/conversion.rs @@ -3,7 +3,9 @@ use chrono::*; use crate::prelude::*; -/// Number of seconds in a day +pub(crate) const NS_IN_DAY: i64 = 86_400_000_000_000; +pub(crate) const US_IN_DAY: i64 = 86_400_000_000; +pub(crate) const MS_IN_DAY: i64 = 86_400_000; pub(crate) const SECONDS_IN_DAY: i64 = 86_400; impl From<&AnyValue<'_>> for NaiveDateTime { @@ -37,12 +39,10 @@ pub fn datetime_to_timestamp_ns(v: NaiveDateTime) -> i64 { v.and_utc().timestamp_nanos_opt().unwrap() } -// Used by lazy for literal conversion pub fn datetime_to_timestamp_ms(v: NaiveDateTime) -> i64 { v.and_utc().timestamp_millis() } -// Used by lazy for literal conversion pub fn datetime_to_timestamp_us(v: NaiveDateTime) -> i64 { let us = v.and_utc().timestamp() * 1_000_000; us + v.and_utc().timestamp_subsec_micros() as i64 @@ -52,6 +52,32 @@ pub(crate) fn naive_datetime_to_date(v: NaiveDateTime) -> i32 { (datetime_to_timestamp_ms(v) / (MILLISECONDS * SECONDS_IN_DAY)) as i32 } -pub(crate) const NS_IN_DAY: i64 = 86_400_000_000_000; -pub(crate) const US_IN_DAY: i64 = 86_400_000_000; -pub(crate) const MS_IN_DAY: i64 = 86_400_000; +pub fn get_strftime_format(fmt: &str, dtype: &DataType) -> String { + if fmt != "iso" { + return fmt.to_string(); + } + #[allow(unreachable_code)] + let fmt: &str = match dtype { + #[cfg(feature = "dtype-datetime")] + DataType::Datetime(tu, tz) => match (tu, tz.is_some()) { + (TimeUnit::Milliseconds, true) => "%F %T%.3f%:z", + (TimeUnit::Milliseconds, false) => "%F %T%.3f", + (TimeUnit::Microseconds, true) => "%F %T%.6f%:z", + (TimeUnit::Microseconds, false) => "%F %T%.6f", + (TimeUnit::Nanoseconds, true) => "%F %T%.9f%:z", + (TimeUnit::Nanoseconds, false) => "%F %T%.9f", + }, + #[cfg(feature = "dtype-date")] + DataType::Date => "%F", + #[cfg(feature = "dtype-time")] + DataType::Time => "%T%.f", + _ => { + let err = format!( + "invalid call to `get_strftime_format`; fmt={:?}, dtype={}", + fmt, dtype + ); + unimplemented!("{}", err) + }, + }; + fmt.to_string() +} diff --git a/crates/polars-core/src/chunked_array/temporal/date.rs b/crates/polars-core/src/chunked_array/temporal/date.rs index ea0bb11d10fc..8ec371e92b37 100644 --- a/crates/polars-core/src/chunked_array/temporal/date.rs +++ b/crates/polars-core/src/chunked_array/temporal/date.rs @@ -33,6 +33,7 @@ impl DateChunked { /// Convert from Date into String with the given format. /// See [chrono strftime/strptime](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html). pub fn to_string(&self, format: &str) -> PolarsResult { + let format = if format == "iso" { "%F" } else { format }; let datefmt_f = |ndt: NaiveDate| ndt.format(format); self.try_apply_into_string_amortized(|val, buf| { let ndt = date32_to_date(val); diff --git a/crates/polars-core/src/chunked_array/temporal/datetime.rs b/crates/polars-core/src/chunked_array/temporal/datetime.rs index 92439e5b7527..3f8c6390696d 100644 --- a/crates/polars-core/src/chunked_array/temporal/datetime.rs +++ b/crates/polars-core/src/chunked_array/temporal/datetime.rs @@ -47,12 +47,12 @@ impl DatetimeChunked { TimeUnit::Microseconds => timestamp_us_to_datetime, TimeUnit::Milliseconds => timestamp_ms_to_datetime, }; - + let format = get_strftime_format(format, self.dtype()); let mut ca: StringChunked = match self.time_zone() { #[cfg(feature = "timezones")] Some(time_zone) => { let parsed_time_zone = time_zone.parse::().expect("already validated"); - let datefmt_f = |ndt| parsed_time_zone.from_utc_datetime(&ndt).format(format); + let datefmt_f = |ndt| parsed_time_zone.from_utc_datetime(&ndt).format(&format); self.try_apply_into_string_amortized(|val, buf| { let ndt = conversion_f(val); write!(buf, "{}", datefmt_f(ndt)) @@ -62,7 +62,7 @@ impl DatetimeChunked { )? }, _ => { - let datefmt_f = |ndt: NaiveDateTime| ndt.format(format); + let datefmt_f = |ndt: NaiveDateTime| ndt.format(&format); self.try_apply_into_string_amortized(|val, buf| { let ndt = conversion_f(val); write!(buf, "{}", datefmt_f(ndt)) diff --git a/crates/polars-core/src/chunked_array/temporal/duration.rs b/crates/polars-core/src/chunked_array/temporal/duration.rs index df8a51388baf..d17eb9a9df1f 100644 --- a/crates/polars-core/src/chunked_array/temporal/duration.rs +++ b/crates/polars-core/src/chunked_array/temporal/duration.rs @@ -1,4 +1,5 @@ use crate::export::chrono::Duration as ChronoDuration; +use crate::fmt::fmt_duration_string; use crate::prelude::DataType::Duration; use crate::prelude::*; @@ -60,6 +61,24 @@ impl DurationChunked { self.2 = Some(Duration(tu)) } + /// Convert from [`Duration`] to String; note that `strftime` format + /// strings are not supported, only the specifiers 'iso' and 'polars'. + pub fn to_string(&self, format: &str) -> PolarsResult { + match format { + "iso" | "polars" => { + let out: StringChunked = self + .0 + .apply_nonnull_values_generic(DataType::String, |v: i64| { + fmt_duration_string(v, self.time_unit(), format == "iso") + }); + Ok(out) + }, + _ => Err(PolarsError::InvalidOperation( + format!("format {:?} not supported for Duration type (expected one of 'iso' or 'polars')", format).into(), + )), + } + } + /// Construct a new [`DurationChunked`] from an iterator over [`ChronoDuration`]. pub fn from_duration>( name: PlSmallStr, diff --git a/crates/polars-core/src/chunked_array/temporal/time.rs b/crates/polars-core/src/chunked_array/temporal/time.rs index 77e204c765de..7cc8a767e54e 100644 --- a/crates/polars-core/src/chunked_array/temporal/time.rs +++ b/crates/polars-core/src/chunked_array/temporal/time.rs @@ -23,6 +23,7 @@ impl TimeChunked { pub fn to_string(&self, format: &str) -> StringChunked { let mut ca: StringChunked = self.apply_kernel_cast(&|arr| { let mut buf = String::new(); + let format = if format == "iso" { "%T%.9f" } else { format }; let mut mutarr = MutablePlString::with_capacity(arr.len()); for opt in arr.into_iter() { diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 88fbfae96701..95c07e919125 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -22,6 +22,8 @@ use comfy_table::modifiers::*; use comfy_table::presets::*; #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] use comfy_table::*; +#[cfg(feature = "dtype-duration")] +use itoa; use num_traits::{Num, NumCast}; use crate::config::*; @@ -966,7 +968,9 @@ fn fmt_datetime( } #[cfg(feature = "dtype-duration")] -const NAMES: [&str; 4] = ["d", "h", "m", "s"]; +const DURATION_PARTS: [&str; 4] = ["d", "h", "m", "s"]; +#[cfg(feature = "dtype-duration")] +const ISO_DURATION_PARTS: [&str; 4] = ["D", "H", "M", "S"]; #[cfg(feature = "dtype-duration")] const SIZES_NS: [i64; 4] = [ 86_400_000_000_000, @@ -980,63 +984,102 @@ const SIZES_US: [i64; 4] = [86_400_000_000, 3_600_000_000, 60_000_000, 1_000_000 const SIZES_MS: [i64; 4] = [86_400_000, 3_600_000, 60_000, 1_000]; #[cfg(feature = "dtype-duration")] -fn fmt_duration_ns(f: &mut Formatter<'_>, v: i64) -> fmt::Result { +pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { if v == 0 { - return write!(f, "0ns"); - } - format_duration(f, v, SIZES_NS.as_slice(), NAMES.as_slice())?; - if v % 1000 != 0 { - write!(f, "{}ns", v % 1_000_000_000)?; - } else if v % 1_000_000 != 0 { - write!(f, "{}µs", (v % 1_000_000_000) / 1000)?; - } else if v % 1_000_000_000 != 0 { - write!(f, "{}ms", (v % 1_000_000_000) / 1_000_000)?; - } - Ok(()) -} - -#[cfg(feature = "dtype-duration")] -fn fmt_duration_us(f: &mut Formatter<'_>, v: i64) -> fmt::Result { - if v == 0 { - return write!(f, "0µs"); - } - format_duration(f, v, SIZES_US.as_slice(), NAMES.as_slice())?; - if v % 1000 != 0 { - write!(f, "{}µs", (v % 1_000_000))?; - } else if v % 1_000_000 != 0 { - write!(f, "{}ms", (v % 1_000_000) / 1_000)?; - } - Ok(()) -} + return if iso { + "PT0S".to_string() + } else { + match unit { + TimeUnit::Nanoseconds => "0ns".to_string(), + TimeUnit::Microseconds => "0µs".to_string(), + TimeUnit::Milliseconds => "0ms".to_string(), + } + }; + }; + let sizes = match unit { + TimeUnit::Nanoseconds => SIZES_NS.as_slice(), + TimeUnit::Microseconds => SIZES_US.as_slice(), + TimeUnit::Milliseconds => SIZES_MS.as_slice(), + }; -#[cfg(feature = "dtype-duration")] -fn fmt_duration_ms(f: &mut Formatter<'_>, v: i64) -> fmt::Result { - if v == 0 { - return write!(f, "0ms"); - } - format_duration(f, v, SIZES_MS.as_slice(), NAMES.as_slice())?; - if v % 1_000 != 0 { - write!(f, "{}ms", (v % 1_000))?; - } - Ok(()) -} + let mut s = String::with_capacity(32); + let mut buffer = itoa::Buffer::new(); + if iso { + if v < 0 { + s.push_str("-P"); + v = v.abs() + } else { + s.push('P'); + } + }; -#[cfg(feature = "dtype-duration")] -fn format_duration(f: &mut Formatter, v: i64, sizes: &[i64], names: &[&str]) -> fmt::Result { - for i in 0..4 { + for (i, &size) in sizes.iter().enumerate() { let whole_num = if i == 0 { - v / sizes[i] + v / size } else { - (v % sizes[i - 1]) / sizes[i] + (v % sizes[i - 1]) / size }; - if whole_num <= -1 || whole_num >= 1 { - write!(f, "{}{}", whole_num, names[i])?; - if v % sizes[i] != 0 { - write!(f, " ")?; + if whole_num != 0 || (iso && i == 3) { + s.push_str(buffer.format(whole_num)); + if iso { + if i == 3 { + let secs = match unit { + TimeUnit::Nanoseconds => format!(".{:09}", v % size), + TimeUnit::Microseconds => format!(".{:06}", v % size), + TimeUnit::Milliseconds => format!(".{:03}", v % size), + }; + s.push_str(secs.trim_end_matches('0')); + } + s.push_str(ISO_DURATION_PARTS[i]); + if i == 0 { + s.push('T'); + } + } else { + s.push_str(DURATION_PARTS[i]); + if v % size != 0 { + s.push(' '); + } } + } else if iso && i == 0 { + s.push('T'); } } - Ok(()) + if iso { + if s.ends_with('T') { + s.pop(); + } + } else { + match unit { + TimeUnit::Nanoseconds => { + if v % 1000 != 0 { + s.push_str(buffer.format(v % 1_000_000_000)); + s.push_str("ns"); + } else if v % 1_000_000 != 0 { + s.push_str(buffer.format((v % 1_000_000_000) / 1000)); + s.push_str("µs"); + } else if v % 1_000_000_000 != 0 { + s.push_str(buffer.format((v % 1_000_000_000) / 1_000_000)); + s.push_str("ms"); + } + }, + TimeUnit::Microseconds => { + if v % 1000 != 0 { + s.push_str(buffer.format(v % 1_000_000)); + s.push_str("µs"); + } else if v % 1_000_000 != 0 { + s.push_str(buffer.format((v % 1_000_000) / 1_000)); + s.push_str("ms"); + } + }, + TimeUnit::Milliseconds => { + if v % 1000 != 0 { + s.push_str(buffer.format(v % 1_000)); + s.push_str("ms"); + } + }, + } + } + s } fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result { @@ -1087,11 +1130,7 @@ impl Display for AnyValue<'_> { fmt_datetime(f, *v, *tu, tz.as_ref().map(|v| v.as_ref())) }, #[cfg(feature = "dtype-duration")] - AnyValue::Duration(v, tu) => match tu { - TimeUnit::Nanoseconds => fmt_duration_ns(f, *v), - TimeUnit::Microseconds => fmt_duration_us(f, *v), - TimeUnit::Milliseconds => fmt_duration_ms(f, *v), - }, + AnyValue::Duration(v, tu) => write!(f, "{}", fmt_duration_string(*v, *tu, false)), #[cfg(feature = "dtype-time")] AnyValue::Time(_) => { let nt: chrono::NaiveTime = self.into(); @@ -1221,7 +1260,7 @@ impl Series { #[inline] #[cfg(feature = "dtype-decimal")] -pub fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result { +fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result { use arrow::compute::decimal::format_decimal; let trim_zeros = get_trim_decimal_zeros(); diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index 6fed5c25071c..34662cd0a26f 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -923,7 +923,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn height(&self) -> usize { - self.shape().0 + self.height } /// Returns the size as number of rows * number of columns diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index ace52993b8a1..7f0d575bd916 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -254,16 +254,8 @@ impl SeriesTrait for SeriesWrap { } fn cast(&self, dtype: &DataType, cast_options: CastOptions) -> PolarsResult { - match (dtype, self.0.time_unit()) { - (DataType::String, TimeUnit::Milliseconds) => { - Ok(self.0.to_string("%F %T%.3f")?.into_series()) - }, - (DataType::String, TimeUnit::Microseconds) => { - Ok(self.0.to_string("%F %T%.6f")?.into_series()) - }, - (DataType::String, TimeUnit::Nanoseconds) => { - Ok(self.0.to_string("%F %T%.9f")?.into_series()) - }, + match dtype { + DataType::String => Ok(self.0.to_string("iso")?.into_series()), _ => self.0.cast_with_options(dtype, cast_options), } } diff --git a/crates/polars-time/src/series/mod.rs b/crates/polars-time/src/series/mod.rs index e9f728bf6e09..5009e5beea2a 100644 --- a/crates/polars-time/src/series/mod.rs +++ b/crates/polars-time/src/series/mod.rs @@ -256,14 +256,28 @@ pub trait TemporalMethods: AsSeries { fn to_string(&self, format: &str) -> PolarsResult { let s = self.as_series(); match s.dtype() { - #[cfg(feature = "dtype-date")] - DataType::Date => s.date().map(|ca| Ok(ca.to_string(format)?.into_series()))?, #[cfg(feature = "dtype-datetime")] - DataType::Datetime(_, _) => s - .datetime() - .map(|ca| Ok(ca.to_string(format)?.into_series()))?, + DataType::Datetime(_, _) => { + let format = get_strftime_format(format, s.dtype()); + s.datetime() + .map(|ca| Ok(ca.to_string(format.as_str())?.into_series()))? + }, + #[cfg(feature = "dtype-date")] + DataType::Date => { + let format = get_strftime_format(format, s.dtype()); + s.date() + .map(|ca| Ok(ca.to_string(format.as_str())?.into_series()))? + }, #[cfg(feature = "dtype-time")] - DataType::Time => s.time().map(|ca| ca.to_string(format).into_series()), + DataType::Time => { + let format = get_strftime_format(format, s.dtype()); + s.time() + .map(|ca| ca.to_string(format.as_str()).into_series()) + }, + #[cfg(feature = "dtype-duration")] + DataType::Duration(_) => s + .duration() + .map(|ca| Ok(ca.to_string(format)?.into_series()))?, dt => polars_bail!(opq = to_string, dt), } } diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index d265466261aa..662d09e83d5b 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4553,7 +4553,7 @@ def insert_column(self, index: int, column: IntoExprColumn) -> DataFrame: Parameters ---------- index - Index at which to insert the new `Series` column. + Index at which to insert the new column. column `Series` or expression to insert. diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 4628ee2a9c15..3c72613c9a00 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -450,13 +450,10 @@ def combine(self, time: dt.time | Expr, time_unit: TimeUnit = "us") -> Expr: time = parse_into_expression(time) return wrap_expr(self._pyexpr.dt_combine(time, time_unit)) - def to_string(self, format: str) -> Expr: + def to_string(self, format: str | None = None) -> Expr: """ Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.String)`, but this method allows you to customize the - formatting of the resulting string. - Parameters ---------- format @@ -464,52 +461,123 @@ def to_string(self, format: str) -> Expr: `_ for specification. Example: `"%y-%m-%d"`. + Notes + ----- + * Similar to `cast(pl.String)`, but this method allows you to customize + the formatting of the resulting string; if no format is provided, the + appropriate ISO format for the underlying data type is used. + + * Duration dtype expressions cannot be formatted with `strftime`. Instead, + only "iso" and "polars" are supported as format strings. The "iso" format + string results in ISO8601 duration string output, and "polars" results + in the same form seen in the frame `repr`. + Examples -------- - >>> from datetime import datetime + >>> from datetime import datetime, date, timedelta, time >>> df = pl.DataFrame( ... { - ... "datetime": [ - ... datetime(2020, 3, 1), - ... datetime(2020, 4, 1), - ... datetime(2020, 5, 1), - ... ] + ... "dt": [ + ... date(1999, 3, 1), + ... date(2020, 5, 3), + ... date(2077, 7, 5), + ... ], + ... "dtm": [ + ... datetime(1980, 8, 10, 0, 10, 20), + ... datetime(2010, 10, 20, 8, 25, 35), + ... datetime(2040, 12, 30, 16, 40, 50), + ... ], + ... "tm": [ + ... time(1, 2, 3, 456789), + ... time(23, 59, 9, 101), + ... time(0, 0, 0, 100), + ... ], + ... "td": [ + ... timedelta(days=-1, seconds=-42), + ... timedelta(days=14, hours=-10, microseconds=1001), + ... timedelta(seconds=0), + ... ], ... } ... ) - >>> df.with_columns( - ... pl.col("datetime") - ... .dt.to_string("%Y/%m/%d %H:%M:%S") - ... .alias("datetime_string") + + Default format for temporal dtypes is ISO8601: + + >>> import polars.selectors as cs + >>> df.select((cs.date() | cs.datetime()).dt.to_string().name.prefix("s_")) + shape: (3, 2) + ┌────────────┬────────────────────────────┐ + │ s_dt ┆ s_dtm │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════╪════════════════════════════╡ + │ 1999-03-01 ┆ 1980-08-10 00:10:20.000000 │ + │ 2020-05-03 ┆ 2010-10-20 08:25:35.000000 │ + │ 2077-07-05 ┆ 2040-12-30 16:40:50.000000 │ + └────────────┴────────────────────────────┘ + >>> df.select((cs.time() | cs.duration()).dt.to_string().name.prefix("s_")) + shape: (3, 2) + ┌─────────────────┬───────────────────┐ + │ s_tm ┆ s_td │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════════════════╪═══════════════════╡ + │ 01:02:03.456789 ┆ -P1DT42.S │ + │ 23:59:09.000101 ┆ P13DT14H0.001001S │ + │ 00:00:00.000100 ┆ PT0S │ + └─────────────────┴───────────────────┘ + + All temporal types (aside from `Duration`) support strftime formatting: + + >>> df.select( + ... pl.col("dtm"), + ... s_dtm=pl.col("dtm").dt.to_string("%Y/%m/%d (%H.%M.%S)"), ... ) shape: (3, 2) - ┌─────────────────────┬─────────────────────┐ - │ datetime ┆ datetime_string │ - │ --- ┆ --- │ - │ datetime[μs] ┆ str │ - ╞═════════════════════╪═════════════════════╡ - │ 2020-03-01 00:00:00 ┆ 2020/03/01 00:00:00 │ - │ 2020-04-01 00:00:00 ┆ 2020/04/01 00:00:00 │ - │ 2020-05-01 00:00:00 ┆ 2020/05/01 00:00:00 │ - └─────────────────────┴─────────────────────┘ + ┌─────────────────────┬───────────────────────┐ + │ dtm ┆ s_dtm │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪═══════════════════════╡ + │ 1980-08-10 00:10:20 ┆ 1980/08/10 (00.10.20) │ + │ 2010-10-20 08:25:35 ┆ 2010/10/20 (08.25.35) │ + │ 2040-12-30 16:40:50 ┆ 2040/12/30 (16.40.50) │ + └─────────────────────┴───────────────────────┘ - If you're interested in the day name / month name, you can use - `'%A'` / `'%B'`: + The Polars Duration string format (as seen in the frame repr) is also available: - >>> df.with_columns( - ... day_name=pl.col("datetime").dt.to_string("%A"), - ... month_name=pl.col("datetime").dt.to_string("%B"), + >>> df.select(pl.col("td"), s_td=pl.col("td").dt.to_string("polars")) + shape: (3, 2) + ┌────────────────┬────────────────┐ + │ td ┆ s_td │ + │ --- ┆ --- │ + │ duration[μs] ┆ str │ + ╞════════════════╪════════════════╡ + │ -1d -42s ┆ -1d -42s │ + │ 13d 14h 1001µs ┆ 13d 14h 1001µs │ + │ 0µs ┆ 0µs │ + └────────────────┴────────────────┘ + + If you're interested in extracting the day or month names, you can use + the `'%A'` and `'%B'` strftime specifiers: + + >>> df.select( + ... pl.col("dt"), + ... day_name=pl.col("dtm").dt.to_string("%A"), + ... month_name=pl.col("dtm").dt.to_string("%B"), ... ) shape: (3, 3) - ┌─────────────────────┬───────────┬────────────┐ - │ datetime ┆ day_name ┆ month_name │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ str │ - ╞═════════════════════╪═══════════╪════════════╡ - │ 2020-03-01 00:00:00 ┆ Sunday ┆ March │ - │ 2020-04-01 00:00:00 ┆ Wednesday ┆ April │ - │ 2020-05-01 00:00:00 ┆ Friday ┆ May │ - └─────────────────────┴───────────┴────────────┘ - """ + ┌────────────┬───────────┬────────────┐ + │ dt ┆ day_name ┆ month_name │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ str │ + ╞════════════╪═══════════╪════════════╡ + │ 1999-03-01 ┆ Sunday ┆ August │ + │ 2020-05-03 ┆ Wednesday ┆ October │ + │ 2077-07-05 ┆ Sunday ┆ December │ + └────────────┴───────────┴────────────┘ + """ + if format is None: + format = "iso" return wrap_expr(self._pyexpr.dt_to_string(format)) def strftime(self, format: str) -> Expr: diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index 3b0e905b84fc..0ce6948aa8aa 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -211,9 +211,6 @@ def to_string(self, format: str) -> Series: """ Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.String)`, but this method allows you to customize the - formatting of the resulting string. - Parameters ---------- format @@ -221,24 +218,49 @@ def to_string(self, format: str) -> Series: `_ for specification. Example: `"%y-%m-%d"`. + Notes + ----- + * Similar to `cast(pl.String)`, but this method allows you to customize + the formatting of the resulting string; if no format is provided, the + appropriate ISO format for the underlying data type is used. + + * Duration dtype Series cannot be formatted with `strftime`. Instead, + only "iso" and "polars" are supported as format strings. The "iso" format + string results in ISO8601 duration string output, and "polars" results + in the same form seen in the frame `repr`. + Examples -------- - >>> from datetime import datetime + >>> from datetime import date >>> s = pl.Series( ... "datetime", - ... [datetime(2020, 3, 1), datetime(2020, 4, 1), datetime(2020, 5, 1)], + ... [date(2020, 3, 1), date(2020, 4, 1), date(2020, 5, 1)], ... ) - >>> s.dt.to_string("%Y/%m/%d") + + Default for temporal dtypes (if not specifying a format string) is ISO8601: + + >>> s.dt.to_string() shape: (3,) Series: 'datetime' [str] [ - "2020/03/01" - "2020/04/01" - "2020/05/01" + "2020-03-01" + "2020-04-01" + "2020-05-01" ] - If you're interested in the day name / month name, you can use - `'%A'` / `'%B'`: + The output can be customized by using a strftime-compatible format string: + + >>> s.dt.to_string("%d/%m/%y") + shape: (3,) + Series: 'datetime' [str] + [ + "01/03/20" + "01/04/20" + "01/05/20" + ] + + If you're interested in using day or month names, you can use + the `'%A'` and/or `'%B'` format strings: >>> s.dt.to_string("%A") shape: (3,) diff --git a/py-polars/tests/unit/datatypes/test_duration.py b/py-polars/tests/unit/datatypes/test_duration.py index 597ac1c4a624..f9df754466c9 100644 --- a/py-polars/tests/unit/datatypes/test_duration.py +++ b/py-polars/tests/unit/datatypes/test_duration.py @@ -22,6 +22,70 @@ def test_duration_cum_sum() -> None: assert df.schema["A"].is_(duration_dtype) is False +def test_duration_cast() -> None: + durations = [ + timedelta(days=180, seconds=56789, microseconds=987654), + timedelta(days=0, seconds=64875, microseconds=8884), + timedelta(days=2, hours=23, seconds=4975, milliseconds=1), + timedelta(hours=1, seconds=1, milliseconds=1, microseconds=1), + timedelta(seconds=-42, milliseconds=-42), + None, + ] + + df = pl.DataFrame({"td": durations}, schema={"td": pl.Duration("us")}) + df_cast = df.select( + td_ms=pl.col("td").cast(pl.Duration("ms")), + td_int=pl.col("td").cast(pl.Int64), + td_str_iso=pl.col("td").dt.to_string(), + td_str_pl=pl.col("td").dt.to_string("polars"), + ) + assert df_cast.schema == { + "td_ms": pl.Duration(time_unit="ms"), + "td_int": pl.Int64, + "td_str_iso": pl.String, + "td_str_pl": pl.String, + } + + expected = pl.DataFrame( + { + "td_ms": [ + timedelta(days=180, seconds=56789, milliseconds=987), + timedelta(days=0, seconds=64875, milliseconds=8), + timedelta(days=2, hours=23, seconds=4975, milliseconds=1), + timedelta(hours=1, seconds=1, milliseconds=1), + timedelta(seconds=-42, milliseconds=-42), + None, + ], + "td_int": [ + 15608789987654, + 64875008884, + 260575001000, + 3601001001, + -42042000, + None, + ], + "td_str_iso": [ + "P180DT15H46M29.987654S", + "PT18H1M15.008884S", + "P3DT22M55.001S", + "PT1H1.001001S", + "-PT42.042S", + None, + ], + "td_str_pl": [ + "180d 15h 46m 29s 987654µs", + "18h 1m 15s 8884µs", + "3d 22m 55s 1ms", + "1h 1s 1001µs", + "-42s -42ms", + None, + ], + }, + schema_overrides={"td_ms": pl.Duration(time_unit="ms")}, + ) + assert_frame_equal(expected, df_cast) + + def test_duration_std_var() -> None: df = pl.DataFrame( {"duration": [1000, 5000, 3000]}, schema={"duration": pl.Duration} diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 042a0fca786b..ec87a45a9206 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -12,6 +12,7 @@ from hypothesis import given import polars as pl +import polars.selectors as cs from polars.datatypes import DTYPE_TEMPORAL_UNITS from polars.exceptions import ( ComputeError, @@ -1094,6 +1095,69 @@ def test_datetime_string_casts() -> None: ] +def test_temporal_to_string_iso_default() -> None: + df = pl.DataFrame( + { + "td": [ + timedelta(days=-1, seconds=-42), + timedelta(days=14, hours=-10, microseconds=1001), + timedelta(seconds=0), + ], + "tm": [ + time(1, 2, 3, 456789), + time(23, 59, 9, 101), + time(0), + ], + "dt": [ + date(1999, 3, 1), + date(2020, 5, 3), + date(2077, 7, 5), + ], + "dtm": [ + datetime(1980, 8, 10, 0, 10, 20), + datetime(2010, 10, 20, 8, 25, 35), + datetime(2040, 12, 30, 16, 40, 50), + ], + } + ).with_columns(dtm_tz=pl.col("dtm").dt.replace_time_zone("Asia/Kathmandu")) + + df_stringified = df.select( + pl.col("td").dt.to_string("polars").alias("td_pl"), cs.temporal().dt.to_string() + ) + assert df_stringified.to_dict(as_series=False) == { + "td_pl": [ + "-1d -42s", + "13d 14h 1001µs", + "0µs", + ], + "td": [ + "-P1DT42.S", + "P13DT14H0.001001S", + "PT0S", + ], + "tm": [ + "01:02:03.456789", + "23:59:09.000101", + "00:00:00", + ], + "dt": [ + "1999-03-01", + "2020-05-03", + "2077-07-05", + ], + "dtm": [ + "1980-08-10 00:10:20.000000", + "2010-10-20 08:25:35.000000", + "2040-12-30 16:40:50.000000", + ], + "dtm_tz": [ + "1980-08-10 00:10:20.000000+05:30", + "2010-10-20 08:25:35.000000+05:45", + "2040-12-30 16:40:50.000000+05:45", + ], + } + + def test_iso_year() -> None: assert pl.Series([datetime(2022, 1, 1, 7, 8, 40)]).dt.iso_year()[0] == 2021 assert pl.Series([date(2022, 1, 1)]).dt.iso_year()[0] == 2021 diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index b69a10671ca7..7ab6c196c807 100644 --- a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -412,13 +412,13 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - ┌─────┬─────┬─────┬─────┬─────┬───────┐ - │ id ┆ q1 ┆ q2 ┆ q3 ┆ q4 ┆ total │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i8 ┆ i16 ┆ i32 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╪═════╪═════╪═══════╡ - └─────┴─────┴─────┴─────┴─────┴───────┘ - """ + ┌─────┬─────┬─────┬─────┬─────┬───────┐ + │ id ┆ q1 ┆ q2 ┆ q3 ┆ q4 ┆ total │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i8 ┆ i16 ┆ i32 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╪═════╪═════╪═══════╡ + └─────┴─────┴─────┴─────┴─────┴───────┘ + """ ), ) assert df.shape == (0, 6) @@ -437,11 +437,11 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - ┌──────┬───────┐ - │ misc ┆ other │ - ╞══════╪═══════╡ - └──────┴───────┘ - """ + ┌──────┬───────┐ + │ misc ┆ other │ + ╞══════╪═══════╡ + └──────┴───────┘ + """ ), ) assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String})) @@ -472,17 +472,17 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - # >>> Missing cols with old-style ellipsis, nulls, commented out - # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐ - # │ dt ┆ c1 ┆ c2 ┆ c3 ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99 │ - # │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ - # │ date ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡ - # │ 2023-03-25 ┆ 1 ┆ 2 ┆ 3 ┆ ... ┆ 96 ┆ 97 ┆ 98 ┆ 99 │ - # │ 1999-12-31 ┆ 3 ┆ 6 ┆ 9 ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │ - # │ null ┆ 9 ┆ 18 ┆ 27 ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891 │ - # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘ - """ + # >>> Missing cols with old-style ellipsis, nulls, commented out + # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐ + # │ dt ┆ c1 ┆ c2 ┆ c3 ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99 │ + # │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ + # │ date ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡ + # │ 2023-03-25 ┆ 1 ┆ 2 ┆ 3 ┆ ... ┆ 96 ┆ 97 ┆ 98 ┆ 99 │ + # │ 1999-12-31 ┆ 3 ┆ 6 ┆ 9 ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │ + # │ null ┆ 9 ┆ 18 ┆ 27 ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891 │ + # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘ + """ ), ) assert df.schema == { @@ -505,15 +505,15 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - # >>> no dtypes: - # ┌────────────┬──────┐ - # │ dt ┆ c99 │ - # ╞════════════╪══════╡ - # │ 2023-03-25 ┆ 99 │ - # │ 1999-12-31 ┆ null │ - # │ null ┆ 891 │ - # └────────────┴──────┘ - """ + # >>> no dtypes: + # ┌────────────┬──────┐ + # │ dt ┆ c99 │ + # ╞════════════╪══════╡ + # │ 2023-03-25 ┆ 99 │ + # │ 1999-12-31 ┆ null │ + # │ null ┆ 891 │ + # └────────────┴──────┘ + """ ), ) assert df.schema == {"dt": pl.Date, "c99": pl.Int64} @@ -527,25 +527,25 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - In [2]: with pl.Config() as cfg: - ...: pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True) - ...: print(df) - ...: - shape: (1, 5) - ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮ - │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp │ - │ tor_id ┆ nnel_id ┆ ┆ --- ┆ --- │ - │ --- ┆ --- ┆ ┆ str ┆ datetime[μs, Asia/Tokyo] │ - │ i32 ┆ i64 ┆ ┆ ┆ │ - ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡ - │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ … ┆ … ┆ … ┆ … ┆ … │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │ - ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯ - # "Een fluitje van een cent..." :) - """ + In [2]: with pl.Config() as cfg: + ...: pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True) + ...: print(df) + ...: + shape: (1, 5) + ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮ + │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp │ + │ tor_id ┆ nnel_id ┆ ┆ --- ┆ --- │ + │ --- ┆ --- ┆ ┆ str ┆ datetime[μs, Asia/Tokyo] │ + │ i32 ┆ i64 ┆ ┆ ┆ │ + ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡ + │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │ + ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ … ┆ … ┆ … ┆ … ┆ … │ + ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │ + ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯ + # "Een fluitje van een cent..." :) + """ ), ) assert df.shape == (2, 4) From d41cc53c8994f9f0baf2badae7922471747b2241 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 8 Nov 2024 13:03:18 +0400 Subject: [PATCH 2/5] =?UTF-8?q?update=20err=20=E2=86=92=20polars=5Fbail?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../polars-core/src/chunked_array/temporal/duration.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/crates/polars-core/src/chunked_array/temporal/duration.rs b/crates/polars-core/src/chunked_array/temporal/duration.rs index d17eb9a9df1f..32e8b06cb331 100644 --- a/crates/polars-core/src/chunked_array/temporal/duration.rs +++ b/crates/polars-core/src/chunked_array/temporal/duration.rs @@ -73,9 +73,12 @@ impl DurationChunked { }); Ok(out) }, - _ => Err(PolarsError::InvalidOperation( - format!("format {:?} not supported for Duration type (expected one of 'iso' or 'polars')", format).into(), - )), + _ => { + polars_bail!( + InvalidOperation: "format {:?} not supported for Duration type (expected one of 'iso' or 'polars')", + format + ) + }, } } From d3a948a0306c7d1c77292a72dadbbd317700bde4 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Fri, 8 Nov 2024 18:40:30 +0400 Subject: [PATCH 3/5] add explanatory comments --- crates/polars-core/src/fmt.rs | 36 ++++++++++++++++--- py-polars/polars/expr/datetime.py | 2 +- .../tests/unit/datatypes/test_duration.py | 5 +++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 95c07e919125..625a5ddfead3 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -973,10 +973,10 @@ const DURATION_PARTS: [&str; 4] = ["d", "h", "m", "s"]; const ISO_DURATION_PARTS: [&str; 4] = ["D", "H", "M", "S"]; #[cfg(feature = "dtype-duration")] const SIZES_NS: [i64; 4] = [ - 86_400_000_000_000, - 3_600_000_000_000, - 60_000_000_000, - 1_000_000_000, + 86_400_000_000_000, // per day + 3_600_000_000_000, // per hour + 60_000_000_000, // per minute + 1_000_000_000, // per second ]; #[cfg(feature = "dtype-duration")] const SIZES_US: [i64; 4] = [86_400_000_000, 3_600_000_000, 60_000_000, 1_000_000]; @@ -985,6 +985,17 @@ const SIZES_MS: [i64; 4] = [86_400_000, 3_600_000, 60_000, 1_000]; #[cfg(feature = "dtype-duration")] pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { + // take the physical/integer duration value and return either a human-readable version + // of the duration (as used in the Polars frame repr) or an ISO8601 duration string. + // + // Polars: "3d 22m 55s 1ms" + // ISO: "P3DT22M55.001S" + // + // The parts (days, hours, minutes, seconds) occur in the same order in + // each string, so we use the same code to generate each of them, with + // only the separators and the 'seconds' part differing. + // + // Ref: https://en.wikipedia.org/wiki/ISO_8601#Durations if v == 0 { return if iso { "PT0S".to_string() @@ -1006,13 +1017,16 @@ pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { let mut buffer = itoa::Buffer::new(); if iso { if v < 0 { + // negative sign before "P" indicates that the entire ISO duration is negative. + // the Polars version applies a negative sign to each *individual* part. s.push_str("-P"); v = v.abs() } else { s.push('P'); } }; - + // iterate over dtype-specific sizes to appropriately scale + // and extract 'days', 'hours', 'minutes', and 'seconds' parts. for (i, &size) in sizes.iter().enumerate() { let whole_num = if i == 0 { v / size @@ -1022,6 +1036,8 @@ pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { if whole_num != 0 || (iso && i == 3) { s.push_str(buffer.format(whole_num)); if iso { + // (index 3 => 'seconds' part): the ISO version writes + // fractional seconds, not integer nano/micro/milliseconds. if i == 3 { let secs = match unit { TimeUnit::Nanoseconds => format!(".{:09}", v % size), @@ -1029,8 +1045,14 @@ pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { TimeUnit::Milliseconds => format!(".{:03}", v % size), }; s.push_str(secs.trim_end_matches('0')); + if s.ends_with('.') { + s.pop(); + } } s.push_str(ISO_DURATION_PARTS[i]); + + // (index 0 => 'days' part): after writing days above (if non-zero) + // the ISO duration string requires a `T` before the time part. if i == 0 { s.push('T'); } @@ -1041,14 +1063,18 @@ pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { } } } else if iso && i == 0 { + // always need to write the `T` separator for ISO + // durations, even if there is no 'days' part. s.push('T'); } } if iso { + // if there was only a 'days' component, no need for time separator. if s.ends_with('T') { s.pop(); } } else { + // write out fractional seconds as integer nano/micro/milliseconds. match unit { TimeUnit::Nanoseconds => { if v % 1000 != 0 { diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 3c72613c9a00..7123a977f4f5 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -521,7 +521,7 @@ def to_string(self, format: str | None = None) -> Expr: │ --- ┆ --- │ │ str ┆ str │ ╞═════════════════╪═══════════════════╡ - │ 01:02:03.456789 ┆ -P1DT42.S │ + │ 01:02:03.456789 ┆ -P1DT42S │ │ 23:59:09.000101 ┆ P13DT14H0.001001S │ │ 00:00:00.000100 ┆ PT0S │ └─────────────────┴───────────────────┘ diff --git a/py-polars/tests/unit/datatypes/test_duration.py b/py-polars/tests/unit/datatypes/test_duration.py index f9df754466c9..5682c20051cc 100644 --- a/py-polars/tests/unit/datatypes/test_duration.py +++ b/py-polars/tests/unit/datatypes/test_duration.py @@ -29,6 +29,7 @@ def test_duration_cast() -> None: timedelta(days=2, hours=23, seconds=4975, milliseconds=1), timedelta(hours=1, seconds=1, milliseconds=1, microseconds=1), timedelta(seconds=-42, milliseconds=-42), + timedelta(days=-1), None, ] @@ -54,6 +55,7 @@ def test_duration_cast() -> None: timedelta(days=2, hours=23, seconds=4975, milliseconds=1), timedelta(hours=1, seconds=1, milliseconds=1), timedelta(seconds=-42, milliseconds=-42), + timedelta(days=-1), None, ], "td_int": [ @@ -62,6 +64,7 @@ def test_duration_cast() -> None: 260575001000, 3601001001, -42042000, + -86400000000, None, ], "td_str_iso": [ @@ -70,6 +73,7 @@ def test_duration_cast() -> None: "P3DT22M55.001S", "PT1H1.001001S", "-PT42.042S", + "-P1DT0S", None, ], "td_str_pl": [ @@ -78,6 +82,7 @@ def test_duration_cast() -> None: "3d 22m 55s 1ms", "1h 1s 1001µs", "-42s -42ms", + "-1d", None, ], }, From abca6b8acba123ed861365f4a0427c4e13654860 Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Sat, 9 Nov 2024 09:30:18 +0400 Subject: [PATCH 4/5] further micro-optimisation, additional tests --- crates/polars-core/src/fmt.rs | 45 +++++++---- .../tests/unit/datatypes/test_duration.py | 77 ++++++++++++++----- .../tests/unit/datatypes/test_temporal.py | 2 +- 3 files changed, 89 insertions(+), 35 deletions(-) diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 625a5ddfead3..7b9ecc8f0278 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -991,11 +991,11 @@ pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { // Polars: "3d 22m 55s 1ms" // ISO: "P3DT22M55.001S" // - // The parts (days, hours, minutes, seconds) occur in the same order in + // the parts (days, hours, minutes, seconds) occur in the same order in // each string, so we use the same code to generate each of them, with // only the separators and the 'seconds' part differing. // - // Ref: https://en.wikipedia.org/wiki/ISO_8601#Durations + // ref: https://en.wikipedia.org/wiki/ISO_8601#Durations if v == 0 { return if iso { "PT0S".to_string() @@ -1015,6 +1015,7 @@ pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { let mut s = String::with_capacity(32); let mut buffer = itoa::Buffer::new(); + let mut wrote_part = false; if iso { if v < 0 { // negative sign before "P" indicates that the entire ISO duration is negative. @@ -1034,34 +1035,46 @@ pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { (v % sizes[i - 1]) / size }; if whole_num != 0 || (iso && i == 3) { - s.push_str(buffer.format(whole_num)); if iso { - // (index 3 => 'seconds' part): the ISO version writes - // fractional seconds, not integer nano/micro/milliseconds. - if i == 3 { - let secs = match unit { - TimeUnit::Nanoseconds => format!(".{:09}", v % size), - TimeUnit::Microseconds => format!(".{:06}", v % size), - TimeUnit::Milliseconds => format!(".{:03}", v % size), - }; - s.push_str(secs.trim_end_matches('0')); - if s.ends_with('.') { - s.pop(); + if i != 3 { + // days, hours, minutes + s.push_str(buffer.format(whole_num)); + s.push_str(ISO_DURATION_PARTS[i]); + } else { + // (index 3 => 'seconds' part): the ISO version writes + // fractional seconds, not integer nano/micro/milliseconds. + // if zero, only write out if no other parts written yet. + let fractional_part = v % size; + if whole_num == 0 && fractional_part == 0 { + if !wrote_part { + s.push_str("0S") + } + } else { + s.push_str(buffer.format(whole_num)); + if fractional_part != 0 { + let secs = match unit { + TimeUnit::Nanoseconds => format!(".{:09}", fractional_part), + TimeUnit::Microseconds => format!(".{:06}", fractional_part), + TimeUnit::Milliseconds => format!(".{:03}", fractional_part), + }; + s.push_str(secs.trim_end_matches('0')); + } + s.push_str(ISO_DURATION_PARTS[i]); } } - s.push_str(ISO_DURATION_PARTS[i]); - // (index 0 => 'days' part): after writing days above (if non-zero) // the ISO duration string requires a `T` before the time part. if i == 0 { s.push('T'); } } else { + s.push_str(buffer.format(whole_num)); s.push_str(DURATION_PARTS[i]); if v % size != 0 { s.push(' '); } } + wrote_part = true; } else if iso && i == 0 { // always need to write the `T` separator for ISO // durations, even if there is no 'days' part. diff --git a/py-polars/tests/unit/datatypes/test_duration.py b/py-polars/tests/unit/datatypes/test_duration.py index 5682c20051cc..e690e5e92dd5 100644 --- a/py-polars/tests/unit/datatypes/test_duration.py +++ b/py-polars/tests/unit/datatypes/test_duration.py @@ -22,25 +22,28 @@ def test_duration_cum_sum() -> None: assert df.schema["A"].is_(duration_dtype) is False -def test_duration_cast() -> None: - durations = [ - timedelta(days=180, seconds=56789, microseconds=987654), - timedelta(days=0, seconds=64875, microseconds=8884), - timedelta(days=2, hours=23, seconds=4975, milliseconds=1), - timedelta(hours=1, seconds=1, milliseconds=1, microseconds=1), - timedelta(seconds=-42, milliseconds=-42), - timedelta(days=-1), - None, - ] +def test_duration_to_string() -> None: + df = pl.DataFrame( + { + "td": [ + timedelta(days=180, seconds=56789, microseconds=987654), + timedelta(days=0, seconds=64875, microseconds=8884), + timedelta(days=2, hours=23, seconds=4975, milliseconds=1), + timedelta(hours=1, seconds=1, milliseconds=1, microseconds=1), + timedelta(seconds=-42, milliseconds=-42), + None, + ] + }, + schema={"td": pl.Duration("us")}, + ) - df = pl.DataFrame({"td": durations}, schema={"td": pl.Duration("us")}) - df_cast = df.select( + df_str = df.select( td_ms=pl.col("td").cast(pl.Duration("ms")), td_int=pl.col("td").cast(pl.Int64), td_str_iso=pl.col("td").dt.to_string(), td_str_pl=pl.col("td").dt.to_string("polars"), ) - assert df_cast.schema == { + assert df_str.schema == { "td_ms": pl.Duration(time_unit="ms"), "td_int": pl.Int64, "td_str_iso": pl.String, @@ -55,7 +58,6 @@ def test_duration_cast() -> None: timedelta(days=2, hours=23, seconds=4975, milliseconds=1), timedelta(hours=1, seconds=1, milliseconds=1), timedelta(seconds=-42, milliseconds=-42), - timedelta(days=-1), None, ], "td_int": [ @@ -64,7 +66,6 @@ def test_duration_cast() -> None: 260575001000, 3601001001, -42042000, - -86400000000, None, ], "td_str_iso": [ @@ -73,7 +74,6 @@ def test_duration_cast() -> None: "P3DT22M55.001S", "PT1H1.001001S", "-PT42.042S", - "-P1DT0S", None, ], "td_str_pl": [ @@ -82,13 +82,54 @@ def test_duration_cast() -> None: "3d 22m 55s 1ms", "1h 1s 1001µs", "-42s -42ms", - "-1d", None, ], }, schema_overrides={"td_ms": pl.Duration(time_unit="ms")}, ) - assert_frame_equal(expected, df_cast) + assert_frame_equal(expected, df_str) + + # individual +/- parts + df = pl.DataFrame( + { + "td_ns": [ + timedelta(weeks=1), + timedelta(days=1), + timedelta(hours=1), + timedelta(minutes=1), + timedelta(seconds=1), + timedelta(milliseconds=1), + timedelta(microseconds=1), + timedelta(seconds=0), + timedelta(microseconds=-1), + timedelta(milliseconds=-1), + timedelta(seconds=-1), + timedelta(minutes=-1), + timedelta(hours=-1), + timedelta(days=-1), + timedelta(weeks=-1), + ] + }, + schema={"td_ns": pl.Duration("ns")}, + ) + df_str = df.select(pl.col("td_ns").dt.to_string("iso")) + assert df_str["td_ns"].to_list() == [ + "P7D", + "P1D", + "PT1H", + "PT1M", + "PT1S", + "PT0.001S", + "PT0.000001S", + "PT0S", + "-PT0.000001S", + "-PT0.001S", + "-PT1S", + "-PT1M", + "-PT1H", + "-P1D", + "-P7D", + ] def test_duration_std_var() -> None: diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index ec87a45a9206..fe22bb4d5cbc 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -1131,7 +1131,7 @@ def test_temporal_to_string_iso_default() -> None: "0µs", ], "td": [ - "-P1DT42.S", + "-P1DT42S", "P13DT14H0.001001S", "PT0S", ], From 43d4cb33349fccc08e7e3e6cacd5c30e59fe915b Mon Sep 17 00:00:00 2001 From: alexander-beedie Date: Thu, 14 Nov 2024 00:43:17 +0400 Subject: [PATCH 5/5] use "Write" with `fmt_duration_string` to support Formatter and String, split out `iso_duration_string` --- .../src/chunked_array/temporal/duration.rs | 30 ++- crates/polars-core/src/fmt.rs | 202 ++++++++---------- 2 files changed, 118 insertions(+), 114 deletions(-) diff --git a/crates/polars-core/src/chunked_array/temporal/duration.rs b/crates/polars-core/src/chunked_array/temporal/duration.rs index 32e8b06cb331..afc0e7c58aa8 100644 --- a/crates/polars-core/src/chunked_array/temporal/duration.rs +++ b/crates/polars-core/src/chunked_array/temporal/duration.rs @@ -1,5 +1,5 @@ use crate::export::chrono::Duration as ChronoDuration; -use crate::fmt::fmt_duration_string; +use crate::fmt::{fmt_duration_string, iso_duration_string}; use crate::prelude::DataType::Duration; use crate::prelude::*; @@ -64,13 +64,29 @@ impl DurationChunked { /// Convert from [`Duration`] to String; note that `strftime` format /// strings are not supported, only the specifiers 'iso' and 'polars'. pub fn to_string(&self, format: &str) -> PolarsResult { + // the duration string functions below can reuse this string buffer + let mut s = String::with_capacity(32); match format { - "iso" | "polars" => { - let out: StringChunked = self - .0 - .apply_nonnull_values_generic(DataType::String, |v: i64| { - fmt_duration_string(v, self.time_unit(), format == "iso") - }); + "iso" => { + let out: StringChunked = + self.0 + .apply_nonnull_values_generic(DataType::String, |v: i64| { + s.clear(); + iso_duration_string(&mut s, v, self.time_unit()); + s.clone() + }); + Ok(out) + }, + "polars" => { + let out: StringChunked = + self.0 + .apply_nonnull_values_generic(DataType::String, |v: i64| { + s.clear(); + fmt_duration_string(&mut s, v, self.time_unit()) + .map_err(|e| polars_err!(ComputeError: "{:?}", e)) + .expect("failed to format duration"); + s.clone() + }); Ok(out) }, _ => { diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 7b9ecc8f0278..1456c75bc4cc 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -984,141 +984,129 @@ const SIZES_US: [i64; 4] = [86_400_000_000, 3_600_000_000, 60_000_000, 1_000_000 const SIZES_MS: [i64; 4] = [86_400_000, 3_600_000, 60_000, 1_000]; #[cfg(feature = "dtype-duration")] -pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { - // take the physical/integer duration value and return either a human-readable version - // of the duration (as used in the Polars frame repr) or an ISO8601 duration string. - // - // Polars: "3d 22m 55s 1ms" - // ISO: "P3DT22M55.001S" - // - // the parts (days, hours, minutes, seconds) occur in the same order in - // each string, so we use the same code to generate each of them, with - // only the separators and the 'seconds' part differing. - // - // ref: https://en.wikipedia.org/wiki/ISO_8601#Durations +pub fn fmt_duration_string(f: &mut W, v: i64, unit: TimeUnit) -> fmt::Result { + // take the physical/integer duration value and return a + // friendly/readable duration string, eg: "3d 22m 55s 1ms" if v == 0 { - return if iso { - "PT0S".to_string() - } else { - match unit { - TimeUnit::Nanoseconds => "0ns".to_string(), - TimeUnit::Microseconds => "0µs".to_string(), - TimeUnit::Milliseconds => "0ms".to_string(), - } + return match unit { + TimeUnit::Nanoseconds => f.write_str("0ns"), + TimeUnit::Microseconds => f.write_str("0µs"), + TimeUnit::Milliseconds => f.write_str("0ms"), }; }; + // iterate over dtype-specific sizes to appropriately scale + // and extract 'days', 'hours', 'minutes', and 'seconds' parts. let sizes = match unit { TimeUnit::Nanoseconds => SIZES_NS.as_slice(), TimeUnit::Microseconds => SIZES_US.as_slice(), TimeUnit::Milliseconds => SIZES_MS.as_slice(), }; - - let mut s = String::with_capacity(32); let mut buffer = itoa::Buffer::new(); - let mut wrote_part = false; - if iso { - if v < 0 { - // negative sign before "P" indicates that the entire ISO duration is negative. - // the Polars version applies a negative sign to each *individual* part. - s.push_str("-P"); - v = v.abs() + for (i, &size) in sizes.iter().enumerate() { + let whole_num = if i == 0 { + v / size } else { - s.push('P'); + (v % sizes[i - 1]) / size + }; + if whole_num != 0 { + f.write_str(buffer.format(whole_num))?; + f.write_str(DURATION_PARTS[i])?; + if v % size != 0 { + f.write_char(' ')?; + } } + } + // write fractional seconds as integer nano/micro/milliseconds. + let (v, units) = match unit { + TimeUnit::Nanoseconds => (v % 1_000_000_000, ["ns", "µs", "ms"]), + TimeUnit::Microseconds => (v % 1_000_000, ["µs", "ms", ""]), + TimeUnit::Milliseconds => (v % 1_000, ["ms", "", ""]), }; + if v != 0 { + let (value, suffix) = if v % 1_000 != 0 { + (v, units[0]) + } else if v % 1_000_000 != 0 { + (v / 1_000, units[1]) + } else { + (v / 1_000_000, units[2]) + }; + f.write_str(buffer.format(value))?; + f.write_str(suffix)?; + } + Ok(()) +} + +#[cfg(feature = "dtype-duration")] +pub fn iso_duration_string(s: &mut String, mut v: i64, unit: TimeUnit) { + if v == 0 { + s.push_str("PT0S"); + return; + } + let mut buffer = itoa::Buffer::new(); + let mut wrote_part = false; + if v < 0 { + // negative sign before "P" indicates entire ISO duration is negative. + s.push_str("-P"); + v = v.abs(); + } else { + s.push('P'); + } // iterate over dtype-specific sizes to appropriately scale // and extract 'days', 'hours', 'minutes', and 'seconds' parts. + let sizes = match unit { + TimeUnit::Nanoseconds => SIZES_NS.as_slice(), + TimeUnit::Microseconds => SIZES_US.as_slice(), + TimeUnit::Milliseconds => SIZES_MS.as_slice(), + }; for (i, &size) in sizes.iter().enumerate() { let whole_num = if i == 0 { v / size } else { (v % sizes[i - 1]) / size }; - if whole_num != 0 || (iso && i == 3) { - if iso { - if i != 3 { - // days, hours, minutes - s.push_str(buffer.format(whole_num)); - s.push_str(ISO_DURATION_PARTS[i]); + if whole_num != 0 || i == 3 { + if i != 3 { + // days, hours, minutes + s.push_str(buffer.format(whole_num)); + s.push_str(ISO_DURATION_PARTS[i]); + } else { + // (index 3 => 'seconds' part): the ISO version writes + // fractional seconds, not integer nano/micro/milliseconds. + // if zero, only write out if no other parts written yet. + let fractional_part = v % size; + if whole_num == 0 && fractional_part == 0 { + if !wrote_part { + s.push_str("0S") + } } else { - // (index 3 => 'seconds' part): the ISO version writes - // fractional seconds, not integer nano/micro/milliseconds. - // if zero, only write out if no other parts written yet. - let fractional_part = v % size; - if whole_num == 0 && fractional_part == 0 { - if !wrote_part { - s.push_str("0S") - } - } else { - s.push_str(buffer.format(whole_num)); - if fractional_part != 0 { - let secs = match unit { - TimeUnit::Nanoseconds => format!(".{:09}", fractional_part), - TimeUnit::Microseconds => format!(".{:06}", fractional_part), - TimeUnit::Milliseconds => format!(".{:03}", fractional_part), - }; - s.push_str(secs.trim_end_matches('0')); - } - s.push_str(ISO_DURATION_PARTS[i]); + s.push_str(buffer.format(whole_num)); + if fractional_part != 0 { + let secs = match unit { + TimeUnit::Nanoseconds => format!(".{:09}", fractional_part), + TimeUnit::Microseconds => format!(".{:06}", fractional_part), + TimeUnit::Milliseconds => format!(".{:03}", fractional_part), + }; + s.push_str(secs.trim_end_matches('0')); } + s.push_str(ISO_DURATION_PARTS[i]); } - // (index 0 => 'days' part): after writing days above (if non-zero) - // the ISO duration string requires a `T` before the time part. - if i == 0 { - s.push('T'); - } - } else { - s.push_str(buffer.format(whole_num)); - s.push_str(DURATION_PARTS[i]); - if v % size != 0 { - s.push(' '); - } + } + // (index 0 => 'days' part): after writing days above (if non-zero) + // the ISO duration string requires a `T` before the time part. + if i == 0 { + s.push('T'); } wrote_part = true; - } else if iso && i == 0 { + } else if i == 0 { // always need to write the `T` separator for ISO // durations, even if there is no 'days' part. s.push('T'); } } - if iso { - // if there was only a 'days' component, no need for time separator. - if s.ends_with('T') { - s.pop(); - } - } else { - // write out fractional seconds as integer nano/micro/milliseconds. - match unit { - TimeUnit::Nanoseconds => { - if v % 1000 != 0 { - s.push_str(buffer.format(v % 1_000_000_000)); - s.push_str("ns"); - } else if v % 1_000_000 != 0 { - s.push_str(buffer.format((v % 1_000_000_000) / 1000)); - s.push_str("µs"); - } else if v % 1_000_000_000 != 0 { - s.push_str(buffer.format((v % 1_000_000_000) / 1_000_000)); - s.push_str("ms"); - } - }, - TimeUnit::Microseconds => { - if v % 1000 != 0 { - s.push_str(buffer.format(v % 1_000_000)); - s.push_str("µs"); - } else if v % 1_000_000 != 0 { - s.push_str(buffer.format((v % 1_000_000) / 1_000)); - s.push_str("ms"); - } - }, - TimeUnit::Milliseconds => { - if v % 1000 != 0 { - s.push_str(buffer.format(v % 1_000)); - s.push_str("ms"); - } - }, - } + // if there was only a 'days' component, no need for time separator. + if s.ends_with('T') { + s.pop(); } - s } fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result { @@ -1133,9 +1121,9 @@ fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result { } } if bytes.len() > width { - write!(f, "\"…")?; + f.write_str("\"…")?; } else { - write!(f, "\"")?; + f.write_str("\"")?; } Ok(()) } @@ -1169,7 +1157,7 @@ impl Display for AnyValue<'_> { fmt_datetime(f, *v, *tu, tz.as_ref().map(|v| v.as_ref())) }, #[cfg(feature = "dtype-duration")] - AnyValue::Duration(v, tu) => write!(f, "{}", fmt_duration_string(*v, *tu, false)), + AnyValue::Duration(v, tu) => fmt_duration_string(f, *v, *tu), #[cfg(feature = "dtype-time")] AnyValue::Time(_) => { let nt: chrono::NaiveTime = self.into();