Skip to content

Commit

Permalink
feat(rust, python): add utc parameter to strptime (#6496)
Browse files Browse the repository at this point in the history
Co-authored-by: MarcoGorelli <>
  • Loading branch information
MarcoGorelli authored Jan 28, 2023
1 parent d7b27d8 commit dfca44c
Show file tree
Hide file tree
Showing 9 changed files with 88 additions and 15 deletions.
2 changes: 1 addition & 1 deletion polars/polars-io/src/csv/read_impl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pub(crate) fn cast_columns(
(Utf8, Datetime(tu, _)) => s
.utf8()
.unwrap()
.as_datetime(None, *tu, false, false)
.as_datetime(None, *tu, false, false, false)
.map(|ca| ca.into_series()),
(_, dt) => s.cast(dt),
};
Expand Down
10 changes: 8 additions & 2 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,8 +321,14 @@ pub(super) fn strptime(s: &Series, options: &StrpTimeOptions) -> PolarsResult<Se
}
DataType::Datetime(tu, _) => {
if options.exact {
ca.as_datetime(options.fmt.as_deref(), *tu, options.cache, options.tz_aware)?
.into_series()
ca.as_datetime(
options.fmt.as_deref(),
*tu,
options.cache,
options.tz_aware,
options.utc,
)?
.into_series()
} else {
ca.as_datetime_not_exact(options.fmt.as_deref(), *tu)?
.into_series()
Expand Down
3 changes: 3 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ pub struct StrpTimeOptions {
pub cache: bool,
/// Parse a timezone aware timestamp
pub tz_aware: bool,
/// Convert timezone aware to UTC
pub utc: bool,
}

impl Default for StrpTimeOptions {
Expand All @@ -32,6 +34,7 @@ impl Default for StrpTimeOptions {
exact: false,
cache: true,
tz_aware: false,
utc: false,
}
}
}
Expand Down
30 changes: 21 additions & 9 deletions polars/polars-time/src/chunkedarray/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ pub trait Utf8Methods: AsUtf8 {
tu: TimeUnit,
cache: bool,
mut tz_aware: bool,
utc: bool,
) -> PolarsResult<DatetimeChunked> {
let utf8_ca = self.as_utf8();
let fmt = match fmt {
Expand All @@ -402,6 +403,11 @@ pub trait Utf8Methods: AsUtf8 {
if TZ_AWARE_RE.is_match(fmt) {
tz_aware = true;
}
if !tz_aware && utc {
return Err(PolarsError::ComputeError(
"Cannot use 'utc=True' with tz-naive data. Parse the data as naive, and then use `.dt.with_time_zone('UTC')".into(),
));
}
let fmt = self::strptime::compile_fmt(fmt);
let cache = cache && utf8_ca.len() > 50;

Expand All @@ -421,13 +427,15 @@ pub trait Utf8Methods: AsUtf8 {

let mut convert = |s: &str| {
DateTime::parse_from_str(s, &fmt).ok().map(|dt| {
match tz {
None => tz = Some(dt.timezone()),
Some(tz_found) => {
if tz_found != dt.timezone() {
return Err(PolarsError::ComputeError(
"Different timezones found during 'strptime' operation.".into(),
));
if !utc {
match tz {
None => tz = Some(dt.timezone()),
Some(tz_found) => {
if tz_found != dt.timezone() {
return Err(PolarsError::ComputeError(
"Different timezones found during 'strptime' operation. You might want to use `utc=True` and then set the time zone after parsing".into()
));
}
}
}
}
Expand Down Expand Up @@ -459,9 +467,13 @@ pub trait Utf8Methods: AsUtf8 {
})
.collect::<PolarsResult<_>>()?;

let tz = tz.map(|of| format!("{of}"));
ca.rename(utf8_ca.name());
Ok(ca.into_datetime(tu, tz))
if !utc {
let tz = tz.map(|of| format!("{of}"));
Ok(ca.into_datetime(tu, tz))
} else {
Ok(ca.into_datetime(tu, Some("UTC".to_string())))
}
}
#[cfg(not(feature = "timezones"))]
{
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-time/src/groupby/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ mod test {
"2020-01-08 23:16:43",
],
)
.as_datetime(None, tu, false, false)?
.as_datetime(None, tu, false, false, false)?
.into_series();
let a = Series::new("a", [3, 7, 5, 9, 2, 1]);
let df = DataFrame::new(vec![date, a.clone()])?;
Expand Down Expand Up @@ -546,7 +546,7 @@ mod test {
"2020-01-08 23:16:43",
],
)
.as_datetime(None, TimeUnit::Milliseconds, false, false)?
.as_datetime(None, TimeUnit::Milliseconds, false, false, false)?
.into_series();
let a = Series::new("a", [3, 7, 5, 9, 2, 1]);
let df = DataFrame::new(vec![date, a.clone()])?;
Expand Down
8 changes: 7 additions & 1 deletion py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def strptime(
exact: bool = True,
cache: bool = True,
tz_aware: bool = False,
utc: bool = False,
) -> pli.Expr:
"""
Parse a Utf8 expression to a Date/Datetime/Time type.
Expand All @@ -57,6 +58,9 @@ def strptime(
tz_aware
Parse timezone aware datetimes. This may be automatically toggled by the
'fmt' given.
utc
Parse timezone aware datetimes as UTC. This may be useful if you have data
with mixed offsets.
Notes
-----
Expand Down Expand Up @@ -109,7 +113,9 @@ def strptime(
elif datatype == Datetime:
tu = datatype.tu # type: ignore[union-attr]
dtcol = pli.wrap_expr(
self._pyexpr.str_parse_datetime(fmt, strict, exact, cache, tz_aware, tu)
self._pyexpr.str_parse_datetime(
fmt, strict, exact, cache, tz_aware, utc, tu
)
)
return dtcol if (tu is None) else dtcol.dt.cast_time_unit(tu)
elif datatype == Time:
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/internals/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def strptime(
exact: bool = True,
cache: bool = True,
tz_aware: bool = False,
utc: bool = False,
) -> pli.Series:
"""
Parse a Series of dtype Utf8 to a Date/Datetime Series.
Expand All @@ -54,6 +55,9 @@ def strptime(
tz_aware
Parse timezone aware datetimes. This may be automatically toggled by the
'fmt' given.
utc
Parse timezone aware datetimes as UTC. This may be useful if you have data
with mixed offsets.
Returns
-------
Expand Down
5 changes: 5 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -551,17 +551,20 @@ impl PyExpr {
exact,
cache,
tz_aware: false,
utc: false,
})
.into()
}

#[allow(clippy::too_many_arguments)]
pub fn str_parse_datetime(
&self,
fmt: Option<String>,
strict: bool,
exact: bool,
cache: bool,
tz_aware: bool,
utc: bool,
tu: Option<Wrap<TimeUnit>>,
) -> PyExpr {
let result_tu = match (&fmt, tu) {
Expand Down Expand Up @@ -591,6 +594,7 @@ impl PyExpr {
exact,
cache,
tz_aware,
utc,
})
.into()
}
Expand All @@ -612,6 +616,7 @@ impl PyExpr {
exact,
cache,
tz_aware: false,
utc: false,
})
.into()
}
Expand Down
37 changes: 37 additions & 0 deletions py-polars/tests/unit/test_datelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -2658,3 +2658,40 @@ def test_infer_iso8601(iso8601_format: str) -> None:
assert parsed.dt.nanosecond().item() == 123456000
if "%3f" in iso8601_format:
assert parsed.dt.nanosecond().item() == 123000000


@pytest.mark.parametrize("fmt", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst(fmt: str) -> None:
    """Strings carrying different UTC offsets parse to UTC datetimes when ``utc=True``."""
    utc_zone = zoneinfo.ZoneInfo(key="UTC")
    raw_stamps = ["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"]
    # The +01:00 and +02:00 inputs are expected one and two hours earlier in UTC.
    expected = [
        datetime(2021, 3, 27, 22, 59, 59, tzinfo=utc_zone),
        datetime(2021, 3, 28, 21, 59, 59, tzinfo=utc_zone),
    ]

    parsed = pl.Series(raw_stamps).str.strptime(pl.Datetime, fmt, utc=True)

    for idx, want in enumerate(expected):
        assert parsed[idx] == want


@pytest.mark.parametrize("fmt", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst_tz_aware(fmt: str) -> None:
    """Mixed offsets without ``utc=True`` must raise and point the user at ``utc=True``."""
    ts = ["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"]
    # `match` is a regular expression: the sentence-ending dot is escaped so the
    # anchored pattern pins the literal message instead of accepting any character.
    expected_message = (
        r"^Different timezones found during 'strptime' operation\. "
        r"You might want to use `utc=True` and then set the time zone after parsing$"
    )
    with pytest.raises(ComputeError, match=expected_message):
        pl.Series(ts).str.strptime(pl.Datetime, fmt, utc=False)


def test_utc_with_tz_naive() -> None:
    """Requesting ``utc=True`` for a format without an offset must raise ComputeError."""
    ts = ["2021-03-27T23:59:59", "2021-03-28T23:59:59"]
    # `match` is a regular expression: every literal dot is escaped (the
    # parentheses already were) so the pattern only accepts the exact wording.
    expected_message = (
        r"Cannot use 'utc=True' with tz-naive data\. "
        r"Parse the data as naive, and then use `\.dt\.with_time_zone\('UTC'\)"
    )
    with pytest.raises(ComputeError, match=expected_message):
        pl.Series(ts).str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S", utc=True)

0 comments on commit dfca44c

Please sign in to comment.