Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(rust, python): add utc parameter to strptime #6496

Merged
merged 3 commits into from
Jan 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion polars/polars-io/src/csv/read_impl/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ pub(crate) fn cast_columns(
(Utf8, Datetime(tu, _)) => s
.utf8()
.unwrap()
.as_datetime(None, *tu, false, false)
.as_datetime(None, *tu, false, false, false)
.map(|ca| ca.into_series()),
(_, dt) => s.cast(dt),
};
Expand Down
10 changes: 8 additions & 2 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,8 +321,14 @@ pub(super) fn strptime(s: &Series, options: &StrpTimeOptions) -> PolarsResult<Se
}
DataType::Datetime(tu, _) => {
if options.exact {
ca.as_datetime(options.fmt.as_deref(), *tu, options.cache, options.tz_aware)?
.into_series()
ca.as_datetime(
options.fmt.as_deref(),
*tu,
options.cache,
options.tz_aware,
options.utc,
)?
.into_series()
} else {
ca.as_datetime_not_exact(options.fmt.as_deref(), *tu)?
.into_series()
Expand Down
3 changes: 3 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ pub struct StrpTimeOptions {
pub cache: bool,
/// Parse a timezone aware timestamp
pub tz_aware: bool,
/// Convert timezone aware to UTC
pub utc: bool,
}

impl Default for StrpTimeOptions {
Expand All @@ -32,6 +34,7 @@ impl Default for StrpTimeOptions {
exact: false,
cache: true,
tz_aware: false,
utc: false,
}
}
}
Expand Down
30 changes: 21 additions & 9 deletions polars/polars-time/src/chunkedarray/utf8/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ pub trait Utf8Methods: AsUtf8 {
tu: TimeUnit,
cache: bool,
mut tz_aware: bool,
utc: bool,
) -> PolarsResult<DatetimeChunked> {
let utf8_ca = self.as_utf8();
let fmt = match fmt {
Expand All @@ -402,6 +403,11 @@ pub trait Utf8Methods: AsUtf8 {
if TZ_AWARE_RE.is_match(fmt) {
tz_aware = true;
}
if !tz_aware && utc {
return Err(PolarsError::ComputeError(
"Cannot use 'utc=True' with tz-naive data. Parse the data as naive, and then use `.dt.with_time_zone('UTC')".into(),
));
}
let fmt = self::strptime::compile_fmt(fmt);
let cache = cache && utf8_ca.len() > 50;

Expand All @@ -421,13 +427,15 @@ pub trait Utf8Methods: AsUtf8 {

let mut convert = |s: &str| {
DateTime::parse_from_str(s, &fmt).ok().map(|dt| {
match tz {
None => tz = Some(dt.timezone()),
Some(tz_found) => {
if tz_found != dt.timezone() {
return Err(PolarsError::ComputeError(
"Different timezones found during 'strptime' operation.".into(),
));
if !utc {
match tz {
None => tz = Some(dt.timezone()),
Some(tz_found) => {
if tz_found != dt.timezone() {
return Err(PolarsError::ComputeError(
"Different timezones found during 'strptime' operation. You might want to use `utc=True` and then set the time zone after parsing".into()
));
}
}
}
}
Expand Down Expand Up @@ -459,9 +467,13 @@ pub trait Utf8Methods: AsUtf8 {
})
.collect::<PolarsResult<_>>()?;

let tz = tz.map(|of| format!("{of}"));
ca.rename(utf8_ca.name());
Ok(ca.into_datetime(tu, tz))
if !utc {
let tz = tz.map(|of| format!("{of}"));
Ok(ca.into_datetime(tu, tz))
} else {
Ok(ca.into_datetime(tu, Some("UTC".to_string())))
}
}
#[cfg(not(feature = "timezones"))]
{
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-time/src/groupby/dynamic.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ mod test {
"2020-01-08 23:16:43",
],
)
.as_datetime(None, tu, false, false)?
.as_datetime(None, tu, false, false, false)?
.into_series();
let a = Series::new("a", [3, 7, 5, 9, 2, 1]);
let df = DataFrame::new(vec![date, a.clone()])?;
Expand Down Expand Up @@ -546,7 +546,7 @@ mod test {
"2020-01-08 23:16:43",
],
)
.as_datetime(None, TimeUnit::Milliseconds, false, false)?
.as_datetime(None, TimeUnit::Milliseconds, false, false, false)?
.into_series();
let a = Series::new("a", [3, 7, 5, 9, 2, 1]);
let df = DataFrame::new(vec![date, a.clone()])?;
Expand Down
8 changes: 7 additions & 1 deletion py-polars/polars/internals/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def strptime(
exact: bool = True,
cache: bool = True,
tz_aware: bool = False,
utc: bool = False,
) -> pli.Expr:
"""
Parse a Utf8 expression to a Date/Datetime/Time type.
Expand All @@ -57,6 +58,9 @@ def strptime(
tz_aware
Parse timezone aware datetimes. This may be automatically toggled by the
'fmt' given.
utc
Parse timezone aware datetimes as UTC. This may be useful if you have data
with mixed offsets.

Notes
-----
Expand Down Expand Up @@ -109,7 +113,9 @@ def strptime(
elif datatype == Datetime:
tu = datatype.tu # type: ignore[union-attr]
dtcol = pli.wrap_expr(
self._pyexpr.str_parse_datetime(fmt, strict, exact, cache, tz_aware, tu)
self._pyexpr.str_parse_datetime(
fmt, strict, exact, cache, tz_aware, utc, tu
)
)
return dtcol if (tu is None) else dtcol.dt.cast_time_unit(tu)
elif datatype == Time:
Expand Down
4 changes: 4 additions & 0 deletions py-polars/polars/internals/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def strptime(
exact: bool = True,
cache: bool = True,
tz_aware: bool = False,
utc: bool = False,
) -> pli.Series:
"""
Parse a Series of dtype Utf8 to a Date/Datetime Series.
Expand All @@ -54,6 +55,9 @@ def strptime(
tz_aware
Parse timezone aware datetimes. This may be automatically toggled by the
'fmt' given.
utc
Parse timezone aware datetimes as UTC. This may be useful if you have data
with mixed offsets.

Returns
-------
Expand Down
5 changes: 5 additions & 0 deletions py-polars/src/lazy/dsl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -551,17 +551,20 @@ impl PyExpr {
exact,
cache,
tz_aware: false,
utc: false,
})
.into()
}

#[allow(clippy::too_many_arguments)]
pub fn str_parse_datetime(
&self,
fmt: Option<String>,
strict: bool,
exact: bool,
cache: bool,
tz_aware: bool,
utc: bool,
tu: Option<Wrap<TimeUnit>>,
) -> PyExpr {
let result_tu = match (&fmt, tu) {
Expand Down Expand Up @@ -591,6 +594,7 @@ impl PyExpr {
exact,
cache,
tz_aware,
utc,
})
.into()
}
Expand All @@ -612,6 +616,7 @@ impl PyExpr {
exact,
cache,
tz_aware: false,
utc: false,
})
.into()
}
Expand Down
37 changes: 37 additions & 0 deletions py-polars/tests/unit/test_datelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -2658,3 +2658,40 @@ def test_infer_iso8601(iso8601_format: str) -> None:
assert parsed.dt.nanosecond().item() == 123456000
if "%3f" in iso8601_format:
assert parsed.dt.nanosecond().item() == 123000000


@pytest.mark.parametrize("fmt", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst(fmt: str) -> None:
    """Strings with differing UTC offsets parse to UTC datetimes when ``utc=True``."""
    utc_zone = zoneinfo.ZoneInfo(key="UTC")
    raw = ["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"]
    parsed = pl.Series(raw).str.strptime(pl.Datetime, fmt, utc=True)
    # Each value is expected shifted back by its own offset (+01:00, then +02:00).
    assert parsed[0] == datetime(2021, 3, 27, 22, 59, 59, tzinfo=utc_zone)
    assert parsed[1] == datetime(2021, 3, 28, 21, 59, 59, tzinfo=utc_zone)


@pytest.mark.parametrize("fmt", ["%+", "%Y-%m-%dT%H:%M:%S%z"])
def test_crossing_dst_tz_aware(fmt: str) -> None:
    """Differing offsets with ``utc=False`` raise an error that suggests ``utc=True``."""
    expected_msg = (
        r"^Different timezones found during 'strptime' operation. "
        "You might want to use `utc=True` and then set the time zone after parsing$"
    )
    mixed_offsets = ["2021-03-27T23:59:59+01:00", "2021-03-28T23:59:59+02:00"]
    with pytest.raises(ComputeError, match=expected_msg):
        pl.Series(mixed_offsets).str.strptime(pl.Datetime, fmt, utc=False)


def test_utc_with_tz_naive() -> None:
    """Requesting ``utc=True`` for strings that carry no offset raises a ComputeError."""
    expected_msg = (
        r"Cannot use 'utc=True' with tz-naive data. "
        r"Parse the data as naive, and then use `.dt.with_time_zone\('UTC'\)"
    )
    naive_strings = ["2021-03-27T23:59:59", "2021-03-28T23:59:59"]
    with pytest.raises(ComputeError, match=expected_msg):
        pl.Series(naive_strings).str.strptime(
            pl.Datetime, "%Y-%m-%dT%H:%M:%S", utc=True
        )