Skip to content

Commit

Permalink
fix(rust, python): raise if to_datetime would have parsed input incor…
Browse files Browse the repository at this point in the history
…rectly (pola-rs#9675)
  • Loading branch information
MarcoGorelli authored and c-peters committed Jul 14, 2023
1 parent ed58c7a commit 90c988c
Show file tree
Hide file tree
Showing 4 changed files with 163 additions and 12 deletions.
30 changes: 21 additions & 9 deletions polars/polars-time/src/chunkedarray/utf8/strptime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ use regex::Regex;

use crate::chunkedarray::{polars_bail, PolarsResult};

static HOUR_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"%[HI]").unwrap());
static MINUTE_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"%M").unwrap());
static HOUR_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"%[_-]?[HkIl]").unwrap());
static MINUTE_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"%[_-]?M").unwrap());
static SECOND_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"%[_-]?S").unwrap());
static TWELVE_HOUR_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"%[_-]?[Il]").unwrap());
static MERIDIEM_PATTERN: Lazy<Regex> = Lazy::new(|| Regex::new(r"%[_-]?[pP]").unwrap());

#[inline]
fn update_and_parse<T: atoi::FromRadix10>(
Expand Down Expand Up @@ -52,14 +55,23 @@ fn parse_month_abbrev(val: &[u8], offset: usize) -> Option<(u32, usize)> {
/// E.g. chrono supports single letter date identifiers like %F, whereas polars only consumes
/// year, day, month distinctively with %Y, %d, %m.
pub(super) fn compile_fmt(fmt: &str) -> PolarsResult<String> {
if HOUR_PATTERN.is_match(fmt) & !MINUTE_PATTERN.is_match(fmt) {
// (hopefully) temporary hack. Ideally, chrono would return a ParseKindError indicating
// if `fmt` is too long for NaiveDate. If that's implemented, then this check could
// be removed, and that error could be matched against in `transform_datetime_*s`
// See https://github.com/chronotope/chrono/issues/1075.
polars_bail!(ComputeError: "Invalid format string: found hour, but no minute directive. \
Please either specify both or neither.");
// (hopefully) temporary hacks. Ideally, chrono would return a ParseKindError indicating
// if `fmt` is too long for NaiveDate. If that's implemented, then this check could
// be removed, and that error could be matched against in `transform_datetime_*s`
// See https://github.com/chronotope/chrono/issues/1075.
if HOUR_PATTERN.is_match(fmt) ^ MINUTE_PATTERN.is_match(fmt) {
polars_bail!(ComputeError: "Invalid format string: \
Please either specify both hour and minute, or neither.");
}
if SECOND_PATTERN.is_match(fmt) && !HOUR_PATTERN.is_match(fmt) {
polars_bail!(ComputeError: "Invalid format string: \
Found seconds directive, but no hours directive.");
}
if TWELVE_HOUR_PATTERN.is_match(fmt) ^ MERIDIEM_PATTERN.is_match(fmt) {
polars_bail!(ComputeError: "Invalid format string: \
Please either specify both 12-hour directive and meridiem directive, or neither.");
}

Ok(fmt
.replace("%D", "%m/%d/%y")
.replace("%R", "%H:%M")
Expand Down
35 changes: 35 additions & 0 deletions py-polars/polars/testing/parametric/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
integers,
lists,
sampled_from,
sets,
text,
timedeltas,
times,
Expand Down Expand Up @@ -130,6 +131,40 @@ def strategy_decimal(draw: DrawFn) -> PyDecimal:
)


@composite
def strategy_datetime_format(draw: DrawFn) -> str:
"""Draw a random datetime format string."""
fmt = draw(
sets(
sampled_from(
[
"%m",
"%b",
"%B",
"%d",
"%j",
"%a",
"%A",
"%w",
"%H",
"%I",
"%p",
"%M",
"%S",
"%U",
"%W",
"%%",
]
),
)
)

# Make sure year is always present
fmt.add("%Y")

return " ".join(fmt)


class StrategyLookup(MutableMapping[PolarsDataType, SearchStrategy[Any]]):
"""
Mapping from polars DataTypes to hypothesis Strategies.
Expand Down
44 changes: 44 additions & 0 deletions py-polars/tests/parametric/time_series/test_to_datetime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from datetime import datetime

import hypothesis.strategies as st
from hypothesis import given

import polars as pl
from polars.exceptions import ComputeError
from polars.testing.parametric.strategies import strategy_datetime_format


@given(
datetimes=st.datetimes(
min_value=datetime(2000, 1, 1), max_value=datetime(9999, 12, 31)
),
fmt=strategy_datetime_format(),
)
def test_to_datetime(datetimes: datetime, fmt: str) -> None:
input = datetimes.strftime(fmt)
expected = datetime.strptime(input, fmt)
try:
result = pl.Series([input]).str.to_datetime(format=fmt).item()
except ComputeError as exc:
# If there's an exception, check that it's either:
# - something which polars can't parse at all: missing day or month
# - something on which polars intentionally raises
assert ( # noqa: PT017
(
(("%H" in fmt) ^ ("%M" in fmt))
or (("%I" in fmt) ^ ("%M" in fmt))
or ("%S" in fmt and "%H" not in fmt)
or ("%S" in fmt and "%I" not in fmt)
or (("%I" in fmt) ^ ("%p" in fmt))
or (("%H" in fmt) ^ ("%p" in fmt))
)
and "Invalid format string" in str(exc)
) or (
(
not any(day in fmt for day in ("%d", "%j"))
or not any(month in fmt for month in ("%b", "%B", "%m"))
)
and "strict conversion to datetimes failed" in str(exc)
)
else:
assert result == expected
66 changes: 63 additions & 3 deletions py-polars/tests/unit/namespaces/test_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -528,12 +528,72 @@ def test_strptime_subseconds_datetime(data: str, format: str, expected: time) ->
assert result == expected


def test_strptime_hour_without_minute_8849() -> None:
@pytest.mark.parametrize(
("string", "fmt"),
[
pytest.param("2023-05-04|7", "%Y-%m-%d|%H", id="hour but no minute"),
pytest.param("2023-05-04|7", "%Y-%m-%d|%k", id="padded hour but no minute"),
pytest.param("2023-05-04|10", "%Y-%m-%d|%M", id="minute but no hour"),
pytest.param("2023-05-04|10", "%Y-%m-%d|%S", id="second but no hour"),
pytest.param(
"2000-Jan-01 01 00 01", "%Y-%b-%d %I %M %S", id="12-hour clock but no AM/PM"
),
pytest.param(
"2000-Jan-01 01 00 01",
"%Y-%b-%d %l %M %S",
id="padded 12-hour clock but no AM/PM",
),
],
)
def test_strptime_incomplete_formats(string: str, fmt: str) -> None:
with pytest.raises(
ComputeError,
match="Invalid format string: found hour, but no minute directive",
match="Invalid format string",
):
pl.Series(["2023-05-04|7", "2023-05-04|10"]).str.to_datetime("%Y-%m-%d|%H")
pl.Series([string]).str.to_datetime(fmt)


@pytest.mark.parametrize(
("string", "fmt", "expected"),
[
("2023-05-04|7:3", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 7, 3)),
("2023-05-04|10:03", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 10, 3)),
(
"2000-Jan-01 01 00 01 am",
"%Y-%b-%d %I %M %S %P",
datetime(2000, 1, 1, 1, 0, 1),
),
(
"2000-Jan-01 01 00 01 am",
"%Y-%b-%d %_I %M %S %P",
datetime(2000, 1, 1, 1, 0, 1),
),
(
"2000-Jan-01 01 00 01 am",
"%Y-%b-%d %l %M %S %P",
datetime(2000, 1, 1, 1, 0, 1),
),
(
"2000-Jan-01 01 00 01 AM",
"%Y-%b-%d %I %M %S %p",
datetime(2000, 1, 1, 1, 0, 1),
),
(
"2000-Jan-01 01 00 01 AM",
"%Y-%b-%d %_I %M %S %p",
datetime(2000, 1, 1, 1, 0, 1),
),
(
"2000-Jan-01 01 00 01 AM",
"%Y-%b-%d %l %M %S %p",
datetime(2000, 1, 1, 1, 0, 1),
),
],
)
def test_strptime_complete_formats(string: str, fmt: str, expected: datetime) -> None:
# Similar to the above, but these formats are complete and should work
result = pl.Series([string]).str.to_datetime(fmt).item()
assert result == expected


@pytest.mark.parametrize(
Expand Down

0 comments on commit 90c988c

Please sign in to comment.