From 07019781e2992eaaf785a93429da692f3469cd3d Mon Sep 17 00:00:00 2001 From: saschahofmann Date: Wed, 24 Apr 2024 16:23:46 +0200 Subject: [PATCH] feat: add to_date function to StringValue (#9030) ## Description of changes * Adds a `to_date` method to string types that accepts a format string and parses the string into a Date. * As a fallback, parses the string to a timestamp and extracts the date from it, like this ```python def visit_StringToDate(self, op, *, arg, format_str): return self.f.date(self.f.str_to_time(arg, format_str)) ``` * Implements native functions for bigquery, clickhouse, MySQL, oracle, postgres, and snowflake ## Issues closed Implements half of #8908 --------- Co-authored-by: Gil Forsyth --- ibis/backends/clickhouse/compiler.py | 1 + ibis/backends/datafusion/compiler.py | 1 + ibis/backends/druid/compiler.py | 1 + ibis/backends/exasol/compiler.py | 1 + ibis/backends/flink/compiler.py | 1 + ibis/backends/mssql/compiler.py | 1 + ibis/backends/polars/compiler.py | 9 ++++ ibis/backends/snowflake/compiler.py | 1 + ibis/backends/sql/compiler.py | 2 +- ibis/backends/sqlite/compiler.py | 1 + ibis/backends/tests/test_temporal.py | 79 ++++++++++++++++++++++++++++ ibis/expr/operations/temporal.py | 9 ++++ ibis/expr/types/strings.py | 29 ++++++++++ 13 files changed, 135 insertions(+), 1 deletion(-) diff --git a/ibis/backends/clickhouse/compiler.py b/ibis/backends/clickhouse/compiler.py index ef3f03abb538..f9b7ba2a8ff0 100644 --- a/ibis/backends/clickhouse/compiler.py +++ b/ibis/backends/clickhouse/compiler.py @@ -37,6 +37,7 @@ class ClickHouseCompiler(SQLGlotCompiler): ops.Time, ops.TimeDelta, ops.StringToTimestamp, + ops.StringToDate, ops.Levenshtein, ) ) diff --git a/ibis/backends/datafusion/compiler.py b/ibis/backends/datafusion/compiler.py index 35ce655f30b7..74a9adbc4f99 100644 --- a/ibis/backends/datafusion/compiler.py +++ b/ibis/backends/datafusion/compiler.py @@ -59,6 +59,7 @@ class DataFusionCompiler(SQLGlotCompiler): ops.TimestampNow, ops.TypeOf, ops.Unnest, + ops.StringToDate, 
ops.StringToTimestamp, ) ) diff --git a/ibis/backends/druid/compiler.py b/ibis/backends/druid/compiler.py index ce4b3db9a26f..59e0d9e0c5dc 100644 --- a/ibis/backends/druid/compiler.py +++ b/ibis/backends/druid/compiler.py @@ -67,6 +67,7 @@ class DruidCompiler(SQLGlotCompiler): ops.Strftime, ops.StringAscii, ops.StringSplit, + ops.StringToDate, ops.StringToTimestamp, ops.TimeDelta, ops.TimestampBucket, diff --git a/ibis/backends/exasol/compiler.py b/ibis/backends/exasol/compiler.py index 1fff079b564e..488c93400baa 100644 --- a/ibis/backends/exasol/compiler.py +++ b/ibis/backends/exasol/compiler.py @@ -78,6 +78,7 @@ class ExasolCompiler(SQLGlotCompiler): ops.Strftime, ops.StringJoin, ops.StringSplit, + ops.StringToDate, ops.StringToTimestamp, ops.TimeDelta, ops.TimestampAdd, diff --git a/ibis/backends/flink/compiler.py b/ibis/backends/flink/compiler.py index 2ba669c761f4..541d21f96a28 100644 --- a/ibis/backends/flink/compiler.py +++ b/ibis/backends/flink/compiler.py @@ -82,6 +82,7 @@ class FlinkCompiler(SQLGlotCompiler): ops.RegexSearch: "regexp", ops.StrRight: "right", ops.StringLength: "char_length", + ops.StringToDate: "to_date", ops.StringToTimestamp: "to_timestamp", ops.Strip: "trim", ops.TypeOf: "typeof", diff --git a/ibis/backends/mssql/compiler.py b/ibis/backends/mssql/compiler.py index af7f7fbe6b32..957e12646dc4 100644 --- a/ibis/backends/mssql/compiler.py +++ b/ibis/backends/mssql/compiler.py @@ -113,6 +113,7 @@ class MSSQLCompiler(SQLGlotCompiler): ops.RPad, ops.StartsWith, ops.StringSplit, + ops.StringToDate, ops.StringToTimestamp, ops.StructColumn, ops.TimestampAdd, diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index efd27ed1f634..c28e224c19c2 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -936,6 +936,15 @@ def interval_from_integer(op, **kw): return _make_duration(arg, dt.Interval(unit=op.unit)) +@translate.register(ops.StringToDate) +def string_to_date(op, **kw): + arg = 
translate(op.arg, **kw) + return arg.str.strptime( + dtype=pl.Date, + format=_literal_value(op.format_str), + ) + + @translate.register(ops.StringToTimestamp) def string_to_timestamp(op, **kw): arg = translate(op.arg, **kw) diff --git a/ibis/backends/snowflake/compiler.py b/ibis/backends/snowflake/compiler.py index 75bfd7ce6674..c0915d0598cc 100644 --- a/ibis/backends/snowflake/compiler.py +++ b/ibis/backends/snowflake/compiler.py @@ -78,6 +78,7 @@ class SnowflakeCompiler(SQLGlotCompiler): ops.Hash: "hash", ops.Median: "median", ops.Mode: "mode", + ops.StringToDate: "to_date", ops.StringToTimestamp: "to_timestamp_tz", ops.TimeFromHMS: "time_from_parts", ops.TimestampFromYMDHMS: "timestamp_from_parts", diff --git a/ibis/backends/sql/compiler.py b/ibis/backends/sql/compiler.py index 71d57b7481aa..89bc0db53f5e 100644 --- a/ibis/backends/sql/compiler.py +++ b/ibis/backends/sql/compiler.py @@ -277,6 +277,7 @@ class SQLGlotCompiler(abc.ABC): ops.StringLength: "length", ops.StringReplace: "replace", ops.StringSplit: "split", + ops.StringToDate: "str_to_date", ops.StringToTimestamp: "str_to_time", ops.Tan: "tan", ops.Translate: "translate", @@ -801,7 +802,6 @@ def visit_IntervalFromInteger(self, op, *, arg, unit): ) ### String Instruments - def visit_Strip(self, op, *, arg): return self.f.trim(arg, string.whitespace) diff --git a/ibis/backends/sqlite/compiler.py b/ibis/backends/sqlite/compiler.py index d680b38c66a2..077510239cc6 100644 --- a/ibis/backends/sqlite/compiler.py +++ b/ibis/backends/sqlite/compiler.py @@ -66,6 +66,7 @@ class SQLiteCompiler(SQLGlotCompiler): ops.TimestampAdd, ops.TimestampSub, ops.TimestampDiff, + ops.StringToDate, ops.StringToTimestamp, ops.TimeDelta, ops.DateDelta, diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py index bf45776e586d..be6b559a3ab9 100644 --- a/ibis/backends/tests/test_temporal.py +++ b/ibis/backends/tests/test_temporal.py @@ -1441,6 +1441,85 @@ def test_string_to_timestamp(alltypes, fmt): 
assert val.strftime("%m/%d/%y") == result["date_string_col"][i] +@pytest.mark.parametrize( + "fmt", + [ + # "11/01/10" - "month/day/year" + param( + "%m/%d/%y", + id="mysql_format", + marks=[ + pytest.mark.never( + ["snowflake"], + reason=( + "(snowflake.connector.errors.ProgrammingError) 100096 (22007): " + "Can't parse '11/01/10' as timestamp with format '%m/%d/%y'" + ), + raises=SnowflakeProgrammingError, + ), + pytest.mark.never( + ["flink"], + raises=ValueError, + reason="Datetime formatting style is not supported.", + ), + ], + ), + param( + "MM/dd/yy", + id="pyspark_format", + marks=[ + pytest.mark.never( + ["bigquery"], + reason="400 Mismatch between format character 'M' and string character '0'", + raises=GoogleBadRequest, + ), + pytest.mark.never( + ["mysql"], + reason="NaTType does not support strftime", + raises=ValueError, + ), + pytest.mark.never( + ["trino"], + reason="datetime formatting style not supported", + raises=TrinoUserError, + ), + pytest.mark.never( + ["polars"], + reason="datetime formatting style not supported", + raises=PolarsComputeError, + ), + pytest.mark.never( + ["duckdb"], + reason="datetime formatting style not supported", + raises=DuckDBInvalidInputException, + ), + ], + ), + ], +) +@pytest.mark.notimpl( + [ + "dask", + "pandas", + "clickhouse", + "sqlite", + "datafusion", + "mssql", + "druid", + ], + raises=com.OperationNotDefinedError, +) +@pytest.mark.notimpl(["exasol"], raises=com.OperationNotDefinedError) +def test_string_to_date(alltypes, fmt): + table = alltypes + result = table.mutate(date=table.date_string_col.to_date(fmt)).execute() + + # TEST: do we get the same date out, that we put in? 
+ # format string assumes that we are using pandas' strftime + for i, val in enumerate(result["date"]): + assert val.strftime("%m/%d/%y") == result["date_string_col"][i] + + @pytest.mark.parametrize( ("date", "expected_index", "expected_day"), [ diff --git a/ibis/expr/operations/temporal.py b/ibis/expr/operations/temporal.py index 5eea8d969a76..a500dd017400 100644 --- a/ibis/expr/operations/temporal.py +++ b/ibis/expr/operations/temporal.py @@ -79,6 +79,15 @@ class StringToTimestamp(Value): dtype = dt.Timestamp(timezone="UTC") +@public +class StringToDate(Value): + arg: Value[dt.String] + format_str: Value[dt.String] + + shape = rlz.shape_like("arg") + dtype = dt.date + + @public class ExtractTemporalField(TemporalUnary): dtype = dt.int32 diff --git a/ibis/expr/types/strings.py b/ibis/expr/types/strings.py index c2e8c3dda17f..410f3f74b851 100644 --- a/ibis/expr/types/strings.py +++ b/ibis/expr/types/strings.py @@ -1302,6 +1302,35 @@ def to_timestamp(self, format_str: str) -> ir.TimestampValue: """ return ops.StringToTimestamp(self, format_str).to_expr() + def to_date(self, format_str: str) -> ir.DateValue: + """Parse a string and return a date. + + Parameters + ---------- + format_str + Format string in `strptime` format + + Returns + ------- + DateValue + Parsed date value + + Examples + -------- + >>> import ibis + >>> ibis.options.interactive = True + >>> t = ibis.memtable({"ts": ["20170206"]}) + >>> t.ts.to_date("%Y%m%d") + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ StringToDate(ts, '%Y%m%d') ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ + │ date │ + ├────────────────────────────┤ + │ 2017-02-06 │ + └────────────────────────────┘ + """ + return ops.StringToDate(self, format_str).to_expr() + def protocol(self): """Parse a URL and extract protocol.