fix(backends): ensure that returned date results are actually proper date values

BREAKING CHANGE: Columns with Ibis `date` types are now returned as object dtype containing `datetime.date` objects when executing with the pandas backend.
cpcloud authored and gforsyth committed Dec 19, 2023
1 parent 4fc8e75 commit 0626fb2
Showing 23 changed files with 179 additions and 95 deletions.
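A minimal sketch of the new behavior (the backend connection, table name, and data here are illustrative, not part of the commit): executing a date-typed column on the pandas backend now yields an object-dtype Series of datetime.date values instead of datetime64[ns].

import datetime

import ibis
import pandas as pd

con = ibis.pandas.connect(
    {"events": pd.DataFrame({"ts": pd.to_datetime(["2023-12-19"])})}
)
t = con.table("events")

result = t.ts.cast("date").execute()
assert result.dtype == object  # previously datetime64[ns]
assert isinstance(result[0], datetime.date)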
5 changes: 1 addition & 4 deletions ci/make_geography_db.py
@@ -56,10 +56,7 @@
     "independence": lambda row: toolz.assoc(
         row,
         "independence_date",
-        datetime.datetime.strptime(
-            row["independence_date"],
-            "%Y-%m-%d",
-        ).date(),
+        datetime.datetime.fromisoformat(row["independence_date"]).date(),
     )
 }

2 changes: 1 addition & 1 deletion ibis/backends/base/sql/registry/literal.py
@@ -62,7 +62,7 @@ def _interval_literal_format(translator, op):
 def _date_literal_format(translator, op):
     value = op.value
     if isinstance(value, datetime.date):
-        value = value.strftime("%Y-%m-%d")
+        value = value.isoformat()

     return repr(value)
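The two spellings agree for date values; isoformat() is simply the more direct way to produce the ISO 8601 string. A quick check:

import datetime

d = datetime.date(2023, 12, 19)
assert d.isoformat() == d.strftime("%Y-%m-%d") == "2023-12-19"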

14 changes: 5 additions & 9 deletions ibis/backends/clickhouse/compiler/values.py
@@ -346,18 +346,14 @@ def _literal(op, *, value, dtype, **kw):
         return interval(value, unit=dtype.resolution.upper())
     elif dtype.is_timestamp():
-        funcname = "makeDateTime"
+        funcname = "parseDateTime"

         if micros := value.microsecond:
             funcname += "64"

-        args = [
-            value.year,
-            value.month,
-            value.day,
-            value.hour,
-            value.minute,
-            value.second,
-        ]
+        funcname += "BestEffort"
+
+        args = [value.isoformat()]

         if micros % 1000:
             args.append(micros)
2 changes: 1 addition & 1 deletion ibis/backends/flink/registry.py
@@ -66,7 +66,7 @@ def _cast(translator: ExprTranslator, op: ops.generic.Cast) -> str:
             arg_translated = f"FROM_UNIXTIME({arg_translated})"

         if to.timezone:
-            return f"TO_TIMESTAMP(CONVERT_TZ(CAST({arg_translated} AS STRING), 'UTC+0', '{to.timezone}'))"
+            return f"TO_TIMESTAMP(CONVERT_TZ(CAST({arg_translated} AS STRING), 'UTC', {to.timezone!r}))"
         else:
             return f"TO_TIMESTAMP({arg_translated})"
     elif to.is_date():
31 changes: 20 additions & 11 deletions ibis/backends/flink/utils.py
@@ -8,7 +8,7 @@
 import ibis.expr.datatypes as dt
 import ibis.expr.operations as ops
 from ibis.backends.flink.datatypes import FlinkType
-from ibis.common.temporal import IntervalUnit
+from ibis.common.temporal import IntervalUnit, normalize_timezone
 from ibis.util import convert_unit

 # For details on what precisions Flink SQL interval types support, see
@@ -264,7 +264,7 @@ def translate_literal(op: ops.Literal) -> str:
         return f"x'{value.hex()}'"
     elif dtype.is_date():
         if isinstance(value, datetime.date):
-            value = value.strftime("%Y-%m-%d")
+            value = value.isoformat()
         return repr(value)
     elif dtype.is_numeric():
         if math.isnan(value):
@@ -285,15 +285,24 @@ def translate_literal(op: ops.Literal) -> str:
         return f"CAST({value} AS {FlinkType.from_ibis(dtype)!s})"
     elif dtype.is_timestamp():
         # TODO(chloeh13q): support timestamp with local timezone
-        if isinstance(value, datetime.datetime):
-            fmt = "%Y-%m-%d %H:%M:%S"
-            # datetime.datetime only supports resolution up to microseconds, even
-            # though Flink supports fractional precision up to 9 digits. We will
-            # need to use numpy or pandas datetime types for higher resolutions.
-            if value.microsecond:
-                fmt += ".%f"
-            return "TIMESTAMP " + repr(value.strftime(fmt))
-        raise NotImplementedError(f"No translation rule for timestamp {value}")
+        assert isinstance(value, datetime.datetime)
+        # datetime.datetime only supports resolution up to microseconds, even
+        # though Flink supports fractional precision up to 9 digits. We will
+        # need to use numpy or pandas datetime types for higher resolutions.
+        #
+        if dtype.timezone is not None:
+            value = value.astimezone(normalize_timezone("UTC"))
+
+        # remove timezone information without altering the ISO output
+        # except for removing the UTC offset
+        #
+        # format to ISO 8601 without the T character
+        value = value.replace(tzinfo=None).isoformat(sep=" ")
+
+        if (tz := dtype.timezone) is not None:
+            return f"TO_TIMESTAMP(CONVERT_TZ({value!r}, 'UTC', {tz!r}))"
+        else:
+            return f"TIMESTAMP {value!r}"
     elif dtype.is_time():
         return f"TIME '{value}'"
     elif dtype.is_interval():
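A standalone sketch of the timestamp-formatting pipeline introduced above, using the stdlib zoneinfo in place of ibis's normalize_timezone helper (assumed here to resolve a timezone name to a tzinfo):

import datetime
from zoneinfo import ZoneInfo  # stand-in for normalize_timezone("UTC")

aware = datetime.datetime(2023, 12, 19, 12, 0, tzinfo=ZoneInfo("America/New_York"))

# convert to UTC, then drop tzinfo so isoformat() emits no "+00:00" offset
utc_naive = aware.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)

# ISO 8601 with a space in place of the "T" separator, as the Flink literal needs
assert utc_naive.isoformat(sep=" ") == "2023-12-19 17:00:00"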
2 changes: 1 addition & 1 deletion ibis/backends/pandas/tests/execution/test_cast.py
@@ -154,7 +154,7 @@ def test_timestamp_with_timezone_is_inferred_correctly(t, df):
 def test_cast_date(t, df, column):
     expr = t[column].cast("date")
     result = expr.execute()
-    expected = df[column].dt.normalize().dt.tz_localize(None)
+    expected = df[column].dt.normalize().dt.tz_localize(None).dt.date
     tm.assert_series_equal(result, expected)


4 changes: 2 additions & 2 deletions ibis/backends/pandas/tests/execution/test_temporal.py
Expand Up @@ -66,7 +66,7 @@ def test_timestamp_functions(case_func, expected_func):
def test_cast_datetime_strings_to_date(t, df, column):
expr = t[column].cast("date")
result = expr.execute()
expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None)
expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None).dt.date
tm.assert_series_equal(result, expected)


@@ -103,7 +103,7 @@ def test_cast_integer_to_date(t, df):
     expr = t.plain_int64.cast("date")
     result = expr.execute()
     expected = pd.Series(
-        pd.to_datetime(df.plain_int64.values, unit="D").values,
+        pd.to_datetime(df.plain_int64.values, unit="D").date,
         index=df.index,
         name="plain_int64",
     )
2 changes: 1 addition & 1 deletion ibis/backends/polars/__init__.py
@@ -420,7 +420,7 @@ def execute(
         else:
             assert isinstance(expr, ir.Column), type(expr)
             if expr.type().is_temporal():
-                return df.to_pandas().iloc[:, 0]
+                return expr.__pandas_result__(df.to_pandas())
             else:
                 # note: skip frame-construction overhead
                 return df.to_series().to_pandas()
4 changes: 2 additions & 2 deletions ibis/backends/polars/tests/conftest.py
@@ -33,10 +33,10 @@ def connect(*, tmpdir, worker_id, **kw):

     @classmethod
     def assert_series_equal(cls, left, right, *args, **kwargs) -> None:
-        check_dtype = not (
+        check_dtype = kwargs.pop("check_dtype", True) and not (
             issubclass(left.dtype.type, np.timedelta64)
             and issubclass(right.dtype.type, np.timedelta64)
-        ) and kwargs.pop("check_dtype", True)
+        )
         return super().assert_series_equal(
             left, right, *args, **kwargs, check_dtype=check_dtype
         )
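The reordering matters because the old expression short-circuited before popping: when both sides were timedeltas, "check_dtype" was left in kwargs and then collided with the explicit check_dtype= in the super() call. A minimal repro of the two orderings (hypothetical values, not from the diff):

kwargs = {"check_dtype": False}
both_timedelta = True

# old order: short-circuits before the pop, leaving the key in kwargs
check_dtype = (not both_timedelta) and kwargs.pop("check_dtype", True)
assert "check_dtype" in kwargs  # would collide with the explicit keyword argument

kwargs = {"check_dtype": False}
# new order: always pops, and a caller passing check_dtype=False is honored
check_dtype = kwargs.pop("check_dtype", True) and not both_timedelta
assert "check_dtype" not in kwargs and check_dtype is False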
2 changes: 1 addition & 1 deletion ibis/backends/polars/tests/test_udf.py
@@ -46,7 +46,7 @@ def test_multiple_argument_udf(alltypes):
     result = expr.execute()

     df = alltypes[["smallint_col", "int_col"]].execute()
-    expected = (df.smallint_col + df.int_col).astype("int32")
+    expected = df.smallint_col + df.int_col

     tm.assert_series_equal(result, expected.rename("tmp"))

5 changes: 1 addition & 4 deletions ibis/backends/pyspark/__init__.py
@@ -232,10 +232,7 @@ def execute(self, expr: ir.Expr, **kwargs: Any) -> Any:
         table_expr = expr.as_table()
         df = self.compile(table_expr, **kwargs).toPandas()

-        # TODO: remove the extra conversion
-        return expr.__pandas_result__(
-            PySparkPandasData.convert_table(df, table_expr.schema())
-        )
+        return expr.__pandas_result__(df)

     def _fully_qualified_name(self, name, database):
         if is_fully_qualified(name):
6 changes: 2 additions & 4 deletions ibis/backends/sqlite/tests/test_types.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import sqlite3
+from datetime import date

 import pandas as pd
 import pytest
@@ -86,10 +87,7 @@ def test_type_map(db):
     assert t.schema() == expected_schema
     res = t.filter(t.str_col == "a").execute()
     sol = pd.DataFrame(
-        {
-            "str_col": ["a"],
-            "date_col": pd.Series(["2022-01-01"], dtype="M8[ns]"),
-        }
+        {"str_col": ["a"], "date_col": pd.Series([date(2022, 1, 1)], dtype="object")}
     )
     assert res.equals(sol)

2 changes: 1 addition & 1 deletion ibis/backends/tests/test_generic.py
@@ -817,7 +817,7 @@ def test_int_column(alltypes):
     assert result.dtype == np.int8


-@pytest.mark.notimpl(["druid", "oracle", "exasol"])
+@pytest.mark.notimpl(["druid", "oracle"])
 @pytest.mark.never(
     ["bigquery", "sqlite", "snowflake"], reason="backend only implements int64"
 )
5 changes: 0 additions & 5 deletions ibis/backends/tests/test_map.py
@@ -226,11 +226,6 @@ def test_literal_map_get_broadcast(backend, alltypes, df):
         param(["a", "b"], ["1", "2"], id="int"),
     ],
 )
-@pytest.mark.notyet(
-    ["flink"],
-    raises=AssertionError,
-    reason="got list of tuples instead; requires PyFlink compatibility with PyArrow 13",
-)
 def test_map_construct_dict(con, keys, values):
     expr = ibis.map(keys, values)
     result = con.execute(expr.name("tmp"))
5 changes: 4 additions & 1 deletion ibis/backends/tests/test_param.py
@@ -225,7 +225,10 @@ def test_scalar_param_date(backend, alltypes, value):
     )
     df = base.execute()
     expected = (
-        df.loc[df.date_col.dt.normalize() == pd.Timestamp(value).normalize()]
+        df.loc[
+            pd.to_datetime(df.date_col).dt.normalize().dt.date
+            == pd.Timestamp(value).normalize().date()
+        ]
         .sort_values("id")
         .reset_index(drop=True)
         .drop(columns=["date_col"])
(diff truncated: 8 more changed files not shown)
