fix(backends): ensure that returned date results are actually proper date values

BREAKING CHANGE: Columns with Ibis `date` types are now returned as object dtype containing `datetime.date` objects when executing with the pandas backend.
cpcloud authored and gforsyth committed Dec 19, 2023
1 parent 4fc8e75 commit 0626fb2
Showing 23 changed files with 179 additions and 95 deletions.
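A minimal sketch of the new behavior (the backend connection, table name, and data here are illustrative, not part of the commit): executing a date-typed column on the pandas backend now yields an object-dtype Series of datetime.date values instead of datetime64[ns].

import datetime

import ibis
import pandas as pd

con = ibis.pandas.connect(
    {"events": pd.DataFrame({"ts": pd.to_datetime(["2023-12-19"])})}
)
t = con.table("events")

result = t.ts.cast("date").execute()
assert result.dtype == object  # previously datetime64[ns]
assert isinstance(result[0], datetime.date)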
5 changes: 1 addition & 4 deletions ci/make_geography_db.py
@@ -56,10 +56,7 @@
     "independence": lambda row: toolz.assoc(
         row,
         "independence_date",
-        datetime.datetime.strptime(
-            row["independence_date"],
-            "%Y-%m-%d",
-        ).date(),
+        datetime.datetime.fromisoformat(row["independence_date"]).date(),
     )
 }

2 changes: 1 addition & 1 deletion ibis/backends/base/sql/registry/literal.py
@@ -62,7 +62,7 @@ def _interval_literal_format(translator, op):
 def _date_literal_format(translator, op):
     value = op.value
     if isinstance(value, datetime.date):
-        value = value.strftime("%Y-%m-%d")
+        value = value.isoformat()

     return repr(value)
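The two spellings agree for date values; isoformat() is simply the more direct way to produce the ISO 8601 string. A quick check:

import datetime

d = datetime.date(2023, 12, 19)
assert d.isoformat() == d.strftime("%Y-%m-%d") == "2023-12-19"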

14 changes: 5 additions & 9 deletions ibis/backends/clickhouse/compiler/values.py
@@ -346,18 +346,14 @@ def _literal(op, *, value, dtype, **kw):
         return interval(value, unit=dtype.resolution.upper())
     elif dtype.is_timestamp():
-        funcname = "makeDateTime"
+        funcname = "parseDateTime"

         if micros := value.microsecond:
             funcname += "64"

-        args = [
-            value.year,
-            value.month,
-            value.day,
-            value.hour,
-            value.minute,
-            value.second,
-        ]
+        funcname += "BestEffort"
+
+        args = [value.isoformat()]

         if micros % 1000:
             args.append(micros)
2 changes: 1 addition & 1 deletion ibis/backends/flink/registry.py
@@ -66,7 +66,7 @@ def _cast(translator: ExprTranslator, op: ops.generic.Cast) -> str:
             arg_translated = f"FROM_UNIXTIME({arg_translated})"

         if to.timezone:
-            return f"TO_TIMESTAMP(CONVERT_TZ(CAST({arg_translated} AS STRING), 'UTC+0', '{to.timezone}'))"
+            return f"TO_TIMESTAMP(CONVERT_TZ(CAST({arg_translated} AS STRING), 'UTC', {to.timezone!r}))"
         else:
             return f"TO_TIMESTAMP({arg_translated})"
     elif to.is_date():
31 changes: 20 additions & 11 deletions ibis/backends/flink/utils.py
@@ -8,7 +8,7 @@
 import ibis.expr.datatypes as dt
 import ibis.expr.operations as ops
 from ibis.backends.flink.datatypes import FlinkType
-from ibis.common.temporal import IntervalUnit
+from ibis.common.temporal import IntervalUnit, normalize_timezone
 from ibis.util import convert_unit

 # For details on what precisions Flink SQL interval types support, see
@@ -264,7 +264,7 @@ def translate_literal(op: ops.Literal) -> str:
         return f"x'{value.hex()}'"
     elif dtype.is_date():
         if isinstance(value, datetime.date):
-            value = value.strftime("%Y-%m-%d")
+            value = value.isoformat()
         return repr(value)
     elif dtype.is_numeric():
         if math.isnan(value):
@@ -285,15 +285,24 @@ def translate_literal(op: ops.Literal) -> str:
         return f"CAST({value} AS {FlinkType.from_ibis(dtype)!s})"
     elif dtype.is_timestamp():
         # TODO(chloeh13q): support timestamp with local timezone
-        if isinstance(value, datetime.datetime):
-            fmt = "%Y-%m-%d %H:%M:%S"
-            # datetime.datetime only supports resolution up to microseconds, even
-            # though Flink supports fractional precision up to 9 digits. We will
-            # need to use numpy or pandas datetime types for higher resolutions.
-            if value.microsecond:
-                fmt += ".%f"
-            return "TIMESTAMP " + repr(value.strftime(fmt))
-        raise NotImplementedError(f"No translation rule for timestamp {value}")
+        assert isinstance(value, datetime.datetime)
+        # datetime.datetime only supports resolution up to microseconds, even
+        # though Flink supports fractional precision up to 9 digits. We will
+        # need to use numpy or pandas datetime types for higher resolutions.
+        #
+        if dtype.timezone is not None:
+            value = value.astimezone(normalize_timezone("UTC"))
+
+        # remove timezone information without altering the ISO output
+        # except for removing the UTC offset
+        #
+        # format to ISO 8601 without the T character
+        value = value.replace(tzinfo=None).isoformat(sep=" ")
+
+        if (tz := dtype.timezone) is not None:
+            return f"TO_TIMESTAMP(CONVERT_TZ({value!r}, 'UTC', {tz!r}))"
+        else:
+            return f"TIMESTAMP {value!r}"
     elif dtype.is_time():
         return f"TIME '{value}'"
     elif dtype.is_interval():
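A standalone sketch of the timestamp-formatting pipeline introduced above, using the stdlib zoneinfo in place of ibis's normalize_timezone helper (assumed here to resolve a timezone name to a tzinfo):

import datetime
from zoneinfo import ZoneInfo  # stand-in for normalize_timezone("UTC")

aware = datetime.datetime(2023, 12, 19, 12, 0, tzinfo=ZoneInfo("America/New_York"))

# convert to UTC, then drop tzinfo so isoformat() emits no "+00:00" offset
utc_naive = aware.astimezone(ZoneInfo("UTC")).replace(tzinfo=None)

# ISO 8601 with a space in place of the "T" separator, as the Flink literal needs
assert utc_naive.isoformat(sep=" ") == "2023-12-19 17:00:00"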
2 changes: 1 addition & 1 deletion ibis/backends/pandas/tests/execution/test_cast.py
@@ -154,7 +154,7 @@ def test_timestamp_with_timezone_is_inferred_correctly(t, df):
 def test_cast_date(t, df, column):
     expr = t[column].cast("date")
     result = expr.execute()
-    expected = df[column].dt.normalize().dt.tz_localize(None)
+    expected = df[column].dt.normalize().dt.tz_localize(None).dt.date
     tm.assert_series_equal(result, expected)


4 changes: 2 additions & 2 deletions ibis/backends/pandas/tests/execution/test_temporal.py
Expand Up @@ -66,7 +66,7 @@ def test_timestamp_functions(case_func, expected_func):
def test_cast_datetime_strings_to_date(t, df, column):
expr = t[column].cast("date")
result = expr.execute()
expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None)
expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None).dt.date
tm.assert_series_equal(result, expected)


@@ -103,7 +103,7 @@ def test_cast_integer_to_date(t, df):
     expr = t.plain_int64.cast("date")
     result = expr.execute()
     expected = pd.Series(
-        pd.to_datetime(df.plain_int64.values, unit="D").values,
+        pd.to_datetime(df.plain_int64.values, unit="D").date,
         index=df.index,
         name="plain_int64",
     )
2 changes: 1 addition & 1 deletion ibis/backends/polars/__init__.py
@@ -420,7 +420,7 @@ def execute(
         else:
             assert isinstance(expr, ir.Column), type(expr)
             if expr.type().is_temporal():
-                return df.to_pandas().iloc[:, 0]
+                return expr.__pandas_result__(df.to_pandas())
             else:
                 # note: skip frame-construction overhead
                 return df.to_series().to_pandas()
4 changes: 2 additions & 2 deletions ibis/backends/polars/tests/conftest.py
@@ -33,10 +33,10 @@ def connect(*, tmpdir, worker_id, **kw):

     @classmethod
     def assert_series_equal(cls, left, right, *args, **kwargs) -> None:
-        check_dtype = not (
+        check_dtype = kwargs.pop("check_dtype", True) and not (
             issubclass(left.dtype.type, np.timedelta64)
             and issubclass(right.dtype.type, np.timedelta64)
-        ) and kwargs.pop("check_dtype", True)
+        )
         return super().assert_series_equal(
             left, right, *args, **kwargs, check_dtype=check_dtype
         )
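The reordering matters because the old expression short-circuited before popping: when both sides were timedeltas, "check_dtype" was left in kwargs and then collided with the explicit check_dtype= in the super() call. A minimal repro of the two orderings (hypothetical values, not from the diff):

kwargs = {"check_dtype": False}
both_timedelta = True

# old order: short-circuits before the pop, leaving the key in kwargs
check_dtype = (not both_timedelta) and kwargs.pop("check_dtype", True)
assert "check_dtype" in kwargs  # would collide with the explicit keyword argument

kwargs = {"check_dtype": False}
# new order: always pops, and a caller passing check_dtype=False is honored
check_dtype = kwargs.pop("check_dtype", True) and not both_timedelta
assert "check_dtype" not in kwargs and check_dtype is False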
2 changes: 1 addition & 1 deletion ibis/backends/polars/tests/test_udf.py
@@ -46,7 +46,7 @@ def test_multiple_argument_udf(alltypes):
     result = expr.execute()

     df = alltypes[["smallint_col", "int_col"]].execute()
-    expected = (df.smallint_col + df.int_col).astype("int32")
+    expected = df.smallint_col + df.int_col

     tm.assert_series_equal(result, expected.rename("tmp"))

5 changes: 1 addition & 4 deletions ibis/backends/pyspark/__init__.py
@@ -232,10 +232,7 @@ def execute(self, expr: ir.Expr, **kwargs: Any) -> Any:
         table_expr = expr.as_table()
         df = self.compile(table_expr, **kwargs).toPandas()

-        # TODO: remove the extra conversion
-        return expr.__pandas_result__(
-            PySparkPandasData.convert_table(df, table_expr.schema())
-        )
+        return expr.__pandas_result__(df)

     def _fully_qualified_name(self, name, database):
         if is_fully_qualified(name):
6 changes: 2 additions & 4 deletions ibis/backends/sqlite/tests/test_types.py
@@ -1,6 +1,7 @@
 from __future__ import annotations

 import sqlite3
+from datetime import date

 import pandas as pd
 import pytest
@@ -86,10 +87,7 @@ def test_type_map(db):
     assert t.schema() == expected_schema
     res = t.filter(t.str_col == "a").execute()
     sol = pd.DataFrame(
-        {
-            "str_col": ["a"],
-            "date_col": pd.Series(["2022-01-01"], dtype="M8[ns]"),
-        }
+        {"str_col": ["a"], "date_col": pd.Series([date(2022, 1, 1)], dtype="object")}
     )
     assert res.equals(sol)

2 changes: 1 addition & 1 deletion ibis/backends/tests/test_generic.py
@@ -817,7 +817,7 @@ def test_int_column(alltypes):
     assert result.dtype == np.int8


-@pytest.mark.notimpl(["druid", "oracle", "exasol"])
+@pytest.mark.notimpl(["druid", "oracle"])
 @pytest.mark.never(
     ["bigquery", "sqlite", "snowflake"], reason="backend only implements int64"
 )
5 changes: 0 additions & 5 deletions ibis/backends/tests/test_map.py
@@ -226,11 +226,6 @@ def test_literal_map_get_broadcast(backend, alltypes, df):
         param(["a", "b"], ["1", "2"], id="int"),
     ],
 )
-@pytest.mark.notyet(
-    ["flink"],
-    raises=AssertionError,
-    reason="got list of tuples instead; requires PyFlink compatibility with PyArrow 13",
-)
 def test_map_construct_dict(con, keys, values):
     expr = ibis.map(keys, values)
     result = con.execute(expr.name("tmp"))
5 changes: 4 additions & 1 deletion ibis/backends/tests/test_param.py
@@ -225,7 +225,10 @@ def test_scalar_param_date(backend, alltypes, value):
     )
     df = base.execute()
     expected = (
-        df.loc[df.date_col.dt.normalize() == pd.Timestamp(value).normalize()]
+        df.loc[
+            pd.to_datetime(df.date_col).dt.normalize().dt.date
+            == pd.Timestamp(value).normalize().date()
+        ]
         .sort_values("id")
         .reset_index(drop=True)
         .drop(columns=["date_col"])
(diff truncated: 8 more changed files not shown)
