From 0626fb20966fe445e4bd4c4c579343b5f865e586 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 19 Nov 2023 10:25:10 -0500 Subject: [PATCH] fix(backends): ensure that returned date results are actually proper date values BREAKING CHANGE: Columns with Ibis `date` types are now returned as object dtype containing `datetime.date` objects when executing with the pandas backend. --- ci/make_geography_db.py | 5 +- ibis/backends/base/sql/registry/literal.py | 2 +- ibis/backends/clickhouse/compiler/values.py | 14 +- ibis/backends/flink/registry.py | 2 +- ibis/backends/flink/utils.py | 31 ++-- .../pandas/tests/execution/test_cast.py | 2 +- .../pandas/tests/execution/test_temporal.py | 4 +- ibis/backends/polars/__init__.py | 2 +- ibis/backends/polars/tests/conftest.py | 4 +- ibis/backends/polars/tests/test_udf.py | 2 +- ibis/backends/pyspark/__init__.py | 5 +- ibis/backends/sqlite/tests/test_types.py | 6 +- ibis/backends/tests/test_generic.py | 2 +- ibis/backends/tests/test_map.py | 5 - ibis/backends/tests/test_param.py | 5 +- ibis/backends/tests/test_temporal.py | 132 +++++++++++++----- ibis/backends/tests/test_uuid.py | 2 +- ibis/expr/api.py | 10 +- ibis/expr/types/generic.py | 4 +- ibis/formats/numpy.py | 3 +- ibis/formats/pandas.py | 21 ++- ibis/formats/tests/test_numpy.py | 9 +- ibis/formats/tests/test_pandas.py | 2 +- 23 files changed, 179 insertions(+), 95 deletions(-) diff --git a/ci/make_geography_db.py b/ci/make_geography_db.py index 6c7a3bdba4ca..e98551b143a8 100755 --- a/ci/make_geography_db.py +++ b/ci/make_geography_db.py @@ -56,10 +56,7 @@ "independence": lambda row: toolz.assoc( row, "independence_date", - datetime.datetime.strptime( - row["independence_date"], - "%Y-%m-%d", - ).date(), + datetime.datetime.fromisoformat(row["independence_date"]).date(), ) } diff --git a/ibis/backends/base/sql/registry/literal.py b/ibis/backends/base/sql/registry/literal.py index 10bad209cdf2..b31aec6fac45 100644 --- a/ibis/backends/base/sql/registry/literal.py +++ b/ibis/backends/base/sql/registry/literal.py @@ -62,7 +62,7 @@ def _interval_literal_format(translator, op): def _date_literal_format(translator, op): value = op.value if isinstance(value, datetime.date): - value = value.strftime("%Y-%m-%d") + value = value.isoformat() return repr(value) diff --git a/ibis/backends/clickhouse/compiler/values.py b/ibis/backends/clickhouse/compiler/values.py index bc10a2e82266..ea0d77cf1d39 100644 --- a/ibis/backends/clickhouse/compiler/values.py +++ b/ibis/backends/clickhouse/compiler/values.py @@ -346,18 +346,14 @@ def _literal(op, *, value, dtype, **kw): return interval(value, unit=dtype.resolution.upper()) elif dtype.is_timestamp(): - funcname = "makeDateTime" + funcname = "parseDateTime" + if micros := value.microsecond: funcname += "64" - args = [ - value.year, - value.month, - value.day, - value.hour, - value.minute, - value.second, - ] + funcname += "BestEffort" + + args = [value.isoformat()] if micros % 1000: args.append(micros) diff --git a/ibis/backends/flink/registry.py b/ibis/backends/flink/registry.py index 97fca3ff65c0..aa15829d5509 100644 --- a/ibis/backends/flink/registry.py +++ b/ibis/backends/flink/registry.py @@ -66,7 +66,7 @@ def _cast(translator: ExprTranslator, op: ops.generic.Cast) -> str: arg_translated = f"FROM_UNIXTIME({arg_translated})" if to.timezone: - return f"TO_TIMESTAMP(CONVERT_TZ(CAST({arg_translated} AS STRING), 'UTC+0', '{to.timezone}'))" + return f"TO_TIMESTAMP(CONVERT_TZ(CAST({arg_translated} AS STRING), 'UTC', 
{to.timezone!r}))" else: return f"TO_TIMESTAMP({arg_translated})" elif to.is_date(): diff --git a/ibis/backends/flink/utils.py b/ibis/backends/flink/utils.py index c27bdbf6737e..6f828efbbeaa 100644 --- a/ibis/backends/flink/utils.py +++ b/ibis/backends/flink/utils.py @@ -8,7 +8,7 @@ import ibis.expr.datatypes as dt import ibis.expr.operations as ops from ibis.backends.flink.datatypes import FlinkType -from ibis.common.temporal import IntervalUnit +from ibis.common.temporal import IntervalUnit, normalize_timezone from ibis.util import convert_unit # For details on what precisions Flink SQL interval types support, see @@ -264,7 +264,7 @@ def translate_literal(op: ops.Literal) -> str: return f"x'{value.hex()}'" elif dtype.is_date(): if isinstance(value, datetime.date): - value = value.strftime("%Y-%m-%d") + value = value.isoformat() return repr(value) elif dtype.is_numeric(): if math.isnan(value): @@ -285,15 +285,24 @@ def translate_literal(op: ops.Literal) -> str: return f"CAST({value} AS {FlinkType.from_ibis(dtype)!s})" elif dtype.is_timestamp(): # TODO(chloeh13q): support timestamp with local timezone - if isinstance(value, datetime.datetime): - fmt = "%Y-%m-%d %H:%M:%S" - # datetime.datetime only supports resolution up to microseconds, even - # though Flink supports fractional precision up to 9 digits. We will - # need to use numpy or pandas datetime types for higher resolutions. - if value.microsecond: - fmt += ".%f" - return "TIMESTAMP " + repr(value.strftime(fmt)) - raise NotImplementedError(f"No translation rule for timestamp {value}") + assert isinstance(value, datetime.datetime) + # datetime.datetime only supports resolution up to microseconds, even + # though Flink supports fractional precision up to 9 digits. We will + # need to use numpy or pandas datetime types for higher resolutions. 
+ # + if dtype.timezone is not None: + value = value.astimezone(normalize_timezone("UTC")) + + # remove timezone information without altering the ISO output + # except for removing the UTC offset + # + # format to ISO 8601 without the T character + value = value.replace(tzinfo=None).isoformat(sep=" ") + + if (tz := dtype.timezone) is not None: + return f"TO_TIMESTAMP(CONVERT_TZ({value!r}, 'UTC', {tz!r}))" + else: + return f"TIMESTAMP {value!r}" elif dtype.is_time(): return f"TIME '{value}'" elif dtype.is_interval(): diff --git a/ibis/backends/pandas/tests/execution/test_cast.py b/ibis/backends/pandas/tests/execution/test_cast.py index 36fb67869a69..bc2d8a60f974 100644 --- a/ibis/backends/pandas/tests/execution/test_cast.py +++ b/ibis/backends/pandas/tests/execution/test_cast.py @@ -154,7 +154,7 @@ def test_timestamp_with_timezone_is_inferred_correctly(t, df): def test_cast_date(t, df, column): expr = t[column].cast("date") result = expr.execute() - expected = df[column].dt.normalize().dt.tz_localize(None) + expected = df[column].dt.normalize().dt.tz_localize(None).dt.date tm.assert_series_equal(result, expected) diff --git a/ibis/backends/pandas/tests/execution/test_temporal.py b/ibis/backends/pandas/tests/execution/test_temporal.py index 9a3a5f327d5a..cd9a1e98384b 100644 --- a/ibis/backends/pandas/tests/execution/test_temporal.py +++ b/ibis/backends/pandas/tests/execution/test_temporal.py @@ -66,7 +66,7 @@ def test_timestamp_functions(case_func, expected_func): def test_cast_datetime_strings_to_date(t, df, column): expr = t[column].cast("date") result = expr.execute() - expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None) + expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None).dt.date tm.assert_series_equal(result, expected) @@ -103,7 +103,7 @@ def test_cast_integer_to_date(t, df): expr = t.plain_int64.cast("date") result = expr.execute() expected = pd.Series( - pd.to_datetime(df.plain_int64.values, unit="D").values, + pd.to_datetime(df.plain_int64.values, unit="D").date, index=df.index, name="plain_int64", ) diff --git a/ibis/backends/polars/__init__.py b/ibis/backends/polars/__init__.py index 091bd3c29ad7..15b2fcca6120 100644 --- a/ibis/backends/polars/__init__.py +++ b/ibis/backends/polars/__init__.py @@ -420,7 +420,7 @@ def execute( else: assert isinstance(expr, ir.Column), type(expr) if expr.type().is_temporal(): - return df.to_pandas().iloc[:, 0] + return expr.__pandas_result__(df.to_pandas()) else: # note: skip frame-construction overhead return df.to_series().to_pandas() diff --git a/ibis/backends/polars/tests/conftest.py b/ibis/backends/polars/tests/conftest.py index a337ea16fbac..68ad32342853 100644 --- a/ibis/backends/polars/tests/conftest.py +++ b/ibis/backends/polars/tests/conftest.py @@ -33,10 +33,10 @@ def connect(*, tmpdir, worker_id, **kw): @classmethod def assert_series_equal(cls, left, right, *args, **kwargs) -> None: - check_dtype = not ( + check_dtype = kwargs.pop("check_dtype", True) and not ( issubclass(left.dtype.type, np.timedelta64) and issubclass(right.dtype.type, np.timedelta64) - ) and kwargs.pop("check_dtype", True) + ) return super().assert_series_equal( left, right, *args, **kwargs, check_dtype=check_dtype ) diff --git a/ibis/backends/polars/tests/test_udf.py b/ibis/backends/polars/tests/test_udf.py index 9d9aa696ff78..49580b8b28fe 100644 --- a/ibis/backends/polars/tests/test_udf.py +++ b/ibis/backends/polars/tests/test_udf.py @@ -46,7 +46,7 @@ def test_multiple_argument_udf(alltypes): result = expr.execute() df = 
alltypes[["smallint_col", "int_col"]].execute() - expected = (df.smallint_col + df.int_col).astype("int32") + expected = df.smallint_col + df.int_col tm.assert_series_equal(result, expected.rename("tmp")) diff --git a/ibis/backends/pyspark/__init__.py b/ibis/backends/pyspark/__init__.py index 3f00b64675b8..755ddbe3d653 100644 --- a/ibis/backends/pyspark/__init__.py +++ b/ibis/backends/pyspark/__init__.py @@ -232,10 +232,7 @@ def execute(self, expr: ir.Expr, **kwargs: Any) -> Any: table_expr = expr.as_table() df = self.compile(table_expr, **kwargs).toPandas() - # TODO: remove the extra conversion - return expr.__pandas_result__( - PySparkPandasData.convert_table(df, table_expr.schema()) - ) + return expr.__pandas_result__(df) def _fully_qualified_name(self, name, database): if is_fully_qualified(name): diff --git a/ibis/backends/sqlite/tests/test_types.py b/ibis/backends/sqlite/tests/test_types.py index 8d2fbec6a834..dc2978332c38 100644 --- a/ibis/backends/sqlite/tests/test_types.py +++ b/ibis/backends/sqlite/tests/test_types.py @@ -1,6 +1,7 @@ from __future__ import annotations import sqlite3 +from datetime import date import pandas as pd import pytest @@ -86,10 +87,7 @@ def test_type_map(db): assert t.schema() == expected_schema res = t.filter(t.str_col == "a").execute() sol = pd.DataFrame( - { - "str_col": ["a"], - "date_col": pd.Series(["2022-01-01"], dtype="M8[ns]"), - } + {"str_col": ["a"], "date_col": pd.Series([date(2022, 1, 1)], dtype="object")} ) assert res.equals(sol) diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 5c2b40652128..9423ff2a7db7 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -817,7 +817,7 @@ def test_int_column(alltypes): assert result.dtype == np.int8 -@pytest.mark.notimpl(["druid", "oracle", "exasol"]) +@pytest.mark.notimpl(["druid", "oracle"]) @pytest.mark.never( ["bigquery", "sqlite", "snowflake"], reason="backend only implements int64" ) diff --git a/ibis/backends/tests/test_map.py b/ibis/backends/tests/test_map.py index c25ce14d8cfc..19ec3e71fa21 100644 --- a/ibis/backends/tests/test_map.py +++ b/ibis/backends/tests/test_map.py @@ -226,11 +226,6 @@ def test_literal_map_get_broadcast(backend, alltypes, df): param(["a", "b"], ["1", "2"], id="int"), ], ) -@pytest.mark.notyet( - ["flink"], - raises=AssertionError, - reason="got list of tuples instead; requires PyFlink compatibility with PyArrow 13", -) def test_map_construct_dict(con, keys, values): expr = ibis.map(keys, values) result = con.execute(expr.name("tmp")) diff --git a/ibis/backends/tests/test_param.py b/ibis/backends/tests/test_param.py index 61b6c7ea17e7..7a15de8e0cff 100644 --- a/ibis/backends/tests/test_param.py +++ b/ibis/backends/tests/test_param.py @@ -225,7 +225,10 @@ def test_scalar_param_date(backend, alltypes, value): ) df = base.execute() expected = ( - df.loc[df.date_col.dt.normalize() == pd.Timestamp(value).normalize()] + df.loc[ + pd.to_datetime(df.date_col).dt.normalize().dt.date + == pd.Timestamp(value).normalize().date() + ] .sort_values("id") .reset_index(drop=True) .drop(columns=["date_col"]) diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py index f4fd52582b6c..4eed80ce057d 100644 --- a/ibis/backends/tests/test_temporal.py +++ b/ibis/backends/tests/test_temporal.py @@ -43,11 +43,14 @@ ArrowInvalid = None try: + from clickhouse_connect.driver.exceptions import ( + DatabaseError as ClickhouseDatabaseError, + ) from clickhouse_connect.driver.exceptions import 
( InternalError as ClickhouseOperationalError, ) except ImportError: - ClickhouseOperationalError = None + ClickhouseOperationalError = ClickhouseDatabaseError = None try: from impala.error import ( @@ -64,12 +67,16 @@ except ImportError: Py4JJavaError = None - try: from pyexasol.exceptions import ExaQueryError except ImportError: ExaQueryError = None +try: + from pyspark.sql.utils import IllegalArgumentException +except ImportError: + IllegalArgumentException = None + @pytest.mark.parametrize("attr", ["year", "month", "day"]) @pytest.mark.parametrize( @@ -707,9 +714,9 @@ def test_date_truncate(backend, alltypes, df, unit): unit = PANDAS_UNITS.get(unit, unit) try: - expected = df.timestamp_col.dt.floor(unit) + expected = df.timestamp_col.dt.floor(unit).dt.date except ValueError: - expected = df.timestamp_col.dt.to_period(unit).dt.to_timestamp() + expected = df.timestamp_col.dt.to_period(unit).dt.to_timestamp().dt.date result = expr.execute() expected = backend.default_series_rename(expected) @@ -1017,10 +1024,14 @@ def convert_to_offset(x): warnings.simplefilter( "ignore", category=(UserWarning, pd.errors.PerformanceWarning) ) - expected = pd.to_datetime(df.date_string_col) + offset + expected = ( + pd.to_datetime(df.date_string_col) + .add(offset) + .map(lambda ts: ts.normalize().date(), na_action="ignore") + ) expected = backend.default_series_rename(expected) - backend.assert_series_equal(result, expected.map(lambda ts: ts.normalize())) + backend.assert_series_equal(result, expected) date_value = pd.Timestamp("2017-12-31") @@ -1119,7 +1130,12 @@ def convert_to_offset(x): ), param( lambda t, _: t.timestamp_col.date() + ibis.interval(days=4), - lambda t, _: t.timestamp_col.dt.floor("d") + pd.Timedelta(days=4), + lambda t, _: ( + t.timestamp_col.dt.floor("d") + .add(pd.Timedelta(days=4)) + .dt.normalize() + .dt.date + ), id="date-add-interval", marks=[ pytest.mark.notimpl( @@ -1131,7 +1147,12 @@ def convert_to_offset(x): ), param( lambda t, _: t.timestamp_col.date() - ibis.interval(days=14), - lambda t, _: t.timestamp_col.dt.floor("d") - pd.Timedelta(days=14), + lambda t, _: ( + t.timestamp_col.dt.floor("d") + .sub(pd.Timedelta(days=14)) + .dt.normalize() + .dt.date + ), id="date-subtract-interval", marks=[ pytest.mark.notimpl( @@ -1229,7 +1250,9 @@ def test_temporal_binop(backend, con, alltypes, df, expr_fn, expected_fn): result = con.execute(expr) expected = backend.default_series_rename(expected) - backend.assert_series_equal(result, expected.astype(result.dtype)) + backend.assert_series_equal( + result, expected.astype(result.dtype), check_dtype=False + ) plus = lambda t, td: t.timestamp_col + pd.Timedelta(td) @@ -1624,6 +1647,7 @@ def test_interval_add_cast_column(backend, alltypes, df): .dt.normalize() .add(df.bigint_col.astype("timedelta64[D]")) .rename("tmp") + .dt.date ) backend.assert_series_equal(result, expected.astype(result.dtype)) @@ -2200,11 +2224,6 @@ def test_timestamp_literal(con, backend): ", , , )" ), ) -@pytest.mark.notimpl( - ["flink"], - "https://github.com/ibis-project/ibis/pull/6920/files#r1372453059", - raises=AssertionError, -) @pytest.mark.notimpl(["exasol"], raises=ExaQueryError) def test_timestamp_with_timezone_literal(con, timezone, expected): expr = ibis.timestamp(2022, 2, 4, 16, 20, 0).cast(dt.Timestamp(timezone=timezone)) @@ -2511,7 +2530,7 @@ def test_date_column_from_iso(backend, con, alltypes, df): result = con.execute(expr.name("tmp")) golden = df.year.astype(str) + "-" + df.month.astype(str).str.rjust(2, "0") + "-13" - actual = 
result.dt.strftime("%Y-%m-%d") + actual = result.map(datetime.date.isoformat) backend.assert_series_equal(golden.rename("tmp"), actual.rename("tmp")) @@ -2976,7 +2995,7 @@ def test_timestamp_bucket_offset(backend, offset_mins): backend.assert_series_equal(res, sol) -_NO_SQLGLOT_DIALECT = {"pandas", "dask", "druid", "flink", "datafusion", "polars"} +_NO_SQLGLOT_DIALECT = ("pandas", "dask", "druid", "flink", "datafusion", "polars") no_sqlglot_dialect = sorted( param(backend, marks=pytest.mark.xfail) for backend in _NO_SQLGLOT_DIALECT ) @@ -3002,6 +3021,11 @@ def test_temporal_literal_sql(value, dialect, snapshot): snapshot.assert_match(sql, "out.sql") +no_time_type = pytest.mark.xfail( + raises=NotImplementedError, reason="no time type support" +) + + @pytest.mark.parametrize( "dialect", [ @@ -3011,24 +3035,9 @@ def test_temporal_literal_sql(value, dialect, snapshot): ), *no_sqlglot_dialect, *[ - param( - "impala", - marks=pytest.mark.xfail( - raises=NotImplementedError, reason="no time type support" - ), - ), - param( - "clickhouse", - marks=pytest.mark.xfail( - raises=NotImplementedError, reason="no time type support" - ), - ), - param( - "oracle", - marks=pytest.mark.xfail( - raises=NotImplementedError, reason="no time type support" - ), - ), + param("impala", marks=no_time_type), + param("clickhouse", marks=no_time_type), + param("oracle", marks=no_time_type), ], ], ) @@ -3038,3 +3047,58 @@ def test_time_literal_sql(dialect, snapshot, micros): expr = ibis.literal(value) sql = ibis.to_sql(expr, dialect=dialect) snapshot.assert_match(sql, "out.sql") + + +@pytest.mark.notimpl(["druid"], raises=sa.exc.CompileError, reason="no date support") +@pytest.mark.parametrize( + "value", + [ + param("2017-12-31", id="simple"), + param( + "9999-01-02", + marks=[ + pytest.mark.broken( + ["clickhouse"], + raises=AssertionError, + reason="clickhouse doesn't support dates after 2149-06-06", + ), + pytest.mark.notyet(["datafusion"], raises=Exception), + ], + id="large", + ), + param( + "0001-07-17", + id="small", + marks=[ + pytest.mark.broken( + ["clickhouse"], + raises=AssertionError, + reason="clickhouse doesn't support dates before the UNIX epoch", + ), + pytest.mark.notyet(["datafusion"], raises=Exception), + pytest.mark.notyet(["pyspark"], raises=IllegalArgumentException), + ], + ), + param( + "2150-01-01", + marks=pytest.mark.broken(["clickhouse"], raises=AssertionError), + id="medium", + ), + ], +) +@pytest.mark.parametrize( + "func", + [ + param(lambda x: x, id="identity"), + param(datetime.date.fromisoformat, id="fromstring"), + ], +) +def test_date_scalar(con, value, func): + expr = ibis.date(func(value)).name("tmp") + + result = con.execute(expr) + + assert not isinstance(result, datetime.datetime) + assert isinstance(result, datetime.date) + + assert result == datetime.date.fromisoformat(value) diff --git a/ibis/backends/tests/test_uuid.py b/ibis/backends/tests/test_uuid.py index 8a7ad0695bcb..eac109b68a89 100644 --- a/ibis/backends/tests/test_uuid.py +++ b/ibis/backends/tests/test_uuid.py @@ -39,7 +39,7 @@ "mssql": TEST_UUID, "dask": TEST_UUID, "oracle": TEST_UUID, - "flink": RAW_TEST_UUID, + "flink": TEST_UUID, "exasol": TEST_UUID, } diff --git a/ibis/expr/api.py b/ibis/expr/api.py index 17463734e94c..b0e9ba4704fd 100644 --- a/ibis/expr/api.py +++ b/ibis/expr/api.py @@ -802,19 +802,19 @@ def date(value_or_year, month=None, day=None, /): Create a date scalar from a string >>> ibis.date("2023-01-02") - Timestamp('2023-01-02 00:00:00') + datetime.date(2023, 1, 2) Create a date scalar from year, 
month, and day >>> ibis.date(2023, 1, 2) - Timestamp('2023-01-02 00:00:00') + datetime.date(2023, 1, 2) Create a date column from year, month, and day - >>> t = ibis.memtable({"y": [2001, 2002], "m": [1, 3], "d": [2, 4]}) - >>> ibis.date(t.y, t.m, t.d).name("date") + >>> t = ibis.memtable(dict(year=[2001, 2002], month=[1, 3], day=[2, 4])) + >>> ibis.date(t.year, t.month, t.day).name("my_date") ┏━━━━━━━━━━━━┓ - ┃ date ┃ + ┃ my_date ┃ ┡━━━━━━━━━━━━┩ │ date │ ├────────────┤ diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py index 8f187b507289..4959141de202 100644 --- a/ibis/expr/types/generic.py +++ b/ibis/expr/types/generic.py @@ -1249,7 +1249,9 @@ def __pyarrow_result__( return data_mapper.convert_scalar(table[0][0], self.type()) def __pandas_result__(self, df: pd.DataFrame) -> Any: - return df.iat[0, 0] + from ibis.formats.pandas import PandasData + + return PandasData.convert_scalar(df, self.type()) def as_table(self) -> ir.Table: """Promote the scalar expression to a table. diff --git a/ibis/formats/numpy.py b/ibis/formats/numpy.py index 8bdc0c3c2559..dd2b96ba8917 100644 --- a/ibis/formats/numpy.py +++ b/ibis/formats/numpy.py @@ -68,8 +68,7 @@ def from_ibis(cls, dtype: dt.DataType) -> np.dtype: # return np.dtype(f"datetime64[{dtype.unit.short}]") return np.dtype("datetime64[ns]") elif dtype.is_date(): - # return np.dtype("datetime64[D]") - return np.dtype("datetime64[ns]") + return np.dtype("datetime64[D]") elif dtype.is_time(): return np.dtype("timedelta64[ns]") elif ( diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py index 271b6c79da3e..e19190c8489e 100644 --- a/ibis/formats/pandas.py +++ b/ibis/formats/pandas.py @@ -1,6 +1,8 @@ from __future__ import annotations import contextlib +import datetime +import json import warnings import numpy as np @@ -133,6 +135,11 @@ def convert_column(cls, obj, dtype): assert not isinstance(result, np.ndarray), f"{convert_method} -> {type(result)}" return result + @classmethod + def convert_scalar(cls, obj, dtype): + df = PandasData.convert_table(obj, sch.Schema({obj.columns[0]: dtype})) + return df.iat[0, 0] + @classmethod def convert_GeoSpatial(cls, s, dtype, pandas_type): return s @@ -189,7 +196,19 @@ def convert_Timestamp(cls, s, dtype, pandas_type): def convert_Date(cls, s, dtype, pandas_type): if isinstance(s.dtype, pd.DatetimeTZDtype): s = s.dt.tz_convert("UTC").dt.tz_localize(None) - return s.astype(pandas_type, errors="ignore").dt.normalize() + try: + return s.astype(pandas_type).dt.date + except (TypeError, pd._libs.tslibs.OutOfBoundsDatetime): + + def try_date(v): + if isinstance(v, datetime.datetime): + return v.date() + elif isinstance(v, str): + return datetime.date.fromisoformat(v) + else: + return v + + return s.map(try_date, na_action="ignore") @classmethod def convert_Interval(cls, s, dtype, pandas_type): diff --git a/ibis/formats/tests/test_numpy.py b/ibis/formats/tests/test_numpy.py index af994b273094..be36cb22cb78 100644 --- a/ibis/formats/tests/test_numpy.py +++ b/ibis/formats/tests/test_numpy.py @@ -80,11 +80,16 @@ def test_variadic_to_numpy(ibis_type): assert NumpyType.from_ibis(ibis_type) == np.dtype("object") -@h.given(ibst.date_dtype() | ibst.timestamp_dtype()) -def test_date_to_numpy(ibis_type): +@h.given(ibst.timestamp_dtype()) +def test_timestamp_to_numpy(ibis_type): assert NumpyType.from_ibis(ibis_type) == np.dtype("datetime64[ns]") +@h.given(ibst.date_dtype()) +def test_date_to_numpy(ibis_type): + assert NumpyType.from_ibis(ibis_type) == np.dtype("datetime64[D]") + + 
@h.given(ibst.time_dtype()) def test_time_to_numpy(ibis_type): assert NumpyType.from_ibis(ibis_type) == np.dtype("timedelta64[ns]") diff --git a/ibis/formats/tests/test_pandas.py b/ibis/formats/tests/test_pandas.py index dce69fc5c66d..5bba865f0be5 100644 --- a/ibis/formats/tests/test_pandas.py +++ b/ibis/formats/tests/test_pandas.py @@ -32,7 +32,7 @@ (dt.float32, np.dtype("float32")), (dt.float64, np.dtype("float64")), (dt.boolean, np.dtype("bool")), - (dt.date, np.dtype("datetime64[ns]")), + (dt.date, np.dtype("datetime64[D]")), (dt.time, np.dtype("timedelta64[ns]")), (dt.timestamp, np.dtype("datetime64[ns]")), (dt.Interval("s"), np.dtype("timedelta64[s]")),
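
Reviewer notes (appended below the patch; all snippets are illustrative
sketches, not part of the diff):

1) ci/make_geography_db.py and the SQL literal registries replace
   strftime("%Y-%m-%d")/strptime(..., "%Y-%m-%d") with isoformat() and
   fromisoformat(). For ISO 8601 dates the two are interchangeable, which
   is what makes this a pure cleanup:

    import datetime

    s = "1990-03-21"
    assert (
        datetime.datetime.strptime(s, "%Y-%m-%d").date()
        == datetime.datetime.fromisoformat(s).date()
        == datetime.date.fromisoformat(s)
    )
    # and isoformat() is the exact inverse
    assert datetime.date.fromisoformat(s).isoformat() == s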
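
2) The ClickHouse literal change stops assembling makeDateTime(...) from
   individual date/time components and instead feeds the ISO string to
   ClickHouse's parseDateTime*BestEffort family, picking the 64-bit variant
   when sub-second precision is present. The name assembly, traced with the
   stdlib only (the sample value is made up):

    import datetime

    value = datetime.datetime(2023, 1, 2, 3, 4, 5, microsecond=123456)
    funcname = "parseDateTime"
    if value.microsecond:
        funcname += "64"  # 64-bit variant keeps fractional seconds
    funcname += "BestEffort"
    assert funcname == "parseDateTime64BestEffort"
    assert value.isoformat() == "2023-01-02T03:04:05.123456"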
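
3) The Flink timestamp literal path normalizes timezone-aware values to
   UTC, strips the tzinfo so isoformat() emits no offset, and lets
   CONVERT_TZ re-apply the zone in SQL. The same normalization with only
   the stdlib (the patch uses ibis's normalize_timezone("UTC");
   datetime.timezone.utc is assumed equivalent here, and the offset below
   is made up):

    import datetime

    est = datetime.timezone(datetime.timedelta(hours=-5))
    value = datetime.datetime(2023, 1, 2, 3, 4, 5, tzinfo=est)

    # to UTC, then drop tzinfo so no "+00:00" suffix is emitted
    utc_naive = value.astimezone(datetime.timezone.utc).replace(tzinfo=None)
    literal = utc_naive.isoformat(sep=" ")  # ISO 8601 without the T
    assert literal == "2023-01-02 08:04:05"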
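
4) The user-visible contract behind the BREAKING CHANGE note: date columns
   now come back as object-dtype Series holding datetime.date values rather
   than normalized datetime64[ns] timestamps. In plain pandas terms:

    import datetime
    import pandas as pd

    old = pd.Series(pd.to_datetime(["2023-01-02", "2023-03-04"]))
    assert old.dtype == "datetime64[ns]"  # previous result shape

    new = old.dt.date  # the shape the pandas backend now returns
    assert new.dtype == object
    assert all(isinstance(v, datetime.date) for v in new)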
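
5) convert_Date grows a slow path because datetime64[ns] only spans roughly
   1677-2262, while the new "large"/"small" test dates fall outside that
   range. A standalone sketch mirroring (not importing) the patched helper,
   using the public pd.errors alias of the out-of-bounds exception:

    import datetime
    import pandas as pd

    def to_dates(s: pd.Series) -> pd.Series:
        try:
            # fast path: vectorized cast, then take the date component
            return s.astype("datetime64[ns]").dt.date
        except (TypeError, pd.errors.OutOfBoundsDatetime):
            # element-wise fallback for values like "9999-01-02"
            def try_date(v):
                if isinstance(v, datetime.datetime):
                    return v.date()
                elif isinstance(v, str):
                    return datetime.date.fromisoformat(v)
                return v

            return s.map(try_date, na_action="ignore")

    assert to_dates(pd.Series(["2023-01-02"]))[0] == datetime.date(2023, 1, 2)
    assert to_dates(pd.Series(["9999-01-02"]))[0] == datetime.date(9999, 1, 2)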
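
6) The paired asserts in test_date_scalar are both needed:
   datetime.datetime subclasses datetime.date, so the isinstance check
   alone would not catch a backend leaking full timestamps:

    import datetime

    ts = datetime.datetime(2017, 12, 31, 12, 30)
    assert isinstance(ts, datetime.date)  # passes: subclass relationship

    d = datetime.date(2017, 12, 31)
    assert isinstance(d, datetime.date)
    assert not isinstance(d, datetime.datetime)  # the stricter guarantee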