From 66f2ff05dbe945dc900321c3c5cd9ca89c790596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Mon, 22 May 2023 14:36:31 +0200 Subject: [PATCH] feat(datatypes): support creating Timestamp from units --- ibis/expr/datatypes/core.py | 34 ++++- ibis/expr/datatypes/tests/test_core.py | 25 ++++ ibis/expr/datatypes/tests/test_value.py | 186 +++++++++++++++++++++++- ibis/expr/operations/generic.py | 52 ++++--- 4 files changed, 265 insertions(+), 32 deletions(-) diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py index e24bf79efc86..bfa3353ff1c3 100644 --- a/ibis/expr/datatypes/core.py +++ b/ibis/expr/datatypes/core.py @@ -19,7 +19,7 @@ from ibis.common.annotations import attribute from ibis.common.collections import FrozenDict, MapSet from ibis.common.grounds import Concrete, Singleton -from ibis.common.temporal import IntervalUnit +from ibis.common.temporal import IntervalUnit, TimestampUnit from ibis.common.validators import Coercible # TODO(kszucs): we don't support union types yet @@ -316,6 +316,7 @@ def from_ibis_dtype(value: DataType) -> DataType: return value +# TODO(kszucs): turn this to lazy singledispatch @dtype.register(np.dtype) def from_numpy_dtype(value): from ibis.formats.numpy import dtype_from_numpy @@ -323,6 +324,7 @@ def from_numpy_dtype(value): return dtype_from_numpy(value) +# TODO(kszucs): turn this to lazy singledispatch @dtype.register(pd.core.dtypes.base.ExtensionDtype) def from_pandas_extension_dtype(value): from ibis.formats.pandas import dtype_from_pandas @@ -465,6 +467,36 @@ class Timestamp(Temporal, Parametric): scalar = "TimestampScalar" column = "TimestampColumn" + @classmethod + def from_unit(cls, unit, timezone=None, nullable=True): + """Return a timestamp type with the given unit and timezone.""" + unit = TimestampUnit(unit) + if unit == TimestampUnit.SECOND: + scale = 0 + elif unit == TimestampUnit.MILLISECOND: + scale = 3 + elif unit == TimestampUnit.MICROSECOND: + scale = 6 + elif unit == 
TimestampUnit.NANOSECOND: + scale = 9 + else: + raise ValueError(f"Invalid unit {unit}") + return cls(scale=scale, timezone=timezone, nullable=nullable) + + @property + def unit(self) -> TimestampUnit: + """Return the TimestampUnit implied by the scale (None or 0 means seconds).""" + if self.scale is None or self.scale == 0: + return TimestampUnit.SECOND + elif 1 <= self.scale <= 3: + return TimestampUnit.MILLISECOND + elif 4 <= self.scale <= 6: + return TimestampUnit.MICROSECOND + elif 7 <= self.scale <= 9: + return TimestampUnit.NANOSECOND + else: + raise ValueError(f"Invalid scale {self.scale}") + @property def _pretty_piece(self) -> str: pieces = [ diff --git a/ibis/expr/datatypes/tests/test_core.py b/ibis/expr/datatypes/tests/test_core.py index f6ac944ca2be..cb7ff61a0662 100644 --- a/ibis/expr/datatypes/tests/test_core.py +++ b/ibis/expr/datatypes/tests/test_core.py @@ -10,6 +10,7 @@ import pytest import ibis.expr.datatypes as dt +from ibis.common.temporal import TimestampUnit def test_validate_type(): @@ -501,6 +502,30 @@ def test_timestamp_with_scale_no_tz(scale): assert dt.parse(f"timestamp({scale:d})") == dt.Timestamp(scale=scale) +def test_timestamp_unit(): + assert dt.Timestamp().unit == TimestampUnit.SECOND + assert dt.Timestamp(scale=0).unit == TimestampUnit.SECOND + for scale in range(1, 4): + assert dt.Timestamp(scale=scale).unit == TimestampUnit.MILLISECOND + for scale in range(4, 7): + assert dt.Timestamp(scale=scale).unit == TimestampUnit.MICROSECOND + for scale in range(7, 10): + assert dt.Timestamp(scale=scale).unit == TimestampUnit.NANOSECOND + + +def test_timestamp_from_unit(): + assert dt.Timestamp.from_unit('s') == dt.Timestamp(scale=0) + assert dt.Timestamp.from_unit('ms', timezone='UTC') == dt.Timestamp( + scale=3, timezone='UTC' + ) + assert dt.Timestamp.from_unit('us', nullable=True) == dt.Timestamp( + scale=6, nullable=True + ) + assert dt.Timestamp.from_unit('ns', timezone='UTC', nullable=False) == dt.Timestamp( + scale=9, timezone='UTC', nullable=False + ) + + def
get_leaf_classes(op): for child_class in op.__subclasses__(): yield child_class diff --git a/ibis/expr/datatypes/tests/test_value.py b/ibis/expr/datatypes/tests/test_value.py index 100dfa45674a..13ab9bb35b7a 100644 --- a/ibis/expr/datatypes/tests/test_value.py +++ b/ibis/expr/datatypes/tests/test_value.py @@ -1,12 +1,13 @@ -import datetime import decimal import enum from collections import OrderedDict +from datetime import date, datetime, timedelta import numpy as np import pandas as pd import pytest import pytz +from packaging.version import parse as vparse import ibis.expr.datatypes as dt @@ -24,14 +25,14 @@ class Foo(enum.Enum): (True, dt.boolean), ('foo', dt.string), (b'fooblob', dt.binary), - (datetime.date.today(), dt.date), - (datetime.datetime.now(), dt.timestamp), - (datetime.timedelta(days=3), dt.Interval(unit='D')), + (date.today(), dt.date), + (datetime.now(), dt.timestamp), + (timedelta(days=3), dt.Interval(unit='D')), (pd.Timedelta('5 hours'), dt.Interval(unit='h')), (pd.Timedelta('7 minutes'), dt.Interval(unit='m')), - (datetime.timedelta(seconds=9), dt.Interval(unit='s')), + (timedelta(seconds=9), dt.Interval(unit='s')), (pd.Timedelta('11 milliseconds'), dt.Interval(unit='ms')), - (datetime.timedelta(microseconds=15), dt.Interval(unit='us')), + (timedelta(microseconds=15), dt.Interval(unit='us')), (pd.Timedelta('17 nanoseconds'), dt.Interval(unit='ns')), # numeric types (5, dt.int8), @@ -115,7 +116,178 @@ def test_infer_mixed_type_fails(): def test_infer_timestamp_with_tz(): - now_raw = datetime.datetime.utcnow() + now_raw = datetime.utcnow() now_utc = pytz.utc.localize(now_raw) assert now_utc.tzinfo == pytz.UTC assert dt.infer(now_utc).timezone == str(pytz.UTC) + + +# str, pd.Timestamp, datetime, np.datetime64, numbers.Real +@pytest.mark.parametrize( + ("value", "expected"), + [ + ("2019-01-01", datetime(2019, 1, 1)), + ("2019-01-01 00:00:00", datetime(2019, 1, 1)), + ("2019-01-01 01:02:03.000004", datetime(2019, 1, 1, 1, 2, 3, 4)), + ( + 
"2019-01-01 01:02:03.000004+00:00", + datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.utc), + ), + ( + "2019-01-01 01:02:03.000004+01:00", + datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(60)), + ), + ( + "2019-01-01 01:02:03.000004-01:00", + datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(-60)), + ), + ( + "2019-01-01 01:02:03.000004+01", + datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(60)), + ), + (datetime(2019, 1, 1), datetime(2019, 1, 1)), + (datetime(2019, 1, 1, 1, 2, 3, 4), datetime(2019, 1, 1, 1, 2, 3, 4)), + (pd.Timestamp("2019-01-01"), datetime(2019, 1, 1)), + (pd.Timestamp("2019-01-01 00:00:00"), datetime(2019, 1, 1)), + (pd.Timestamp("2019-01-01 01:02:03.000004"), datetime(2019, 1, 1, 1, 2, 3, 4)), + (np.datetime64("2019-01-01"), datetime(2019, 1, 1)), + (np.datetime64("2019-01-01 01:02:03"), datetime(2019, 1, 1, 1, 2, 3)), + ], +) +def test_normalize_timestamp(value, expected): + normalized = dt.normalize(dt.timestamp, value) + assert normalized == expected + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ("2019-01-01", date(2019, 1, 1)), + ("2019-01-01 00:00:00", date(2019, 1, 1)), + ("2019-01-01 01:02:03.000004", date(2019, 1, 1)), + (datetime(2019, 1, 1), date(2019, 1, 1)), + (datetime(2019, 1, 1, 1, 2, 3, 4), date(2019, 1, 1)), + (pd.Timestamp("2019-01-01"), date(2019, 1, 1)), + (pd.Timestamp("2019-01-01 00:00:00"), date(2019, 1, 1)), + (pd.Timestamp("2019-01-01 01:02:03.000004"), date(2019, 1, 1)), + (np.datetime64("2019-01-01"), date(2019, 1, 1)), + (np.datetime64("2019-01-01 01:02:03"), date(2019, 1, 1)), + ], +) +def test_normalize_date(value, expected): + normalized = dt.normalize(dt.date, value) + assert normalized == expected + + +@pytest.mark.parametrize( + ('value', 'expected_dtype'), + [ + # numpy types + (np.int8(5), dt.int8), + (np.int16(-1), dt.int16), + (np.int32(2), dt.int32), + (np.int64(-5), dt.int64), + (np.uint8(5), dt.uint8), + (np.uint16(50), dt.uint16), + (np.uint32(500), dt.uint32), + 
(np.uint64(5000), dt.uint64), + (np.float32(5.5), dt.float32), + (np.float64(5.55), dt.float64), + (np.bool_(True), dt.boolean), + (np.bool_(False), dt.boolean), + # pandas types + ( + pd.Timestamp('2015-01-01 12:00:00', tz='US/Eastern'), + dt.Timestamp('US/Eastern'), + ), + ], +) +def test_infer_numpy_scalar(value, expected_dtype): + assert dt.infer(value) == expected_dtype + + +@pytest.mark.parametrize( + ('numpy_dtype', 'ibis_dtype'), + [ + (np.bool_, dt.boolean), + (np.int8, dt.int8), + (np.int16, dt.int16), + (np.int32, dt.int32), + (np.int64, dt.int64), + (np.uint8, dt.uint8), + (np.uint16, dt.uint16), + (np.uint32, dt.uint32), + (np.uint64, dt.uint64), + (np.float16, dt.float16), + (np.float32, dt.float32), + (np.float64, dt.float64), + (np.double, dt.double), + (np.str_, dt.string), + (np.datetime64, dt.timestamp), + ], +) +def test_from_numpy_dtype(numpy_dtype, ibis_dtype): + assert dt.dtype(np.dtype(numpy_dtype)) == ibis_dtype + + +def test_from_numpy_timedelta(): + if vparse(pytest.importorskip("pyarrow").__version__) < vparse("9"): + pytest.skip("pyarrow < 9 globally mutates the timedelta64 numpy dtype") + + assert dt.dtype(np.dtype(np.timedelta64)) == dt.interval + + +@pytest.mark.parametrize( + ('numpy_array', 'expected_dtypes'), + [ + # Explicitly-defined dtype + (np.array([1, 2, 3], dtype='int8'), (dt.Array(dt.int8),)), + (np.array([1, 2, 3], dtype='int16'), (dt.Array(dt.int16),)), + (np.array([1, 2, 3], dtype='int32'), (dt.Array(dt.int32),)), + (np.array([1, 2, 3], dtype='int64'), (dt.Array(dt.int64),)), + (np.array([1, 2, 3], dtype='uint8'), (dt.Array(dt.uint8),)), + (np.array([1, 2, 3], dtype='uint16'), (dt.Array(dt.uint16),)), + (np.array([1, 2, 3], dtype='uint32'), (dt.Array(dt.uint32),)), + (np.array([1, 2, 3], dtype='uint64'), (dt.Array(dt.uint64),)), + (np.array([1.0, 2.0, 3.0], dtype='float32'), (dt.Array(dt.float32),)), + (np.array([1.0, 2.0, 3.0], dtype='float64'), (dt.Array(dt.float64),)), + (np.array([True, False, True], dtype='bool'), 
(dt.Array(dt.boolean),)), + # Implicit dtype + # Integer array could be inferred to int64 or int32 depending on system + (np.array([1, 2, 3]), (dt.Array(dt.int64), dt.Array(dt.int32))), + (np.array([1.0, 2.0, 3.0]), (dt.Array(dt.float64),)), + (np.array([np.nan, np.nan, np.nan]), (dt.Array(dt.float64),)), + (np.array([True, False, True]), (dt.Array(dt.boolean),)), + (np.array(['1', '2', '3']), (dt.Array(dt.string),)), + ( + np.array( + [ + pd.Timestamp('2015-01-01 12:00:00'), + pd.Timestamp('2015-01-02 12:00:00'), + pd.Timestamp('2015-01-03 12:00:00'), + ] + ), + (dt.Array(dt.Timestamp()), dt.Array(dt.Timestamp(scale=9))), + ), + # Implied from object dtype + (np.array([1, 2, 3], dtype=object), (dt.Array(dt.int64),)), + (np.array([1.0, 2.0, 3.0], dtype=object), (dt.Array(dt.float64),)), + (np.array([True, False, True], dtype=object), (dt.Array(dt.boolean),)), + (np.array(['1', '2', '3'], dtype=object), (dt.Array(dt.string),)), + ( + np.array( + [ + pd.Timestamp('2015-01-01 12:00:00'), + pd.Timestamp('2015-01-02 12:00:00'), + pd.Timestamp('2015-01-03 12:00:00'), + ], + dtype=object, + ), + (dt.Array(dt.Timestamp()), dt.Array(dt.Timestamp(scale=9))), + ), + ], +) +def test_infer_numpy_array(numpy_array, expected_dtypes): + pandas_series = pd.Series(numpy_array) + assert dt.infer(numpy_array) in expected_dtypes + assert dt.infer(pandas_series) in expected_dtypes diff --git a/ibis/expr/operations/generic.py b/ibis/expr/operations/generic.py index 1c0768d08b76..a9eb3815de4a 100644 --- a/ibis/expr/operations/generic.py +++ b/ibis/expr/operations/generic.py @@ -9,7 +9,6 @@ import uuid from operator import attrgetter -import numpy as np from public import public import ibis.expr.datatypes as dt @@ -169,31 +168,36 @@ class Least(Value): @public class Literal(Value): - __valid_input_types__ = ( - bytes, - datetime.date, - datetime.datetime, - datetime.time, - datetime.timedelta, - enum.Enum, - float, - frozenset, - int, - ipaddress.IPv4Address, - ipaddress.IPv6Address, - 
frozendict, - np.generic, - np.ndarray, - str, - tuple, - type(None), - uuid.UUID, - decimal.Decimal, - ) value = rlz.one_of( ( - rlz.instance_of(__valid_input_types__), - rlz.lazy_instance_of("shapely.geometry.BaseGeometry"), + rlz.instance_of( + ( + bytes, + datetime.date, + datetime.datetime, + datetime.time, + datetime.timedelta, + enum.Enum, + float, + frozenset, + int, + ipaddress.IPv4Address, + ipaddress.IPv6Address, + frozendict, + str, + tuple, + type(None), + uuid.UUID, + decimal.Decimal, + ) + ), + rlz.lazy_instance_of( + ( + "shapely.geometry.BaseGeometry", + "numpy.generic", + "numpy.ndarray", + ) + ), ) ) dtype = rlz.datatype