Skip to content

Commit

Permalink
feat(datatypes): support creating Timestamp from units
Browse files Browse the repository at this point in the history
  • Loading branch information
kszucs committed May 22, 2023
1 parent 2d14ccc commit 66f2ff0
Show file tree
Hide file tree
Showing 4 changed files with 265 additions and 32 deletions.
34 changes: 33 additions & 1 deletion ibis/expr/datatypes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from ibis.common.annotations import attribute
from ibis.common.collections import FrozenDict, MapSet
from ibis.common.grounds import Concrete, Singleton
from ibis.common.temporal import IntervalUnit
from ibis.common.temporal import IntervalUnit, TimestampUnit
from ibis.common.validators import Coercible

# TODO(kszucs): we don't support union types yet
Expand Down Expand Up @@ -316,13 +316,15 @@ def from_ibis_dtype(value: DataType) -> DataType:
return value


# TODO(kszucs): turn this to lazy singledispatch
@dtype.register(np.dtype)
def from_numpy_dtype(value):
    """Translate a numpy ``dtype`` object into the matching ibis datatype."""
    # Imported lazily so merely loading this module does not pull in the
    # numpy-format machinery.
    from ibis.formats.numpy import dtype_from_numpy

    converted = dtype_from_numpy(value)
    return converted


# TODO(kszucs): turn this to lazy singledispatch
@dtype.register(pd.core.dtypes.base.ExtensionDtype)
def from_pandas_extension_dtype(value):
from ibis.formats.pandas import dtype_from_pandas
Expand Down Expand Up @@ -465,6 +467,36 @@ class Timestamp(Temporal, Parametric):
scalar = "TimestampScalar"
column = "TimestampColumn"

@classmethod
def from_unit(cls, unit, timezone=None, nullable=True):
    """Construct a timestamp type whose scale corresponds to *unit*.

    ``unit`` may be anything ``TimestampUnit`` accepts (e.g. ``'s'``,
    ``'ms'``, ``'us'``, ``'ns'``); ``timezone`` and ``nullable`` are
    forwarded to the constructor unchanged.
    """
    unit = TimestampUnit(unit)
    # Number of fractional-second digits carried by each unit.
    scales = {
        TimestampUnit.SECOND: 0,
        TimestampUnit.MILLISECOND: 3,
        TimestampUnit.MICROSECOND: 6,
        TimestampUnit.NANOSECOND: 9,
    }
    scale = scales.get(unit)
    if scale is None:
        raise ValueError(f"Invalid unit {unit}")
    return cls(scale=scale, timezone=timezone, nullable=nullable)

@property
def unit(self) -> str:
    """Map this type's ``scale`` onto the coarsest matching ``TimestampUnit``."""
    scale = self.scale
    # An unspecified scale is treated the same as whole seconds.
    if scale is None or scale == 0:
        return TimestampUnit.SECOND
    if 1 <= scale <= 3:
        return TimestampUnit.MILLISECOND
    if 4 <= scale <= 6:
        return TimestampUnit.MICROSECOND
    if 7 <= scale <= 9:
        return TimestampUnit.NANOSECOND
    raise ValueError(f"Invalid scale {self.scale}")

@property
def _pretty_piece(self) -> str:
pieces = [
Expand Down
25 changes: 25 additions & 0 deletions ibis/expr/datatypes/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest

import ibis.expr.datatypes as dt
from ibis.common.temporal import TimestampUnit


def test_validate_type():
Expand Down Expand Up @@ -501,6 +502,30 @@ def test_timestamp_with_scale_no_tz(scale):
assert dt.parse(f"timestamp({scale:d})") == dt.Timestamp(scale=scale)


def test_timestamp_unit():
    """Every scale from 0 through 9 (and the default) maps to its unit."""
    assert dt.Timestamp().unit == TimestampUnit.SECOND
    assert dt.Timestamp(scale=0).unit == TimestampUnit.SECOND
    # was range(1, 3), which skipped scale=3 — the millisecond upper bound
    # defined by Timestamp.unit — leaving that boundary untested
    for scale in range(1, 4):
        assert dt.Timestamp(scale=scale).unit == TimestampUnit.MILLISECOND
    for scale in range(4, 7):
        assert dt.Timestamp(scale=scale).unit == TimestampUnit.MICROSECOND
    for scale in range(7, 10):
        assert dt.Timestamp(scale=scale).unit == TimestampUnit.NANOSECOND


def test_timestamp_from_unit():
    """``Timestamp.from_unit`` maps unit abbreviations onto scales 0/3/6/9."""
    cases = [
        (('s',), {}, dt.Timestamp(scale=0)),
        (('ms',), {'timezone': 'UTC'}, dt.Timestamp(scale=3, timezone='UTC')),
        (('us',), {'nullable': True}, dt.Timestamp(scale=6, nullable=True)),
        (
            ('ns',),
            {'timezone': 'UTC', 'nullable': False},
            dt.Timestamp(scale=9, timezone='UTC', nullable=False),
        ),
    ]
    for args, kwargs, expected in cases:
        assert dt.Timestamp.from_unit(*args, **kwargs) == expected


def get_leaf_classes(op):
for child_class in op.__subclasses__():
yield child_class
Expand Down
186 changes: 179 additions & 7 deletions ibis/expr/datatypes/tests/test_value.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import datetime
import decimal
import enum
from collections import OrderedDict
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
import pytest
import pytz
from packaging.version import parse as vparse

import ibis.expr.datatypes as dt

Expand All @@ -24,14 +25,14 @@ class Foo(enum.Enum):
(True, dt.boolean),
('foo', dt.string),
(b'fooblob', dt.binary),
(datetime.date.today(), dt.date),
(datetime.datetime.now(), dt.timestamp),
(datetime.timedelta(days=3), dt.Interval(unit='D')),
(date.today(), dt.date),
(datetime.now(), dt.timestamp),
(timedelta(days=3), dt.Interval(unit='D')),
(pd.Timedelta('5 hours'), dt.Interval(unit='h')),
(pd.Timedelta('7 minutes'), dt.Interval(unit='m')),
(datetime.timedelta(seconds=9), dt.Interval(unit='s')),
(timedelta(seconds=9), dt.Interval(unit='s')),
(pd.Timedelta('11 milliseconds'), dt.Interval(unit='ms')),
(datetime.timedelta(microseconds=15), dt.Interval(unit='us')),
(timedelta(microseconds=15), dt.Interval(unit='us')),
(pd.Timedelta('17 nanoseconds'), dt.Interval(unit='ns')),
# numeric types
(5, dt.int8),
Expand Down Expand Up @@ -115,7 +116,178 @@ def test_infer_mixed_type_fails():


def test_infer_timestamp_with_tz():
    """A UTC-localized datetime infers a timezone-aware timestamp type."""
    aware = pytz.utc.localize(datetime.utcnow())
    assert aware.tzinfo == pytz.UTC
    assert dt.infer(aware).timezone == str(pytz.UTC)


# str, pd.Timestamp, datetime, np.datetime64, numbers.Real
@pytest.mark.parametrize(
    ("value", "expected"),
    [
        ("2019-01-01", datetime(2019, 1, 1)),
        ("2019-01-01 00:00:00", datetime(2019, 1, 1)),
        ("2019-01-01 01:02:03.000004", datetime(2019, 1, 1, 1, 2, 3, 4)),
        (
            "2019-01-01 01:02:03.000004+00:00",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.utc),
        ),
        (
            "2019-01-01 01:02:03.000004+01:00",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(60)),
        ),
        (
            "2019-01-01 01:02:03.000004-01:00",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(-60)),
        ),
        (
            "2019-01-01 01:02:03.000004+01",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(60)),
        ),
        (datetime(2019, 1, 1), datetime(2019, 1, 1)),
        (datetime(2019, 1, 1, 1, 2, 3, 4), datetime(2019, 1, 1, 1, 2, 3, 4)),
        (pd.Timestamp("2019-01-01"), datetime(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 00:00:00"), datetime(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 01:02:03.000004"), datetime(2019, 1, 1, 1, 2, 3, 4)),
        (np.datetime64("2019-01-01"), datetime(2019, 1, 1)),
        (np.datetime64("2019-01-01 01:02:03"), datetime(2019, 1, 1, 1, 2, 3)),
    ],
)
def test_normalize_timestamp(value, expected):
    """Normalizing to ``dt.timestamp`` yields the expected stdlib datetime."""
    assert dt.normalize(dt.timestamp, value) == expected


@pytest.mark.parametrize(
    ("value", "expected"),
    [
        ("2019-01-01", date(2019, 1, 1)),
        ("2019-01-01 00:00:00", date(2019, 1, 1)),
        ("2019-01-01 01:02:03.000004", date(2019, 1, 1)),
        (datetime(2019, 1, 1), date(2019, 1, 1)),
        (datetime(2019, 1, 1, 1, 2, 3, 4), date(2019, 1, 1)),
        (pd.Timestamp("2019-01-01"), date(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 00:00:00"), date(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 01:02:03.000004"), date(2019, 1, 1)),
        (np.datetime64("2019-01-01"), date(2019, 1, 1)),
        (np.datetime64("2019-01-01 01:02:03"), date(2019, 1, 1)),
    ],
)
def test_normalize_date(value, expected):
    """Normalizing to ``dt.date`` drops any time-of-day component."""
    assert dt.normalize(dt.date, value) == expected


@pytest.mark.parametrize(
    ('value', 'expected_dtype'),
    [
        # numpy types
        (np.int8(5), dt.int8),
        (np.int16(-1), dt.int16),
        (np.int32(2), dt.int32),
        (np.int64(-5), dt.int64),
        (np.uint8(5), dt.uint8),
        (np.uint16(50), dt.uint16),
        (np.uint32(500), dt.uint32),
        (np.uint64(5000), dt.uint64),
        (np.float32(5.5), dt.float32),
        (np.float64(5.55), dt.float64),
        (np.bool_(True), dt.boolean),
        (np.bool_(False), dt.boolean),
        # pandas types
        (
            pd.Timestamp('2015-01-01 12:00:00', tz='US/Eastern'),
            dt.Timestamp('US/Eastern'),
        ),
    ],
)
def test_infer_numpy_scalar(value, expected_dtype):
    """Each numpy/pandas scalar infers to its matching ibis dtype."""
    inferred = dt.infer(value)
    assert inferred == expected_dtype


@pytest.mark.parametrize(
    ('numpy_dtype', 'ibis_dtype'),
    [
        (np.bool_, dt.boolean),
        (np.int8, dt.int8),
        (np.int16, dt.int16),
        (np.int32, dt.int32),
        (np.int64, dt.int64),
        (np.uint8, dt.uint8),
        (np.uint16, dt.uint16),
        (np.uint32, dt.uint32),
        (np.uint64, dt.uint64),
        (np.float16, dt.float16),
        (np.float32, dt.float32),
        (np.float64, dt.float64),
        (np.double, dt.double),
        (np.str_, dt.string),
        (np.datetime64, dt.timestamp),
    ],
)
def test_from_numpy_dtype(numpy_dtype, ibis_dtype):
    """``dt.dtype`` accepts ``np.dtype`` objects for every primitive type."""
    wrapped = np.dtype(numpy_dtype)
    assert dt.dtype(wrapped) == ibis_dtype


def test_from_numpy_timedelta():
    """numpy ``timedelta64`` maps to ``dt.interval`` (needs pyarrow >= 9)."""
    pyarrow = pytest.importorskip("pyarrow")
    if vparse(pyarrow.__version__) < vparse("9"):
        pytest.skip("pyarrow < 9 globally mutates the timedelta64 numpy dtype")
    assert dt.dtype(np.dtype(np.timedelta64)) == dt.interval


@pytest.mark.parametrize(
    ('numpy_array', 'expected_dtypes'),
    [
        # Explicitly-defined dtype
        (np.array([1, 2, 3], dtype='int8'), (dt.Array(dt.int8),)),
        (np.array([1, 2, 3], dtype='int16'), (dt.Array(dt.int16),)),
        (np.array([1, 2, 3], dtype='int32'), (dt.Array(dt.int32),)),
        (np.array([1, 2, 3], dtype='int64'), (dt.Array(dt.int64),)),
        (np.array([1, 2, 3], dtype='uint8'), (dt.Array(dt.uint8),)),
        (np.array([1, 2, 3], dtype='uint16'), (dt.Array(dt.uint16),)),
        (np.array([1, 2, 3], dtype='uint32'), (dt.Array(dt.uint32),)),
        (np.array([1, 2, 3], dtype='uint64'), (dt.Array(dt.uint64),)),
        (np.array([1.0, 2.0, 3.0], dtype='float32'), (dt.Array(dt.float32),)),
        (np.array([1.0, 2.0, 3.0], dtype='float64'), (dt.Array(dt.float64),)),
        (np.array([True, False, True], dtype='bool'), (dt.Array(dt.boolean),)),
        # Implicit dtype
        # Integer array could be inferred to int64 or int32 depending on system
        (np.array([1, 2, 3]), (dt.Array(dt.int64), dt.Array(dt.int32))),
        (np.array([1.0, 2.0, 3.0]), (dt.Array(dt.float64),)),
        (np.array([np.nan, np.nan, np.nan]), (dt.Array(dt.float64),)),
        (np.array([True, False, True]), (dt.Array(dt.boolean),)),
        (np.array(['1', '2', '3']), (dt.Array(dt.string),)),
        (
            np.array(
                [
                    pd.Timestamp('2015-01-01 12:00:00'),
                    pd.Timestamp('2015-01-02 12:00:00'),
                    pd.Timestamp('2015-01-03 12:00:00'),
                ]
            ),
            (dt.Array(dt.Timestamp()), dt.Array(dt.Timestamp(scale=9))),
        ),
        # Implied from object dtype
        (np.array([1, 2, 3], dtype=object), (dt.Array(dt.int64),)),
        (np.array([1.0, 2.0, 3.0], dtype=object), (dt.Array(dt.float64),)),
        (np.array([True, False, True], dtype=object), (dt.Array(dt.boolean),)),
        (np.array(['1', '2', '3'], dtype=object), (dt.Array(dt.string),)),
        (
            np.array(
                [
                    pd.Timestamp('2015-01-01 12:00:00'),
                    pd.Timestamp('2015-01-02 12:00:00'),
                    pd.Timestamp('2015-01-03 12:00:00'),
                ],
                dtype=object,
            ),
            (dt.Array(dt.Timestamp()), dt.Array(dt.Timestamp(scale=9))),
        ),
    ],
)
def test_infer_numpy_array(numpy_array, expected_dtypes):
    """Inference on an ndarray and on the equivalent pandas Series agree."""
    series = pd.Series(numpy_array)
    # Both container flavors must land in the acceptable dtype set.
    for candidate in (numpy_array, series):
        assert dt.infer(candidate) in expected_dtypes
52 changes: 28 additions & 24 deletions ibis/expr/operations/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import uuid
from operator import attrgetter

import numpy as np
from public import public

import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -169,31 +168,36 @@ class Least(Value):

@public
class Literal(Value):
__valid_input_types__ = (
bytes,
datetime.date,
datetime.datetime,
datetime.time,
datetime.timedelta,
enum.Enum,
float,
frozenset,
int,
ipaddress.IPv4Address,
ipaddress.IPv6Address,
frozendict,
np.generic,
np.ndarray,
str,
tuple,
type(None),
uuid.UUID,
decimal.Decimal,
)
value = rlz.one_of(
(
rlz.instance_of(__valid_input_types__),
rlz.lazy_instance_of("shapely.geometry.BaseGeometry"),
rlz.instance_of(
(
bytes,
datetime.date,
datetime.datetime,
datetime.time,
datetime.timedelta,
enum.Enum,
float,
frozenset,
int,
ipaddress.IPv4Address,
ipaddress.IPv6Address,
frozendict,
str,
tuple,
type(None),
uuid.UUID,
decimal.Decimal,
)
),
rlz.lazy_instance_of(
(
"shapely.geometry.BaseGeometry",
"numpy.generic",
"numpy.ndarray",
)
),
)
)
dtype = rlz.datatype
Expand Down

0 comments on commit 66f2ff0

Please sign in to comment.