Skip to content

Commit

Permalink
feat(datatypes): support creating Timestamp from units
Browse files Browse the repository at this point in the history
  • Loading branch information
kszucs committed May 22, 2023
1 parent 2d14ccc commit 66f2ff0
Show file tree
Hide file tree
Showing 4 changed files with 265 additions and 32 deletions.
34 changes: 33 additions & 1 deletion ibis/expr/datatypes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from ibis.common.annotations import attribute
from ibis.common.collections import FrozenDict, MapSet
from ibis.common.grounds import Concrete, Singleton
from ibis.common.temporal import IntervalUnit
from ibis.common.temporal import IntervalUnit, TimestampUnit
from ibis.common.validators import Coercible

# TODO(kszucs): we don't support union types yet
Expand Down Expand Up @@ -316,13 +316,15 @@ def from_ibis_dtype(value: DataType) -> DataType:
return value


# TODO(kszucs): turn this to lazy singledispatch
@dtype.register(np.dtype)
def from_numpy_dtype(value):
    """Translate a numpy ``dtype`` object into the matching ibis datatype."""
    # Imported lazily so merely loading this module does not pull in the
    # numpy-format machinery.
    from ibis.formats.numpy import dtype_from_numpy

    converted = dtype_from_numpy(value)
    return converted


# TODO(kszucs): turn this to lazy singledispatch
@dtype.register(pd.core.dtypes.base.ExtensionDtype)
def from_pandas_extension_dtype(value):
from ibis.formats.pandas import dtype_from_pandas
Expand Down Expand Up @@ -465,6 +467,36 @@ class Timestamp(Temporal, Parametric):
scalar = "TimestampScalar"
column = "TimestampColumn"

@classmethod
def from_unit(cls, unit, timezone=None, nullable=True):
    """Construct a timestamp type whose scale corresponds to *unit*.

    ``unit`` may be anything ``TimestampUnit`` accepts (e.g. ``'s'``,
    ``'ms'``, ``'us'``, ``'ns'``); ``timezone`` and ``nullable`` are
    forwarded to the constructor unchanged.
    """
    unit = TimestampUnit(unit)
    # Number of fractional-second digits carried by each unit.
    scales = {
        TimestampUnit.SECOND: 0,
        TimestampUnit.MILLISECOND: 3,
        TimestampUnit.MICROSECOND: 6,
        TimestampUnit.NANOSECOND: 9,
    }
    scale = scales.get(unit)
    if scale is None:
        raise ValueError(f"Invalid unit {unit}")
    return cls(scale=scale, timezone=timezone, nullable=nullable)

@property
def unit(self) -> str:
    """Map this type's ``scale`` onto the coarsest matching ``TimestampUnit``."""
    scale = self.scale
    # An unspecified scale is treated the same as whole seconds.
    if scale is None or scale == 0:
        return TimestampUnit.SECOND
    if 1 <= scale <= 3:
        return TimestampUnit.MILLISECOND
    if 4 <= scale <= 6:
        return TimestampUnit.MICROSECOND
    if 7 <= scale <= 9:
        return TimestampUnit.NANOSECOND
    raise ValueError(f"Invalid scale {self.scale}")

@property
def _pretty_piece(self) -> str:
pieces = [
Expand Down
25 changes: 25 additions & 0 deletions ibis/expr/datatypes/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import pytest

import ibis.expr.datatypes as dt
from ibis.common.temporal import TimestampUnit


def test_validate_type():
Expand Down Expand Up @@ -501,6 +502,30 @@ def test_timestamp_with_scale_no_tz(scale):
assert dt.parse(f"timestamp({scale:d})") == dt.Timestamp(scale=scale)


def test_timestamp_unit():
    """Every scale from 0 through 9 (and the default) maps to its unit."""
    assert dt.Timestamp().unit == TimestampUnit.SECOND
    assert dt.Timestamp(scale=0).unit == TimestampUnit.SECOND
    # was range(1, 3), which skipped scale=3 — the millisecond upper bound
    # defined by Timestamp.unit — leaving that boundary untested
    for scale in range(1, 4):
        assert dt.Timestamp(scale=scale).unit == TimestampUnit.MILLISECOND
    for scale in range(4, 7):
        assert dt.Timestamp(scale=scale).unit == TimestampUnit.MICROSECOND
    for scale in range(7, 10):
        assert dt.Timestamp(scale=scale).unit == TimestampUnit.NANOSECOND


def test_timestamp_from_unit():
    """``Timestamp.from_unit`` maps unit abbreviations onto scales 0/3/6/9."""
    cases = [
        (('s',), {}, dt.Timestamp(scale=0)),
        (('ms',), {'timezone': 'UTC'}, dt.Timestamp(scale=3, timezone='UTC')),
        (('us',), {'nullable': True}, dt.Timestamp(scale=6, nullable=True)),
        (
            ('ns',),
            {'timezone': 'UTC', 'nullable': False},
            dt.Timestamp(scale=9, timezone='UTC', nullable=False),
        ),
    ]
    for args, kwargs, expected in cases:
        assert dt.Timestamp.from_unit(*args, **kwargs) == expected


def get_leaf_classes(op):
for child_class in op.__subclasses__():
yield child_class
Expand Down
186 changes: 179 additions & 7 deletions ibis/expr/datatypes/tests/test_value.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import datetime
import decimal
import enum
from collections import OrderedDict
from datetime import date, datetime, timedelta

import numpy as np
import pandas as pd
import pytest
import pytz
from packaging.version import parse as vparse

import ibis.expr.datatypes as dt

Expand All @@ -24,14 +25,14 @@ class Foo(enum.Enum):
(True, dt.boolean),
('foo', dt.string),
(b'fooblob', dt.binary),
(datetime.date.today(), dt.date),
(datetime.datetime.now(), dt.timestamp),
(datetime.timedelta(days=3), dt.Interval(unit='D')),
(date.today(), dt.date),
(datetime.now(), dt.timestamp),
(timedelta(days=3), dt.Interval(unit='D')),
(pd.Timedelta('5 hours'), dt.Interval(unit='h')),
(pd.Timedelta('7 minutes'), dt.Interval(unit='m')),
(datetime.timedelta(seconds=9), dt.Interval(unit='s')),
(timedelta(seconds=9), dt.Interval(unit='s')),
(pd.Timedelta('11 milliseconds'), dt.Interval(unit='ms')),
(datetime.timedelta(microseconds=15), dt.Interval(unit='us')),
(timedelta(microseconds=15), dt.Interval(unit='us')),
(pd.Timedelta('17 nanoseconds'), dt.Interval(unit='ns')),
# numeric types
(5, dt.int8),
Expand Down Expand Up @@ -115,7 +116,178 @@ def test_infer_mixed_type_fails():


def test_infer_timestamp_with_tz():
    """A UTC-localized datetime infers a timezone-aware timestamp type."""
    aware = pytz.utc.localize(datetime.utcnow())
    assert aware.tzinfo == pytz.UTC
    assert dt.infer(aware).timezone == str(pytz.UTC)


# str, pd.Timestamp, datetime, np.datetime64, numbers.Real
@pytest.mark.parametrize(
    ("value", "expected"),
    [
        ("2019-01-01", datetime(2019, 1, 1)),
        ("2019-01-01 00:00:00", datetime(2019, 1, 1)),
        ("2019-01-01 01:02:03.000004", datetime(2019, 1, 1, 1, 2, 3, 4)),
        (
            "2019-01-01 01:02:03.000004+00:00",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.utc),
        ),
        (
            "2019-01-01 01:02:03.000004+01:00",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(60)),
        ),
        (
            "2019-01-01 01:02:03.000004-01:00",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(-60)),
        ),
        (
            "2019-01-01 01:02:03.000004+01",
            datetime(2019, 1, 1, 1, 2, 3, 4, tzinfo=pytz.FixedOffset(60)),
        ),
        (datetime(2019, 1, 1), datetime(2019, 1, 1)),
        (datetime(2019, 1, 1, 1, 2, 3, 4), datetime(2019, 1, 1, 1, 2, 3, 4)),
        (pd.Timestamp("2019-01-01"), datetime(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 00:00:00"), datetime(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 01:02:03.000004"), datetime(2019, 1, 1, 1, 2, 3, 4)),
        (np.datetime64("2019-01-01"), datetime(2019, 1, 1)),
        (np.datetime64("2019-01-01 01:02:03"), datetime(2019, 1, 1, 1, 2, 3)),
    ],
)
def test_normalize_timestamp(value, expected):
    """Normalizing to ``dt.timestamp`` yields the expected stdlib datetime."""
    assert dt.normalize(dt.timestamp, value) == expected


@pytest.mark.parametrize(
    ("value", "expected"),
    [
        ("2019-01-01", date(2019, 1, 1)),
        ("2019-01-01 00:00:00", date(2019, 1, 1)),
        ("2019-01-01 01:02:03.000004", date(2019, 1, 1)),
        (datetime(2019, 1, 1), date(2019, 1, 1)),
        (datetime(2019, 1, 1, 1, 2, 3, 4), date(2019, 1, 1)),
        (pd.Timestamp("2019-01-01"), date(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 00:00:00"), date(2019, 1, 1)),
        (pd.Timestamp("2019-01-01 01:02:03.000004"), date(2019, 1, 1)),
        (np.datetime64("2019-01-01"), date(2019, 1, 1)),
        (np.datetime64("2019-01-01 01:02:03"), date(2019, 1, 1)),
    ],
)
def test_normalize_date(value, expected):
    """Normalizing to ``dt.date`` drops any time-of-day component."""
    assert dt.normalize(dt.date, value) == expected


@pytest.mark.parametrize(
    ('value', 'expected_dtype'),
    [
        # numpy types
        (np.int8(5), dt.int8),
        (np.int16(-1), dt.int16),
        (np.int32(2), dt.int32),
        (np.int64(-5), dt.int64),
        (np.uint8(5), dt.uint8),
        (np.uint16(50), dt.uint16),
        (np.uint32(500), dt.uint32),
        (np.uint64(5000), dt.uint64),
        (np.float32(5.5), dt.float32),
        (np.float64(5.55), dt.float64),
        (np.bool_(True), dt.boolean),
        (np.bool_(False), dt.boolean),
        # pandas types
        (
            pd.Timestamp('2015-01-01 12:00:00', tz='US/Eastern'),
            dt.Timestamp('US/Eastern'),
        ),
    ],
)
def test_infer_numpy_scalar(value, expected_dtype):
    """Each numpy/pandas scalar infers to its matching ibis dtype."""
    inferred = dt.infer(value)
    assert inferred == expected_dtype


@pytest.mark.parametrize(
    ('numpy_dtype', 'ibis_dtype'),
    [
        (np.bool_, dt.boolean),
        (np.int8, dt.int8),
        (np.int16, dt.int16),
        (np.int32, dt.int32),
        (np.int64, dt.int64),
        (np.uint8, dt.uint8),
        (np.uint16, dt.uint16),
        (np.uint32, dt.uint32),
        (np.uint64, dt.uint64),
        (np.float16, dt.float16),
        (np.float32, dt.float32),
        (np.float64, dt.float64),
        (np.double, dt.double),
        (np.str_, dt.string),
        (np.datetime64, dt.timestamp),
    ],
)
def test_from_numpy_dtype(numpy_dtype, ibis_dtype):
    """``dt.dtype`` accepts ``np.dtype`` objects for every primitive type."""
    wrapped = np.dtype(numpy_dtype)
    assert dt.dtype(wrapped) == ibis_dtype


def test_from_numpy_timedelta():
    """numpy ``timedelta64`` maps to ``dt.interval`` (needs pyarrow >= 9)."""
    pyarrow = pytest.importorskip("pyarrow")
    if vparse(pyarrow.__version__) < vparse("9"):
        pytest.skip("pyarrow < 9 globally mutates the timedelta64 numpy dtype")
    assert dt.dtype(np.dtype(np.timedelta64)) == dt.interval


@pytest.mark.parametrize(
    ('numpy_array', 'expected_dtypes'),
    [
        # Explicitly-defined dtype
        (np.array([1, 2, 3], dtype='int8'), (dt.Array(dt.int8),)),
        (np.array([1, 2, 3], dtype='int16'), (dt.Array(dt.int16),)),
        (np.array([1, 2, 3], dtype='int32'), (dt.Array(dt.int32),)),
        (np.array([1, 2, 3], dtype='int64'), (dt.Array(dt.int64),)),
        (np.array([1, 2, 3], dtype='uint8'), (dt.Array(dt.uint8),)),
        (np.array([1, 2, 3], dtype='uint16'), (dt.Array(dt.uint16),)),
        (np.array([1, 2, 3], dtype='uint32'), (dt.Array(dt.uint32),)),
        (np.array([1, 2, 3], dtype='uint64'), (dt.Array(dt.uint64),)),
        (np.array([1.0, 2.0, 3.0], dtype='float32'), (dt.Array(dt.float32),)),
        (np.array([1.0, 2.0, 3.0], dtype='float64'), (dt.Array(dt.float64),)),
        (np.array([True, False, True], dtype='bool'), (dt.Array(dt.boolean),)),
        # Implicit dtype
        # Integer array could be inferred to int64 or int32 depending on system
        (np.array([1, 2, 3]), (dt.Array(dt.int64), dt.Array(dt.int32))),
        (np.array([1.0, 2.0, 3.0]), (dt.Array(dt.float64),)),
        (np.array([np.nan, np.nan, np.nan]), (dt.Array(dt.float64),)),
        (np.array([True, False, True]), (dt.Array(dt.boolean),)),
        (np.array(['1', '2', '3']), (dt.Array(dt.string),)),
        (
            np.array(
                [
                    pd.Timestamp('2015-01-01 12:00:00'),
                    pd.Timestamp('2015-01-02 12:00:00'),
                    pd.Timestamp('2015-01-03 12:00:00'),
                ]
            ),
            (dt.Array(dt.Timestamp()), dt.Array(dt.Timestamp(scale=9))),
        ),
        # Implied from object dtype
        (np.array([1, 2, 3], dtype=object), (dt.Array(dt.int64),)),
        (np.array([1.0, 2.0, 3.0], dtype=object), (dt.Array(dt.float64),)),
        (np.array([True, False, True], dtype=object), (dt.Array(dt.boolean),)),
        (np.array(['1', '2', '3'], dtype=object), (dt.Array(dt.string),)),
        (
            np.array(
                [
                    pd.Timestamp('2015-01-01 12:00:00'),
                    pd.Timestamp('2015-01-02 12:00:00'),
                    pd.Timestamp('2015-01-03 12:00:00'),
                ],
                dtype=object,
            ),
            (dt.Array(dt.Timestamp()), dt.Array(dt.Timestamp(scale=9))),
        ),
    ],
)
def test_infer_numpy_array(numpy_array, expected_dtypes):
    """Inference on an ndarray and on the equivalent pandas Series agree."""
    series = pd.Series(numpy_array)
    # Both container flavors must land in the acceptable dtype set.
    for candidate in (numpy_array, series):
        assert dt.infer(candidate) in expected_dtypes
52 changes: 28 additions & 24 deletions ibis/expr/operations/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import uuid
from operator import attrgetter

import numpy as np
from public import public

import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -169,31 +168,36 @@ class Least(Value):

@public
class Literal(Value):
__valid_input_types__ = (
bytes,
datetime.date,
datetime.datetime,
datetime.time,
datetime.timedelta,
enum.Enum,
float,
frozenset,
int,
ipaddress.IPv4Address,
ipaddress.IPv6Address,
frozendict,
np.generic,
np.ndarray,
str,
tuple,
type(None),
uuid.UUID,
decimal.Decimal,
)
value = rlz.one_of(
(
rlz.instance_of(__valid_input_types__),
rlz.lazy_instance_of("shapely.geometry.BaseGeometry"),
rlz.instance_of(
(
bytes,
datetime.date,
datetime.datetime,
datetime.time,
datetime.timedelta,
enum.Enum,
float,
frozenset,
int,
ipaddress.IPv4Address,
ipaddress.IPv6Address,
frozendict,
str,
tuple,
type(None),
uuid.UUID,
decimal.Decimal,
)
),
rlz.lazy_instance_of(
(
"shapely.geometry.BaseGeometry",
"numpy.generic",
"numpy.ndarray",
)
),
)
)
dtype = rlz.datatype
Expand Down

0 comments on commit 66f2ff0

Please sign in to comment.