Skip to content

Commit

Permalink
fix(druid): get basic timestamp functionality working (#9692)
Browse files Browse the repository at this point in the history
Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
  • Loading branch information
NickCrews and cpcloud authored Jul 25, 2024
1 parent 7254f65 commit 6cd3eee
Show file tree
Hide file tree
Showing 8 changed files with 119 additions and 335 deletions.
19 changes: 16 additions & 3 deletions ci/schema/druid.sql
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,24 @@ PARTITIONED BY ALL TIME;

REPLACE INTO "functional_alltypes"
OVERWRITE ALL
SELECT *
SELECT
"id",
"bool_col",
"tinyint_col",
"smallint_col",
"int_col",
"bigint_col",
"float_col",
"double_col",
"date_string_col",
"string_col",
TIME_PARSE(CONCAT(REPLACE("timestamp_col", ' ', 'T'), 'Z')) AS "timestamp_col",
"year",
"month"
FROM TABLE(
EXTERN(
'{"type":"local","files":["/data/functional_alltypes.parquet"]}',
'{"type":"parquet"}',
'{"type":"local","files":["/data/functional_alltypes.csv"]}',
'{"type":"csv","skipHeaderRows":1,"columns":["id","bool_col","tinyint_col","smallint_col","int_col","bigint_col","float_col","double_col","date_string_col","string_col","timestamp_col","year","month"]}',
'[{"name":"id","type":"long"},{"name":"bool_col","type":"long"},{"name":"tinyint_col","type":"long"},{"name":"smallint_col","type":"long"},{"name":"int_col","type":"long"},{"name":"bigint_col","type":"long"},{"name":"float_col","type":"double"},{"name":"double_col","type":"double"},{"name":"date_string_col","type":"string"},{"name":"string_col","type":"string"},{"name":"timestamp_col","type":"string"},{"name":"year","type":"long"},{"name":"month","type":"long"}]'
)
)
Expand Down
23 changes: 18 additions & 5 deletions ibis/backends/druid/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def run_query(session: Session, query: str) -> None:
class TestConf(ServiceBackendTest):
# druid has the same rounding behavior as postgres
check_dtype = False
returned_timestamp_unit = "s"
returned_timestamp_unit = "ms"
supports_arrays = False
native_bool = True
supports_structs = False
Expand All @@ -106,13 +106,26 @@ class TestConf(ServiceBackendTest):
@property
def functional_alltypes(self) -> ir.Table:
t = self.connection.table("functional_alltypes")
# The parquet loading for booleans appears to be broken in Druid, so
# I'm using this as a workaround to make the data match what's on disk.
return t.mutate(bool_col=1 - t.id % 2)
return t.mutate(
# The parquet loading for booleans appears to be broken in Druid, so
# I'm using this as a workaround to make the data match what's on disk.
bool_col=1 - t.id % 2,
# timestamp_col is loaded as a long because druid's type system is
# awful: it does 99% of the work of a proper timestamp type, but
# encodes it as an integer. I've never seen or heard of any other
# tool that calls itself a time series database or "good for
# working with time series", that lacks a first-class timestamp
# type.
timestamp_col=t.timestamp_col.to_timestamp(unit="ms"),
)

@property
def test_files(self) -> Iterable[Path]:
return self.data_dir.joinpath("parquet").glob("*.parquet")
return [
path
for path in self.data_dir.joinpath("parquet").glob("*.parquet")
if path.name != "functional_alltypes.parquet"
] + [self.data_dir.joinpath("csv", "functional_alltypes.csv")]

def _load_data(self, **_: Any) -> None:
"""Load test data into a druid backend instance.
Expand Down
11 changes: 11 additions & 0 deletions ibis/backends/sql/compilers/druid.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@
import sqlglot.expressions as sge
import toolz

import ibis.common.exceptions as exc
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
from ibis.backends.sql.compilers.base import NULL, AggGen, SQLGlotCompiler
from ibis.backends.sql.datatypes import DruidType
from ibis.backends.sql.dialects import Druid
from ibis.common.temporal import TimestampUnit


class DruidCompiler(SQLGlotCompiler):
Expand Down Expand Up @@ -36,7 +38,9 @@ class DruidCompiler(SQLGlotCompiler):
ops.ArrayZip,
ops.CountDistinctStar,
ops.Covariance,
ops.Date,
ops.DateDelta,
ops.DateFromYMD,
ops.DayOfWeekIndex,
ops.DayOfWeekName,
ops.First,
Expand Down Expand Up @@ -169,6 +173,13 @@ def visit_Cast(self, op, *, arg, to):
return self.f.time_parse(arg)
return super().visit_Cast(op, arg=arg, to=to)

def visit_TimestampFromUNIX(self, op, *, arg, unit):
    """Compile a UNIX-epoch-to-timestamp conversion for Druid.

    Druid exposes ``MILLIS_TO_TIMESTAMP``, which only understands
    millisecond epochs, so second-resolution inputs are scaled up by
    1000 first.  Any other resolution (micro/nanoseconds) has no Druid
    equivalent and is rejected.
    """
    # Millisecond input maps directly onto Druid's native function.
    if unit == TimestampUnit.MILLISECOND:
        return self.f.millis_to_timestamp(arg)
    # Seconds are widened to milliseconds before conversion.
    if unit == TimestampUnit.SECOND:
        return self.f.millis_to_timestamp(arg * 1_000)
    raise exc.UnsupportedArgumentError(f"Druid doesn't support {unit} units")

def visit_TimestampFromYMDHMS(
self, op, *, year, month, day, hours, minutes, seconds
):
Expand Down
10 changes: 3 additions & 7 deletions ibis/backends/tests/test_aggregation.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,11 +110,6 @@ def mean_udf(s):
lambda t: t.timestamp_col.max(),
lambda t: t.timestamp_col.max(),
id="timestamp_max",
marks=pytest.mark.broken(
["druid"],
raises=PyDruidProgrammingError,
reason="Max aggregation is not supported for 'STRING' type SQL",
),
),
]

Expand Down Expand Up @@ -1174,12 +1169,13 @@ def test_string_quantile(alltypes, func):
assert result == "a"


@pytest.mark.notimpl(["bigquery", "sqlite"], raises=com.OperationNotDefinedError)
@pytest.mark.notimpl(
["bigquery", "sqlite", "druid"], raises=com.OperationNotDefinedError
)
@pytest.mark.notyet(
["impala", "mysql", "mssql", "trino", "exasol", "flink"],
raises=com.OperationNotDefinedError,
)
@pytest.mark.broken(["druid"], raises=AttributeError)
@pytest.mark.notyet(
["snowflake"],
raises=SnowflakeProgrammingError,
Expand Down
22 changes: 7 additions & 15 deletions ibis/backends/tests/test_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,11 @@
pa = pytest.importorskip("pyarrow")

limit = [
param(
42,
id="limit",
# limit not implemented for pandas-family backends
marks=[pytest.mark.notimpl(["dask", "pandas"])],
),
# limit not implemented for pandas-family backends
param(42, id="limit", marks=pytest.mark.notimpl(["dask", "pandas"])),
]

no_limit = [
param(
None,
id="nolimit",
)
]
no_limit = [param(None, id="nolimit")]

limit_no_limit = limit + no_limit

Expand Down Expand Up @@ -426,7 +417,9 @@ def test_roundtrip_delta(backend, con, alltypes, tmp_path):


@pytest.mark.notimpl(
["druid"], raises=AttributeError, reason="string type is used for timestamp_col"
["druid"],
raises=PyDruidProgrammingError,
reason="Invalid SQL generated; druid doesn't know about TIMESTAMPTZ",
)
def test_arrow_timestamp_with_time_zone(alltypes):
from ibis.formats.pyarrow import PyArrowType
Expand Down Expand Up @@ -512,9 +505,8 @@ def test_to_pandas_batches_column(backend, con, n):
assert sum(map(len, t.to_pandas_batches())) == n


@pytest.mark.notimpl(["druid"])
def test_to_pandas_batches_scalar(backend, con):
t = backend.functional_alltypes.timestamp_col.max()
t = backend.functional_alltypes.int_col.max()
expected = t.execute()

result1 = list(con.to_pandas_batches(t))
Expand Down
1 change: 0 additions & 1 deletion ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,7 +1120,6 @@ def test_between(backend, alltypes, df):
backend.assert_series_equal(result, expected)


@pytest.mark.notimpl(["druid"])
def test_interactive(alltypes, monkeypatch):
monkeypatch.setattr(ibis.options, "interactive", True)

Expand Down
8 changes: 1 addition & 7 deletions ibis/backends/tests/test_param.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,25 +124,19 @@ def test_scalar_param_map(con):
marks=[pytest.mark.notimpl(["druid"])],
),
param(
"2009-01-20 01:02:03",
"timestamp",
"timestamp_col",
id="string_timestamp",
marks=[pytest.mark.notimpl(["druid"])],
"2009-01-20 01:02:03", "timestamp", "timestamp_col", id="string_timestamp"
),
param(
datetime.date(2009, 1, 20),
"timestamp",
"timestamp_col",
id="date_timestamp",
marks=[pytest.mark.notimpl(["druid"])],
),
param(
datetime.datetime(2009, 1, 20, 1, 2, 3),
"timestamp",
"timestamp_col",
id="datetime_timestamp",
marks=[pytest.mark.notimpl(["druid"])],
),
],
)
Expand Down
Loading

0 comments on commit 6cd3eee

Please sign in to comment.