feat: support empty arrays, improve ibis.array() API
Picking out the array stuff from #8666
NickCrews committed Jun 27, 2024
1 parent 33ec754 commit 94a37ef
Showing 13 changed files with 174 additions and 50 deletions.
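In short, per the tests added in this commit, `ibis.array()` now accepts an explicit `type=` argument and can construct empty arrays. A quick sketch of the new surface (the types in comments are the inferred/declared ones):

```python
import ibis

# Element type inferred from the values, as before.
arr = ibis.array([1, 2, 3])                           # array<int8>

# New: an explicit type casts the values.
typed = ibis.array([1, 2, 3], type="array<string>")   # array<string>

# New: empty arrays are supported, but only with an explicit type,
# since there are no values to infer the element type from.
empty = ibis.array([], type="array<string>")          # array<string>
```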
6 changes: 4 additions & 2 deletions ibis/backends/dask/executor.py
@@ -28,6 +28,7 @@
plan,
)
from ibis.common.exceptions import UnboundExpressionError, UnsupportedOperationError
from ibis.formats.numpy import NumpyType

from ibis.formats.pandas import PandasData, PandasType
from ibis.util import gen_name

@@ -155,9 +156,10 @@ def mapper(df, cases):
return cls.partitionwise(mapper, kwargs, name=op.name, dtype=dtype)

@classmethod
def visit(cls, op: ops.Array, exprs):
def visit(cls, op: ops.Array, exprs, dtype):
np_type = NumpyType.from_ibis(dtype)

return cls.rowwise(
lambda row: np.array(row, dtype=object), exprs, name=op.name, dtype=object
lambda row: np.array(row, dtype=np_type), exprs, name=op.name, dtype=object
)

@classmethod
4 changes: 2 additions & 2 deletions ibis/backends/dask/helpers.py
@@ -30,7 +30,7 @@ def concat(cls, dfs, **kwargs):

@classmethod
def asseries(cls, value, like=None):
"""Ensure that value is a pandas Series object, broadcast if necessary."""
"""Ensure that value is a dask Series object, broadcast if necessary."""

if isinstance(value, dd.Series):
return value
@@ -50,7 +50,7 @@ def asseries(cls, value, like=None):
elif isinstance(value, pd.Series):
return dd.from_pandas(value, npartitions=1)
elif like is not None:
if isinstance(value, (tuple, list, dict)):
if isinstance(value, (tuple, list, dict, np.ndarray)):
fn = lambda df: pd.Series([value] * len(df), index=df.index)
else:
fn = lambda df: pd.Series(value, index=df.index)
8 changes: 6 additions & 2 deletions ibis/backends/pandas/executor.py
@@ -31,6 +31,7 @@
)
from ibis.common.dispatch import Dispatched
from ibis.common.exceptions import OperationNotDefinedError, UnboundExpressionError
from ibis.formats.numpy import NumpyType

from ibis.formats.pandas import PandasData, PandasType
from ibis.util import any_of, gen_name

@@ -49,6 +50,8 @@ def visit(cls, op: ops.Node, **kwargs):

@classmethod
def visit(cls, op: ops.Literal, value, dtype):
if value is None:
return None

if dtype.is_interval():
value = pd.Timedelta(value, dtype.unit.short)
elif dtype.is_array():
@@ -220,8 +223,9 @@ def visit(cls, op: ops.FindInSet, needle, values):
return pd.Series(result, name=op.name)

@classmethod
def visit(cls, op: ops.Array, exprs):
return cls.rowwise(lambda row: np.array(row, dtype=object), exprs)
def visit(cls, op: ops.Array, exprs, dtype):
np_val_type = NumpyType.from_ibis(dtype.value_type)

return cls.rowwise(lambda row: np.array(row, dtype=np_val_type), exprs)

@classmethod
def visit(cls, op: ops.StructColumn, names, values):
25 changes: 14 additions & 11 deletions ibis/backends/polars/compiler.py
@@ -87,25 +87,25 @@ def literal(op, **_):
value = op.value
dtype = op.dtype

if dtype.is_array():
value = pl.Series("", value)
typ = PolarsType.from_ibis(dtype)
val = pl.lit(value, dtype=typ)
return val.implode()
if dtype.is_interval():
return _make_duration(value, dtype)

typ = PolarsType.from_ibis(dtype)
if value is None:
return pl.lit(None, dtype=typ)
elif dtype.is_array():
return pl.lit(pl.Series("", value).implode(), dtype=typ)
elif dtype.is_struct():
values = [
pl.lit(v, dtype=PolarsType.from_ibis(dtype[k])).alias(k)
for k, v in value.items()
]
return pl.struct(values)
elif dtype.is_interval():
return _make_duration(value, dtype)
elif dtype.is_null():
return pl.lit(value)
elif dtype.is_binary():
return pl.lit(value)
else:
typ = PolarsType.from_ibis(dtype)
return pl.lit(op.value, dtype=typ)


@@ -974,9 +974,12 @@ def array_concat(op, **kw):


@translate.register(ops.Array)
def array_column(op, **kw):
cols = [translate(col, **kw) for col in op.exprs]
return pl.concat_list(cols)
def array_literal(op, **kw):
pdt = PolarsType.from_ibis(op.dtype)
if op.exprs:
return pl.concat_list([translate(col, **kw) for col in op.exprs]).cast(pdt)
else:
return pl.lit([], dtype=pdt)


@translate.register(ops.ArrayCollect)
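For context on the empty-array branch in `array_literal` above: an empty list literal has no elements to infer a type from, so Polars needs the dtype supplied up front. A rough standalone sketch, with illustrative values that are not part of this diff:

```python
import polars as pl

df = pl.DataFrame({"x": [1.0, 2.0]})

# Non-empty arrays can be assembled with concat_list and cast afterwards,
# mirroring the `op.exprs` branch above.
nonempty = pl.concat_list([pl.lit(1), pl.lit(2)]).cast(pl.List(pl.Int64))

# An empty literal must carry its dtype explicitly, mirroring
# `pl.lit([], dtype=pdt)` above.
empty = pl.lit([], dtype=pl.List(pl.Int64))

df.select(nonempty.alias("a"), empty.alias("b"))
```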
4 changes: 2 additions & 2 deletions ibis/backends/sql/compiler.py
@@ -1019,8 +1019,8 @@ def visit_InSubquery(self, op, *, rel, needle):
query = sg.select(STAR).from_(query)
return needle.isin(query=query)

def visit_Array(self, op, *, exprs):
return self.f.array(*exprs)
def visit_Array(self, op, *, exprs, dtype):
return self.cast(self.f.array(*exprs), dtype)

def visit_StructColumn(self, op, *, names, values):
return sge.Struct.from_arg_list(
@@ -26,7 +26,7 @@ WITH "t5" AS (
SELECT
"t0"."field_of_study",
arrayJoin(
[
CAST([
CAST(tuple('1970-71', "t0"."1970-71") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('1975-76', "t0"."1975-76") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('1980-81', "t0"."1980-81") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
@@ -45,7 +45,7 @@
CAST(tuple('2017-18', "t0"."2017-18") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('2018-19', "t0"."2018-19") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('2019-20', "t0"."2019-20") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64)))
]
] AS Array(Tuple("years" Nullable(String), "degrees" Nullable(Int64))))
) AS "__pivoted__"
FROM "humanities" AS "t0"
) AS "t1"
@@ -26,7 +26,7 @@ WITH "t5" AS (
SELECT
"t0"."field_of_study",
UNNEST(
[
CAST([
{'years': '1970-71', 'degrees': "t0"."1970-71"},
{'years': '1975-76', 'degrees': "t0"."1975-76"},
{'years': '1980-81', 'degrees': "t0"."1980-81"},
@@ -45,7 +45,7 @@
{'years': '2017-18', 'degrees': "t0"."2017-18"},
{'years': '2018-19', 'degrees': "t0"."2018-19"},
{'years': '2019-20', 'degrees': "t0"."2019-20"}
]
] AS STRUCT("years" TEXT, "degrees" BIGINT)[])
) AS "__pivoted__"
FROM "humanities" AS "t0"
) AS "t1"
85 changes: 80 additions & 5 deletions ibis/backends/tests/test_array.py
@@ -31,6 +31,7 @@
PySparkAnalysisException,
TrinoUserError,
)
from ibis.common.annotations import ValidationError
from ibis.common.collections import frozendict

pytestmark = [
@@ -72,6 +73,85 @@
# list.


def test_array_factory(con):
a = ibis.array([1, 2, 3])
assert a.type() == dt.Array(value_type=dt.Int8)
assert con.execute(a) == [1, 2, 3]

a2 = ibis.array(a)
assert a.type() == dt.Array(value_type=dt.Int8)
assert con.execute(a2) == [1, 2, 3]


@pytest.mark.broken(
["pandas"],
raises=AssertionError,
reason="results in [1, 2, 3]",
)
def test_array_factory_typed(con):
typed = ibis.array([1, 2, 3], type="array<string>")
assert con.execute(typed) == ["1", "2", "3"]

typed2 = ibis.array(ibis.array([1, 2, 3]), type="array<string>")
assert con.execute(typed2) == ["1", "2", "3"]


@pytest.mark.notimpl("flink", raises=Py4JJavaError)
@pytest.mark.notimpl(["pandas", "dask"], raises=ValueError)
def test_array_factory_empty(con):
with pytest.raises(ValidationError):
ibis.array([])

empty_typed = ibis.array([], type="array<string>")
assert empty_typed.type() == dt.Array(value_type=dt.string)
assert con.execute(empty_typed) == []


@pytest.mark.notyet(
"clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL"
)
@pytest.mark.notyet(
"flink", raises=Py4JJavaError, reason="Parameters must be of the same type"
)
def test_array_factory_null(con):
with pytest.raises(ValidationError):
ibis.array(None)
with pytest.raises(ValidationError):
ibis.array(None, type="int64")
none_typed = ibis.array(None, type="array<string>")
assert none_typed.type() == dt.Array(value_type=dt.string)
assert con.execute(none_typed) is None

nones = ibis.array([None, None], type="array<string>")
assert nones.type() == dt.Array(value_type=dt.string)
assert con.execute(nones) == [None, None]

# Execute a real value here, so the backends that don't support arrays
# actually xfail as we expect them to.
# Otherwise would have to @mark.xfail every test in this file besides this one.
assert con.execute(ibis.array([1, 2])) == [1, 2]


@pytest.mark.broken(
["datafusion", "flink", "polars"],
raises=AssertionError,
reason="[None, 1] executes to [np.nan, 1.0]",
)
@pytest.mark.broken(
["pandas"],
raises=AssertionError,
reason="even with explicit cast, results in [None, 1]",
)
def test_array_factory_null_mixed(con):
none_and_val = ibis.array([None, 1])
assert none_and_val.type() == dt.Array(value_type=dt.Int8)
assert con.execute(none_and_val) == [None, 1]

none_and_val_typed = ibis.array([None, 1], type="array<string>")
assert none_and_val_typed.type() == dt.Array(value_type=dt.String)
assert con.execute(none_and_val_typed) == [None, "1"]


def test_array_column(backend, alltypes, df):
expr = ibis.array(
[alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)]
@@ -1354,11 +1434,6 @@ def test_unnest_range(con):
id="array",
marks=[
pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest),
pytest.mark.broken(
["polars"],
reason="expression input not supported with nested arrays",
raises=TypeError,
),
],
),
],
6 changes: 5 additions & 1 deletion ibis/backends/tests/test_sql.py
@@ -31,7 +31,11 @@
marks=[
pytest.mark.never(
["impala", "mysql", "sqlite", "mssql", "exasol"],
raises=(NotImplementedError, exc.UnsupportedBackendType),
raises=(
exc.OperationNotDefinedError,
NotImplementedError,
exc.UnsupportedBackendType,
),
reason="structs not supported in the backend",
),
pytest.mark.notimpl(
13 changes: 7 additions & 6 deletions ibis/expr/operations/arrays.py
@@ -19,14 +19,15 @@ class Array(Value):
"""Construct an array."""

exprs: VarTuple[Value]
dtype: Optional[dt.Array] = None

@attribute
def shape(self):
return rlz.highest_precedence_shape(self.exprs)
shape = rlz.shape_like("exprs")

@attribute
def dtype(self):
return dt.Array(rlz.highest_precedence_dtype(self.exprs))
def __init__(self, exprs, dtype: dt.Array | None = None):
# If len(exprs) == 0, the caller is responsible for providing a dtype
if dtype is None:
dtype = dt.Array(rlz.highest_precedence_dtype(exprs))
super().__init__(exprs=exprs, dtype=dtype)


@public
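To illustrate the new optional `dtype` on the `Array` op above, a minimal sketch with hypothetical values (`ibis.array()` remains the intended entry point):

```python
import ibis
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops

# With no dtype given, it is derived from the elements, as before.
inferred = ops.Array((ibis.literal(1).op(), ibis.literal(2).op()))
assert inferred.dtype == dt.Array(dt.int8)

# With an explicit dtype, a zero-element array becomes representable;
# the caller (e.g. ibis.array) is responsible for supplying it.
empty = ops.Array((), dtype=dt.Array(dt.string))
assert empty.dtype == dt.Array(dt.string)
```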
4 changes: 4 additions & 0 deletions ibis/expr/rules.py
@@ -5,6 +5,7 @@

from public import public

import ibis.expr.datashape as ds
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
from ibis import util
@@ -16,6 +17,9 @@

@public
def highest_precedence_shape(nodes):
nodes = tuple(nodes)
if len(nodes) == 0:
return ds.scalar
return max(node.shape for node in nodes)


@@ -1,2 +1,2 @@
DummyTable
foo: Array([1])
foo: Array(exprs=[1], dtype=array<int8>)