feat: support empty arrays, improve ibis.array() API
Picking out the array stuff from #8666
NickCrews committed Jun 27, 2024
1 parent 33ec754 commit 94a37ef
Showing 13 changed files with 174 additions and 50 deletions.
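In short, per the tests added in this commit, `ibis.array()` now accepts an explicit `type=` argument and can construct empty arrays. A quick sketch of the new surface (the types in comments are the inferred/declared ones):

```python
import ibis

# Element type inferred from the values, as before.
arr = ibis.array([1, 2, 3])                           # array<int8>

# New: an explicit type casts the values.
typed = ibis.array([1, 2, 3], type="array<string>")   # array<string>

# New: empty arrays are supported, but only with an explicit type,
# since there are no values to infer the element type from.
empty = ibis.array([], type="array<string>")          # array<string>
```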
6 changes: 4 additions & 2 deletions ibis/backends/dask/executor.py
@@ -28,6 +28,7 @@
plan,
)
from ibis.common.exceptions import UnboundExpressionError, UnsupportedOperationError
from ibis.formats.numpy import NumpyType

from ibis.formats.pandas import PandasData, PandasType
from ibis.util import gen_name

@@ -155,9 +156,10 @@ def mapper(df, cases):
return cls.partitionwise(mapper, kwargs, name=op.name, dtype=dtype)

@classmethod
def visit(cls, op: ops.Array, exprs):
def visit(cls, op: ops.Array, exprs, dtype):
np_type = NumpyType.from_ibis(dtype)

return cls.rowwise(
lambda row: np.array(row, dtype=object), exprs, name=op.name, dtype=object
lambda row: np.array(row, dtype=np_type), exprs, name=op.name, dtype=object
)

@classmethod
4 changes: 2 additions & 2 deletions ibis/backends/dask/helpers.py
@@ -30,7 +30,7 @@ def concat(cls, dfs, **kwargs):

@classmethod
def asseries(cls, value, like=None):
"""Ensure that value is a pandas Series object, broadcast if necessary."""
"""Ensure that value is a dask Series object, broadcast if necessary."""

if isinstance(value, dd.Series):
return value
@@ -50,7 +50,7 @@ def asseries(cls, value, like=None):
elif isinstance(value, pd.Series):
return dd.from_pandas(value, npartitions=1)
elif like is not None:
if isinstance(value, (tuple, list, dict)):
if isinstance(value, (tuple, list, dict, np.ndarray)):
fn = lambda df: pd.Series([value] * len(df), index=df.index)
else:
fn = lambda df: pd.Series(value, index=df.index)
8 changes: 6 additions & 2 deletions ibis/backends/pandas/executor.py
@@ -31,6 +31,7 @@
)
from ibis.common.dispatch import Dispatched
from ibis.common.exceptions import OperationNotDefinedError, UnboundExpressionError
from ibis.formats.numpy import NumpyType

from ibis.formats.pandas import PandasData, PandasType
from ibis.util import any_of, gen_name

@@ -49,6 +50,8 @@ def visit(cls, op: ops.Node, **kwargs):

@classmethod
def visit(cls, op: ops.Literal, value, dtype):
if value is None:
return None

if dtype.is_interval():
value = pd.Timedelta(value, dtype.unit.short)
elif dtype.is_array():
@@ -220,8 +223,9 @@ def visit(cls, op: ops.FindInSet, needle, values):
return pd.Series(result, name=op.name)

@classmethod
def visit(cls, op: ops.Array, exprs):
return cls.rowwise(lambda row: np.array(row, dtype=object), exprs)
def visit(cls, op: ops.Array, exprs, dtype):
np_val_type = NumpyType.from_ibis(dtype.value_type)

return cls.rowwise(lambda row: np.array(row, dtype=np_val_type), exprs)

@classmethod
def visit(cls, op: ops.StructColumn, names, values):
25 changes: 14 additions & 11 deletions ibis/backends/polars/compiler.py
@@ -87,25 +87,25 @@ def literal(op, **_):
value = op.value
dtype = op.dtype

if dtype.is_array():
value = pl.Series("", value)
typ = PolarsType.from_ibis(dtype)
val = pl.lit(value, dtype=typ)
return val.implode()
if dtype.is_interval():
return _make_duration(value, dtype)

typ = PolarsType.from_ibis(dtype)
if value is None:
return pl.lit(None, dtype=typ)
elif dtype.is_array():
return pl.lit(pl.Series("", value).implode(), dtype=typ)
elif dtype.is_struct():
values = [
pl.lit(v, dtype=PolarsType.from_ibis(dtype[k])).alias(k)
for k, v in value.items()
]
return pl.struct(values)
elif dtype.is_interval():
return _make_duration(value, dtype)
elif dtype.is_null():
return pl.lit(value)
elif dtype.is_binary():
return pl.lit(value)
else:
typ = PolarsType.from_ibis(dtype)
return pl.lit(op.value, dtype=typ)


@@ -974,9 +974,12 @@ def array_concat(op, **kw):


@translate.register(ops.Array)
def array_column(op, **kw):
cols = [translate(col, **kw) for col in op.exprs]
return pl.concat_list(cols)
def array_literal(op, **kw):
pdt = PolarsType.from_ibis(op.dtype)
if op.exprs:
return pl.concat_list([translate(col, **kw) for col in op.exprs]).cast(pdt)
else:
return pl.lit([], dtype=pdt)


@translate.register(ops.ArrayCollect)
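For context on the empty-array branch in `array_literal` above: an empty list literal has no elements to infer a type from, so Polars needs the dtype supplied up front. A rough standalone sketch, with illustrative values that are not part of this diff:

```python
import polars as pl

df = pl.DataFrame({"x": [1.0, 2.0]})

# Non-empty arrays can be assembled with concat_list and cast afterwards,
# mirroring the `op.exprs` branch above.
nonempty = pl.concat_list([pl.lit(1), pl.lit(2)]).cast(pl.List(pl.Int64))

# An empty literal must carry its dtype explicitly, mirroring
# `pl.lit([], dtype=pdt)` above.
empty = pl.lit([], dtype=pl.List(pl.Int64))

df.select(nonempty.alias("a"), empty.alias("b"))
```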
4 changes: 2 additions & 2 deletions ibis/backends/sql/compiler.py
@@ -1019,8 +1019,8 @@ def visit_InSubquery(self, op, *, rel, needle):
query = sg.select(STAR).from_(query)
return needle.isin(query=query)

def visit_Array(self, op, *, exprs):
return self.f.array(*exprs)
def visit_Array(self, op, *, exprs, dtype):
return self.cast(self.f.array(*exprs), dtype)

def visit_StructColumn(self, op, *, names, values):
return sge.Struct.from_arg_list(
@@ -26,7 +26,7 @@ WITH "t5" AS (
SELECT
"t0"."field_of_study",
arrayJoin(
[
CAST([
CAST(tuple('1970-71', "t0"."1970-71") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('1975-76', "t0"."1975-76") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('1980-81', "t0"."1980-81") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
@@ -45,7 +45,7 @@
CAST(tuple('2017-18', "t0"."2017-18") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('2018-19', "t0"."2018-19") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64))),
CAST(tuple('2019-20', "t0"."2019-20") AS Tuple("years" Nullable(String), "degrees" Nullable(Int64)))
]
] AS Array(Tuple("years" Nullable(String), "degrees" Nullable(Int64))))
) AS "__pivoted__"
FROM "humanities" AS "t0"
) AS "t1"
@@ -26,7 +26,7 @@ WITH "t5" AS (
SELECT
"t0"."field_of_study",
UNNEST(
[
CAST([
{'years': '1970-71', 'degrees': "t0"."1970-71"},
{'years': '1975-76', 'degrees': "t0"."1975-76"},
{'years': '1980-81', 'degrees': "t0"."1980-81"},
@@ -45,7 +45,7 @@
{'years': '2017-18', 'degrees': "t0"."2017-18"},
{'years': '2018-19', 'degrees': "t0"."2018-19"},
{'years': '2019-20', 'degrees': "t0"."2019-20"}
]
] AS STRUCT("years" TEXT, "degrees" BIGINT)[])
) AS "__pivoted__"
FROM "humanities" AS "t0"
) AS "t1"
85 changes: 80 additions & 5 deletions ibis/backends/tests/test_array.py
@@ -31,6 +31,7 @@
PySparkAnalysisException,
TrinoUserError,
)
from ibis.common.annotations import ValidationError
from ibis.common.collections import frozendict

pytestmark = [
@@ -72,6 +73,85 @@
# list.


def test_array_factory(con):
a = ibis.array([1, 2, 3])
assert a.type() == dt.Array(value_type=dt.Int8)
assert con.execute(a) == [1, 2, 3]

a2 = ibis.array(a)
assert a.type() == dt.Array(value_type=dt.Int8)
assert con.execute(a2) == [1, 2, 3]


@pytest.mark.broken(
["pandas"],
raises=AssertionError,
reason="results in [1, 2, 3]",
)
def test_array_factory_typed(con):
typed = ibis.array([1, 2, 3], type="array<string>")
assert con.execute(typed) == ["1", "2", "3"]

typed2 = ibis.array(ibis.array([1, 2, 3]), type="array<string>")
assert con.execute(typed2) == ["1", "2", "3"]


@pytest.mark.notimpl("flink", raises=Py4JJavaError)
@pytest.mark.notimpl(["pandas", "dask"], raises=ValueError)
def test_array_factory_empty(con):
with pytest.raises(ValidationError):
ibis.array([])

empty_typed = ibis.array([], type="array<string>")
assert empty_typed.type() == dt.Array(value_type=dt.string)
assert con.execute(empty_typed) == []


@pytest.mark.notyet(
"clickhouse", raises=ClickHouseDatabaseError, reason="nested types can't be NULL"
)
@pytest.mark.notyet(
"flink", raises=Py4JJavaError, reason="Parameters must be of the same type"
)
def test_array_factory_null(con):
with pytest.raises(ValidationError):
ibis.array(None)
with pytest.raises(ValidationError):
ibis.array(None, type="int64")
none_typed = ibis.array(None, type="array<string>")
assert none_typed.type() == dt.Array(value_type=dt.string)
assert con.execute(none_typed) is None

nones = ibis.array([None, None], type="array<string>")
assert nones.type() == dt.Array(value_type=dt.string)
assert con.execute(nones) == [None, None]

# Execute a real value here, so the backends that don't support arrays
# actually xfail as we expect them to.
# Otherwise would have to @mark.xfail every test in this file besides this one.
assert con.execute(ibis.array([1, 2])) == [1, 2]


@pytest.mark.broken(
["datafusion", "flink", "polars"],
raises=AssertionError,
reason="[None, 1] executes to [np.nan, 1.0]",
)
@pytest.mark.broken(
["pandas"],
raises=AssertionError,
reason="even with explicit cast, results in [None, 1]",
)
def test_array_factory_null_mixed(con):
none_and_val = ibis.array([None, 1])
assert none_and_val.type() == dt.Array(value_type=dt.Int8)
assert con.execute(none_and_val) == [None, 1]

none_and_val_typed = ibis.array([None, 1], type="array<string>")
assert none_and_val_typed.type() == dt.Array(value_type=dt.String)
assert con.execute(none_and_val_typed) == [None, "1"]


def test_array_column(backend, alltypes, df):
expr = ibis.array(
[alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)]
@@ -1354,11 +1434,6 @@ def test_unnest_range(con):
id="array",
marks=[
pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest),
pytest.mark.broken(
["polars"],
reason="expression input not supported with nested arrays",
raises=TypeError,
),
],
),
],
6 changes: 5 additions & 1 deletion ibis/backends/tests/test_sql.py
@@ -31,7 +31,11 @@
marks=[
pytest.mark.never(
["impala", "mysql", "sqlite", "mssql", "exasol"],
raises=(NotImplementedError, exc.UnsupportedBackendType),
raises=(
exc.OperationNotDefinedError,
NotImplementedError,
exc.UnsupportedBackendType,
),
reason="structs not supported in the backend",
),
pytest.mark.notimpl(
13 changes: 7 additions & 6 deletions ibis/expr/operations/arrays.py
@@ -19,14 +19,15 @@ class Array(Value):
"""Construct an array."""

exprs: VarTuple[Value]
dtype: Optional[dt.Array] = None

@attribute
def shape(self):
return rlz.highest_precedence_shape(self.exprs)
shape = rlz.shape_like("exprs")

@attribute
def dtype(self):
return dt.Array(rlz.highest_precedence_dtype(self.exprs))
def __init__(self, exprs, dtype: dt.Array | None = None):
# If len(exprs) == 0, the caller is responsible for providing a dtype
if dtype is None:
dtype = dt.Array(rlz.highest_precedence_dtype(exprs))
super().__init__(exprs=exprs, dtype=dtype)


@public
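To illustrate the new optional `dtype` on the `Array` op above, a minimal sketch with hypothetical values (`ibis.array()` remains the intended entry point):

```python
import ibis
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops

# With no dtype given, it is derived from the elements, as before.
inferred = ops.Array((ibis.literal(1).op(), ibis.literal(2).op()))
assert inferred.dtype == dt.Array(dt.int8)

# With an explicit dtype, a zero-element array becomes representable;
# the caller (e.g. ibis.array) is responsible for supplying it.
empty = ops.Array((), dtype=dt.Array(dt.string))
assert empty.dtype == dt.Array(dt.string)
```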
4 changes: 4 additions & 0 deletions ibis/expr/rules.py
@@ -5,6 +5,7 @@

from public import public

import ibis.expr.datashape as ds
import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
from ibis import util
@@ -16,6 +17,9 @@

@public
def highest_precedence_shape(nodes):
nodes = tuple(nodes)
if len(nodes) == 0:
return ds.scalar
return max(node.shape for node in nodes)


@@ -1,2 +1,2 @@
DummyTable
foo: Array([1])
foo: Array(exprs=[1], dtype=array<int8>)