From 8a7a29a85c754e4193f7151a3070827960cf2f5b Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 08:33:28 -0500 Subject: [PATCH 01/11] feat(api): support literal expressions in array constructor --- ibis/backends/bigquery/registry.py | 4 +-- ibis/backends/clickhouse/compiler/values.py | 6 ++-- ibis/backends/dask/execution/arrays.py | 2 +- ibis/backends/datafusion/compiler/values.py | 6 ++-- ibis/backends/duckdb/registry.py | 4 +-- ibis/backends/pandas/execution/arrays.py | 2 +- ibis/backends/polars/compiler.py | 4 +-- ibis/backends/postgres/registry.py | 2 +- ibis/backends/pyspark/compiler.py | 4 +-- ibis/backends/snowflake/registry.py | 4 +-- ibis/backends/tests/test_array.py | 38 +++++++++++++++++++++ ibis/backends/tests/test_map.py | 2 +- ibis/backends/trino/registry.py | 4 +-- ibis/expr/operations/arrays.py | 10 +++--- ibis/expr/types/arrays.py | 19 ++--------- 15 files changed, 67 insertions(+), 44 deletions(-) diff --git a/ibis/backends/bigquery/registry.py b/ibis/backends/bigquery/registry.py index 7911f9b87555..7afc889d9ac8 100644 --- a/ibis/backends/bigquery/registry.py +++ b/ibis/backends/bigquery/registry.py @@ -129,7 +129,7 @@ def _array_concat(translator, op): def _array_column(translator, op): - return "[{}]".format(", ".join(map(translator.translate, op.cols))) + return "[{}]".format(", ".join(map(translator.translate, op.exprs))) def _array_index(translator, op): @@ -912,7 +912,7 @@ def _timestamp_range(translator, op): ops.StructColumn: _struct_column, ops.ArrayCollect: _array_agg, ops.ArrayConcat: _array_concat, - ops.ArrayColumn: _array_column, + ops.Array: _array_column, ops.ArrayIndex: _array_index, ops.ArrayLength: unary("ARRAY_LENGTH"), ops.ArrayRepeat: _array_repeat, diff --git a/ibis/backends/clickhouse/compiler/values.py b/ibis/backends/clickhouse/compiler/values.py index 3a3b479dbfb1..8486a6421a09 100644 --- a/ibis/backends/clickhouse/compiler/values.py +++ b/ibis/backends/clickhouse/compiler/values.py @@ -554,9 +554,9 @@ def _translate(op, *, arg, where, **_): return _translate -@translate_val.register(ops.ArrayColumn) -def _array_column(op, *, cols, **_): - return F.array(*cols) +@translate_val.register(ops.Array) +def _array_column(op, *, exprs, **_): + return F.array(*exprs) @translate_val.register(ops.StructColumn) diff --git a/ibis/backends/dask/execution/arrays.py b/ibis/backends/dask/execution/arrays.py index 85e94fbde13e..c94b82807152 100644 --- a/ibis/backends/dask/execution/arrays.py +++ b/ibis/backends/dask/execution/arrays.py @@ -34,7 +34,7 @@ ) -@execute_node.register(ops.ArrayColumn, tuple) +@execute_node.register(ops.Array, tuple) def execute_array_column(op, cols, **kwargs): cols = [execute(arg, **kwargs) for arg in cols] df = dd.concat(cols, axis=1) diff --git a/ibis/backends/datafusion/compiler/values.py b/ibis/backends/datafusion/compiler/values.py index 33cfe78f353c..60f75d66ac0f 100644 --- a/ibis/backends/datafusion/compiler/values.py +++ b/ibis/backends/datafusion/compiler/values.py @@ -733,9 +733,9 @@ def _not_null(op, *, arg, **_): return sg.not_(arg.is_(NULL)) -@translate_val.register(ops.ArrayColumn) -def array_column(op, *, cols, **_): - return F.make_array(*cols) +@translate_val.register(ops.Array) +def array_column(op, *, exprs, **_): + return F.make_array(*exprs) @translate_val.register(ops.ArrayRepeat) diff --git a/ibis/backends/duckdb/registry.py b/ibis/backends/duckdb/registry.py index df5d423f9682..49b23b978e0d 100644 --- a/ibis/backends/duckdb/registry.py +++ b/ibis/backends/duckdb/registry.py @@ -399,9 +399,9 @@ def _array_remove(t, op): operation_registry.update( { - ops.ArrayColumn: ( + ops.Array: ( lambda t, op: sa.cast( - sa.func.list_value(*map(t.translate, op.cols)), + sa.func.list_value(*map(t.translate, op.exprs)), t.get_sqla_type(op.dtype), ) ), diff --git a/ibis/backends/pandas/execution/arrays.py b/ibis/backends/pandas/execution/arrays.py index 8c5119610898..11881d23e6e7 100644 --- a/ibis/backends/pandas/execution/arrays.py +++ b/ibis/backends/pandas/execution/arrays.py @@ -17,7 +17,7 @@ from collections.abc import Collection -@execute_node.register(ops.ArrayColumn, tuple) +@execute_node.register(ops.Array, tuple) def execute_array_column(op, cols, **kwargs): cols = [execute(arg, **kwargs) for arg in cols] df = pd.concat(cols, axis=1) diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index 3d2168e613b5..a9927cd6ad1a 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -888,9 +888,9 @@ def array_concat(op, **kw): return result -@translate.register(ops.ArrayColumn) +@translate.register(ops.Array) def array_column(op, **kw): - cols = [translate(col, **kw) for col in op.cols] + cols = [translate(col, **kw) for col in op.exprs] return pl.concat_list(cols) diff --git a/ibis/backends/postgres/registry.py b/ibis/backends/postgres/registry.py index 961fb61741c5..1cc3e028a47e 100644 --- a/ibis/backends/postgres/registry.py +++ b/ibis/backends/postgres/registry.py @@ -750,7 +750,7 @@ def _range(t, op): # array operations ops.ArrayLength: unary(sa.func.cardinality), ops.ArrayCollect: reduction(sa.func.array_agg), - ops.ArrayColumn: (lambda t, op: pg.array(list(map(t.translate, op.cols)))), + ops.Array: (lambda t, op: pg.array(list(map(t.translate, op.exprs)))), ops.ArraySlice: _array_slice( index_converter=_neg_idx_to_pos, array_length=sa.func.cardinality, diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py index 73387c99db06..47003a437fb4 100644 --- a/ibis/backends/pyspark/compiler.py +++ b/ibis/backends/pyspark/compiler.py @@ -1634,9 +1634,9 @@ def compile_interval_from_integer(t, op, **kwargs): # -------------------------- Array Operations ---------------------------- -@compiles(ops.ArrayColumn) +@compiles(ops.Array) def compile_array_column(t, op, **kwargs): - cols = [t.translate(col, **kwargs) for col in op.cols] + cols = [t.translate(col, **kwargs) for col in op.exprs] return F.array(cols) diff --git a/ibis/backends/snowflake/registry.py b/ibis/backends/snowflake/registry.py index 3b8b2926c7b1..7e97330d4481 100644 --- a/ibis/backends/snowflake/registry.py +++ b/ibis/backends/snowflake/registry.py @@ -457,9 +457,7 @@ def _timestamp_range(t, op): ops.ArrayConcat: varargs( lambda *args: functools.reduce(sa.func.array_cat, args) ), - ops.ArrayColumn: lambda t, op: sa.func.array_construct( - *map(t.translate, op.cols) - ), + ops.Array: lambda t, op: sa.func.array_construct(*map(t.translate, op.exprs)), ops.ArraySlice: _array_slice, ops.ArrayCollect: reduction( lambda arg: sa.func.array_agg( diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 8f1e2d39ed00..aaffbd696e8e 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -15,6 +15,7 @@ import ibis import ibis.common.exceptions as com +import ibis.expr.datashape as ds import ibis.expr.datatypes as dt import ibis.expr.types as ir from ibis.backends.tests.errors import ( @@ -1070,3 +1071,40 @@ def test_unnest_range(con): result = con.execute(expr) expected = pd.DataFrame({"x": np.array([0, 1], dtype="int8"), "y": [1.0, 1.0]}) tm.assert_frame_equal(result, expected) + + +@pytest.mark.notyet(["flink"], raises=com.OperationNotDefinedError) +@pytest.mark.broken( + ["pandas"], reason="expression input not supported", raises=TypeError +) +@pytest.mark.broken( + ["dask"], reason="expression input not supported", raises=AttributeError +) +@pytest.mark.parametrize( + ("input", "expected"), + [ + param([1, ibis.literal(2)], [1, 2], id="int-int"), + param([1.0, ibis.literal(2)], [1.0, 2.0], id="float-int"), + param([1.0, ibis.literal(2.0)], [1.0, 2.0], id="float-float"), + param([1, ibis.literal(2.0)], [1.0, 2.0], id="int-float"), + param([ibis.literal(1), ibis.literal(2.0)], [1.0, 2.0], id="int-float-exprs"), + param( + [[1], ibis.literal([2])], + [[1], [2]], + id="array", + marks=[ + pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest), + pytest.mark.broken( + ["polars"], + reason="expression input not supported with nested arrays", + raises=TypeError, + ), + ], + ), + ], +) +def test_array_literal_with_exprs(con, input, expected): + expr = ibis.array(input) + assert expr.op().shape == ds.scalar + result = list(con.execute(expr)) + assert result == expected diff --git a/ibis/backends/tests/test_map.py b/ibis/backends/tests/test_map.py index 19ec3e71fa21..8e9e97e2528d 100644 --- a/ibis/backends/tests/test_map.py +++ b/ibis/backends/tests/test_map.py @@ -236,7 +236,7 @@ def test_map_construct_dict(con, keys, values): @pytest.mark.notimpl( ["flink"], raises=exc.OperationNotDefinedError, - reason="No translation rule for ", + reason="No translation rule for ", ) def test_map_construct_array_column(con, alltypes, df): expr = ibis.map(ibis.array([alltypes.string_col]), ibis.array([alltypes.int_col])) diff --git a/ibis/backends/trino/registry.py b/ibis/backends/trino/registry.py index 74bd99a7f69a..0ba12a71763c 100644 --- a/ibis/backends/trino/registry.py +++ b/ibis/backends/trino/registry.py @@ -117,7 +117,7 @@ def _group_concat(t, op): def _array_column(t, op): args = ", ".join( str(t.translate(arg).compile(compile_kwargs={"literal_binds": True})) - for arg in op.cols + for arg in op.exprs ) return sa.literal_column(f"ARRAY[{args}]", type_=t.get_sqla_type(op.dtype)) @@ -431,7 +431,7 @@ def _range(t, op): ops.ArrayIndex: fixed_arity( lambda arg, index: sa.func.element_at(arg, index + 1), 2 ), - ops.ArrayColumn: _array_column, + ops.Array: _array_column, ops.ArrayRepeat: fixed_arity( lambda arg, times: sa.func.flatten(sa.func.repeat(arg, times)), 2 ), diff --git a/ibis/expr/operations/arrays.py b/ibis/expr/operations/arrays.py index c0e65b36272e..68ee711a2da6 100644 --- a/ibis/expr/operations/arrays.py +++ b/ibis/expr/operations/arrays.py @@ -13,14 +13,16 @@ @public -class ArrayColumn(Value): - cols: VarTuple[Value] +class Array(Value): + exprs: VarTuple[Value] - shape = ds.columnar + @attribute + def shape(self): + return rlz.highest_precedence_shape(self.exprs) @attribute def dtype(self): - return dt.Array(rlz.highest_precedence_dtype(self.cols)) + return dt.Array(rlz.highest_precedence_dtype(self.exprs)) @public diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py index 8258b4278ecf..1b467582f1f5 100644 --- a/ibis/expr/types/arrays.py +++ b/ibis/expr/types/arrays.py @@ -1020,28 +1020,13 @@ def __getitem__(self, index: int | ir.IntegerValue | slice) -> ir.Column: def array(values: Iterable[V], type: str | dt.DataType | None = None) -> ArrayValue: """Create an array expression. - If the input expressions are all column expressions, then the output will - be an `ArrayColumn`. The input columns will be concatenated row-wise to - produce each array in the output array column. Each array will have length - _n_, where _n_ is the number of input columns. All input columns should be - of the same datatype. - - If the input expressions are Python literals, then the output will be a - single `ArrayScalar` of length _n_, where _n_ is the number of input - values. This is equivalent to - - ```python - values = [1, 2, 3] - ibis.literal(values) - ``` - Parameters ---------- values An iterable of Ibis expressions or a list of Python literals type An instance of `ibis.expr.datatypes.DataType` or a string indicating - the ibis type of `value`. + the Ibis type of `value`. Returns ------- @@ -1086,7 +1071,7 @@ def array(values: Iterable[V], type: str | dt.DataType | None = None) -> ArrayVa └──────────────────────┘ """ if any(isinstance(value, Value) for value in values): - return ops.ArrayColumn(values).to_expr() + return ops.Array(values).to_expr() else: try: return literal(list(values), type=type) From 13914b0cc36d65848b7dcfe941b6872a41868332 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 08:38:17 -0500 Subject: [PATCH 02/11] chore: remove junky handling of array inputs --- ibis/backends/trino/registry.py | 2 +- ibis/expr/types/arrays.py | 18 +++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/ibis/backends/trino/registry.py b/ibis/backends/trino/registry.py index 0ba12a71763c..5c5f45658e3d 100644 --- a/ibis/backends/trino/registry.py +++ b/ibis/backends/trino/registry.py @@ -43,7 +43,7 @@ class make_array(FunctionElement): pass -@compiles(make_array, "trino") +@compiles(make_array, "default") def compile_make_array(element, compiler, **kw): return f"ARRAY[{compiler.process(element.clauses, **kw)}]" diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py index 1b467582f1f5..6e3a5d87d137 100644 --- a/ibis/expr/types/arrays.py +++ b/ibis/expr/types/arrays.py @@ -1043,7 +1043,7 @@ def array(values: Iterable[V], type: str | dt.DataType | None = None) -> ArrayVa >>> t = ibis.memtable({"a": [1, 2, 3], "b": [4, 5, 6]}) >>> ibis.array([t.a, t.b]) ┏━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ ArrayColumn() ┃ + ┃ Array() ┃ ┡━━━━━━━━━━━━━━━━━━━━━━┩ │ array │ ├──────────────────────┤ @@ -1061,7 +1061,7 @@ def array(values: Iterable[V], type: str | dt.DataType | None = None) -> ArrayVa >>> ibis.array([t.a, 42]) ┏━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ ArrayColumn() ┃ + ┃ Array() ┃ ┡━━━━━━━━━━━━━━━━━━━━━━┩ │ array │ ├──────────────────────┤ @@ -1070,14 +1070,6 @@ def array(values: Iterable[V], type: str | dt.DataType | None = None) -> ArrayVa │ [3, 42] │ └──────────────────────┘ """ - if any(isinstance(value, Value) for value in values): - return ops.Array(values).to_expr() - else: - try: - return literal(list(values), type=type) - except com.IbisTypeError as e: - raise com.IbisTypeError( - "Could not create an array scalar from the values provided " - "to `array`. Ensure that all input values have the same " - "Python type, or can be casted to a single Python type." - ) from e + if type is None: + return ops.Array(tuple(values)).to_expr() + return literal(list(values), type=type) From ee1fae71388704e79309aec2673da9a549884d0c Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 08:50:31 -0500 Subject: [PATCH 03/11] refactor(arrays): remove `type` argument from `ibis.array` function BREAKING CHANGE: The `type` argument is removed from `ibis.array`. Ibis will attempt to infer the type of your input as it did before. If you need a specific type, cast the return value of `ibis.array` to a specific array type. --- ibis/backends/tests/test_array.py | 2 +- ibis/expr/tests/test_format.py | 2 +- ibis/expr/types/arrays.py | 11 ++--------- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index aaffbd696e8e..6369bcdb266e 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -328,7 +328,7 @@ def test_unnest_default_name(backend): array_types = backend.array_types df = array_types.execute() expr = ( - array_types.x.cast("!array") + ibis.array([1], type="!array") + array_types.x.cast("!array") + ibis.array([1]).cast("!array") ).unnest() assert expr.get_name().startswith("ArrayConcat(") diff --git a/ibis/expr/tests/test_format.py b/ibis/expr/tests/test_format.py index 86de3ecdeaeb..6ee6dbc42514 100644 --- a/ibis/expr/tests/test_format.py +++ b/ibis/expr/tests/test_format.py @@ -382,7 +382,7 @@ def test_format_literal(literal, typ, output): def test_format_dummy_table(snapshot): - t = ops.DummyTable([ibis.array([1], type="array").name("foo")]).to_expr() + t = ops.DummyTable([ibis.array([1]).cast("array").name("foo")]).to_expr() result = fmt(t) assert "DummyTable" in result diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py index 6e3a5d87d137..72185903453e 100644 --- a/ibis/expr/types/arrays.py +++ b/ibis/expr/types/arrays.py @@ -1017,22 +1017,17 @@ def __getitem__(self, index: int | ir.IntegerValue | slice) -> ir.Column: @public @deferrable -def array(values: Iterable[V], type: str | dt.DataType | None = None) -> ArrayValue: +def array(values: Iterable[V]) -> ArrayValue: """Create an array expression. Parameters ---------- values An iterable of Ibis expressions or a list of Python literals - type - An instance of `ibis.expr.datatypes.DataType` or a string indicating - the Ibis type of `value`. Returns ------- ArrayValue - An array column (if the inputs are column expressions), or an array - scalar (if the inputs are Python literals) Examples -------- @@ -1070,6 +1065,4 @@ def array(values: Iterable[V], type: str | dt.DataType | None = None) -> ArrayVa │ [3, 42] │ └──────────────────────┘ """ - if type is None: - return ops.Array(tuple(values)).to_expr() - return literal(list(values), type=type) + return ops.Array(tuple(values)).to_expr() From 866ffb5dd12797e04c7ef37cfc94019f97933c3b Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 08:58:19 -0500 Subject: [PATCH 04/11] test: fix mysql test failures --- ibis/backends/tests/test_array.py | 2 +- ibis/backends/tests/test_sql.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 6369bcdb266e..62692d84acad 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -1059,7 +1059,7 @@ def test_repr_timestamp_array(con, monkeypatch): assert ibis.options.interactive is True assert ibis.options.default_backend is con expr = ibis.array(pd.date_range("2010-01-01", "2010-01-03", freq="D").tolist()) - assert repr(expr) + assert "No translation rule" not in repr(expr) @pytest.mark.notyet( diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 21b91984f478..9727a41252d6 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -18,10 +18,15 @@ ibis.array([1]), marks=[ pytest.mark.never( - ["mysql", "mssql", "oracle"], + ["mssql", "oracle"], raises=sa.exc.CompileError, reason="arrays not supported in the backend", ), + pytest.mark.never( + ["mysql"], + raises=exc.OperationNotDefinedError, + reason="arrays not supported in the backend", + ), pytest.mark.notyet( ["impala", "sqlite"], raises=NotImplementedError, From 7d8058b5c4b9227543ca4ceaee5a2f31c6a04eab Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 09:19:25 -0500 Subject: [PATCH 05/11] test: fix more tests --- ibis/backends/tests/test_array.py | 1 - ibis/backends/tests/test_sql.py | 12 +----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 62692d84acad..36b4998775ca 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -1052,7 +1052,6 @@ def test_timestamp_range_zero_step(con, start, stop, step, tzinfo): @pytest.mark.notimpl(["flink"], raises=Py4JJavaError) -@pytest.mark.notimpl(["datafusion"], raises=Exception) def test_repr_timestamp_array(con, monkeypatch): monkeypatch.setattr(ibis.options, "interactive", True) monkeypatch.setattr(ibis.options, "default_backend", con) diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 9727a41252d6..b635ca8e1d09 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -18,20 +18,10 @@ ibis.array([1]), marks=[ pytest.mark.never( - ["mssql", "oracle"], - raises=sa.exc.CompileError, - reason="arrays not supported in the backend", - ), - pytest.mark.never( - ["mysql"], + ["mysql", "mssql", "oracle", "impala", "sqlite"], raises=exc.OperationNotDefinedError, reason="arrays not supported in the backend", ), - pytest.mark.notyet( - ["impala", "sqlite"], - raises=NotImplementedError, - reason="backends hasn't implemented array literals", - ), ], id="array_literal", ) From 1b6ee07fa692822c5b45ccacf33c0eeb1071d2f3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 09:21:07 -0500 Subject: [PATCH 06/11] test: remove typeof checking --- ibis/backends/tests/test_array.py | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 36b4998775ca..4e5e0c706b78 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -1,6 +1,5 @@ from __future__ import annotations -import contextlib import functools from datetime import datetime @@ -22,7 +21,6 @@ ClickHouseDatabaseError, GoogleBadRequest, PolarsComputeError, - Py4JJavaError, PySparkAnalysisException, ) @@ -54,18 +52,7 @@ def test_array_column(backend, alltypes, df): backend.assert_series_equal(result, expected, check_names=False) -ARRAY_BACKEND_TYPES = { - "clickhouse": "Array(Float64)", - "snowflake": "ARRAY", - "trino": "array(double)", - "bigquery": "ARRAY", - "duckdb": "DOUBLE[]", - "postgres": "numeric[]", - "flink": "ARRAY NOT NULL", -} - - -def test_array_scalar(con, backend): +def test_array_scalar(con): expr = ibis.array([1.0, 2.0, 3.0]) assert isinstance(expr, ir.ArrayScalar) @@ -74,10 +61,6 @@ def test_array_scalar(con, backend): assert np.array_equal(result, expected) - with contextlib.suppress(com.OperationNotDefinedError): - backend_name = backend.name() - assert con.execute(expr.typeof()) == ARRAY_BACKEND_TYPES[backend_name] - @pytest.mark.notimpl(["polars", "flink"], raises=com.OperationNotDefinedError) def test_array_repeat(con): @@ -1051,7 +1034,9 @@ def test_timestamp_range_zero_step(con, start, stop, step, tzinfo): assert list(result) == [] -@pytest.mark.notimpl(["flink"], raises=Py4JJavaError) +@pytest.mark.notimpl( + ["flink"], raises=AssertionError, reason="arrays not yet implemented" +) def test_repr_timestamp_array(con, monkeypatch): monkeypatch.setattr(ibis.options, "interactive", True) monkeypatch.setattr(ibis.options, "default_backend", con) From 688afae2f5a501023b3fc83a6b0e021f7f2d59f6 Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 18 Jan 2024 16:27:04 -0900 Subject: [PATCH 07/11] fix(pandas/dask): support ibis.array() with mix of expression shapes pd.concat and dd.concat can only handle array-likes. If we are given a scalar, we need to convert it to the appropriate Series type. --- ibis/backends/dask/execution/arrays.py | 18 ++++++++++++++++-- ibis/backends/pandas/execution/arrays.py | 16 ++++++++++++++-- ibis/backends/tests/test_array.py | 6 ++++-- 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/ibis/backends/dask/execution/arrays.py b/ibis/backends/dask/execution/arrays.py index c94b82807152..133cd15b5345 100644 --- a/ibis/backends/dask/execution/arrays.py +++ b/ibis/backends/dask/execution/arrays.py @@ -6,6 +6,7 @@ import dask.dataframe as dd import dask.dataframe.groupby as ddgb import numpy as np +import pandas as pd import ibis.expr.operations as ops from ibis.backends.dask.core import execute @@ -36,8 +37,21 @@ @execute_node.register(ops.Array, tuple) def execute_array_column(op, cols, **kwargs): - cols = [execute(arg, **kwargs) for arg in cols] - df = dd.concat(cols, axis=1) + vals = [execute(arg, **kwargs) for arg in cols] + # At least one of the values will be a Series. + # Otherwise op would be an ArrayScalar, not an ArrayColumn. + length = next(len(v) for v in vals if isinstance(v, dd.Series)) + n_partitions = next(v.npartitions for v in vals if isinstance(v, dd.Series)) + + def ensure_series(v): + if isinstance(v, dd.Series): + return v + else: + return dd.from_pandas(pd.Series([v] * length), npartitions=n_partitions) + + # dd.concat() can only handle array-likes. + # If we're given a scalar, we need to broadcast it as a Series. + df = dd.concat([ensure_series(v) for v in vals], axis=1) return df.apply( lambda row: np.array(row, dtype=object), axis=1, meta=(None, "object") ) diff --git a/ibis/backends/pandas/execution/arrays.py b/ibis/backends/pandas/execution/arrays.py index 11881d23e6e7..900318c5564b 100644 --- a/ibis/backends/pandas/execution/arrays.py +++ b/ibis/backends/pandas/execution/arrays.py @@ -19,8 +19,20 @@ @execute_node.register(ops.Array, tuple) def execute_array_column(op, cols, **kwargs): - cols = [execute(arg, **kwargs) for arg in cols] - df = pd.concat(cols, axis=1) + vals = [execute(arg, **kwargs) for arg in cols] + # At least one of the values will be a Series. + # Otherwise op would be an ArrayScalar, not an ArrayColumn. + length = next(len(v) for v in vals if isinstance(v, pd.Series)) + + def ensure_series(v): + if isinstance(v, pd.Series): + return v + else: + return pd.Series(v, index=range(length)) + + # pd.concat() can only handle array-likes. + # If we're given a scalar, we need to broadcast it as a Series. + df = pd.concat([ensure_series(v) for v in vals], axis=1) return df.apply(lambda row: np.array(row, dtype=object), axis=1) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 4e5e0c706b78..778fef4acc72 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -41,12 +41,14 @@ @pytest.mark.notimpl(["flink"], raises=com.OperationNotDefinedError) def test_array_column(backend, alltypes, df): - expr = ibis.array([alltypes["double_col"], alltypes["double_col"]]) + expr = ibis.array( + [alltypes["double_col"], alltypes["double_col"], 5.0, ibis.literal(6.0)] + ) assert isinstance(expr, ir.ArrayColumn) result = expr.execute() expected = df.apply( - lambda row: [row["double_col"], row["double_col"]], + lambda row: [row["double_col"], row["double_col"], 5.0, 6.0], axis=1, ) backend.assert_series_equal(result, expected, check_names=False) From f6aa097606f476fad195866651c11faff27e895b Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Thu, 18 Jan 2024 14:36:16 -0900 Subject: [PATCH 08/11] docs: improve docstring for ibis.array() --- ibis/expr/types/arrays.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py index 72185903453e..7cc58afe2ccf 100644 --- a/ibis/expr/types/arrays.py +++ b/ibis/expr/types/arrays.py @@ -1031,38 +1031,36 @@ def array(values: Iterable[V]) -> ArrayValue: Examples -------- - Create an array column from column expressions + Create an array from scalar values >>> import ibis >>> ibis.options.interactive = True + >>> ibis.array([1.0, None]) + [1.0, None] + + Create an array from column and scalar expressions + >>> t = ibis.memtable({"a": [1, 2, 3], "b": [4, 5, 6]}) - >>> ibis.array([t.a, t.b]) + >>> ibis.array([t.a, 42, ibis.literal(None)]) ┏━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Array() ┃ ┡━━━━━━━━━━━━━━━━━━━━━━┩ │ array │ ├──────────────────────┤ - │ [1, 4] │ - │ [2, 5] │ - │ [3, 6] │ + │ [1, 42, ... +1] │ + │ [2, 42, ... +1] │ + │ [3, 42, ... +1] │ └──────────────────────┘ - Create an array scalar from Python literals - - >>> ibis.array([1.0, 2.0, 3.0]) - [1.0, 2.0, ... +1] - - Mixing scalar and column expressions is allowed - - >>> ibis.array([t.a, 42]) + >>> ibis.array([t.a, 42 + ibis.literal(5)]) ┏━━━━━━━━━━━━━━━━━━━━━━┓ ┃ Array() ┃ ┡━━━━━━━━━━━━━━━━━━━━━━┩ │ array │ ├──────────────────────┤ - │ [1, 42] │ - │ [2, 42] │ - │ [3, 42] │ + │ [1, 47] │ + │ [2, 47] │ + │ [3, 47] │ └──────────────────────┘ """ return ops.Array(tuple(values)).to_expr() From bf8efef3c520195d60ac98a680c2e486eedf81ea Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 09:42:48 -0500 Subject: [PATCH 09/11] feat(pandas): enable more array operations on scalar lists --- ibis/backends/pandas/execution/arrays.py | 26 ++++++++++++----------- ibis/backends/pandas/execution/generic.py | 22 +++++++++++++++++++ ibis/backends/tests/test_array.py | 3 --- ibis/backends/tests/test_param.py | 4 +--- 4 files changed, 37 insertions(+), 18 deletions(-) diff --git a/ibis/backends/pandas/execution/arrays.py b/ibis/backends/pandas/execution/arrays.py index 900318c5564b..b1adf4d26e54 100644 --- a/ibis/backends/pandas/execution/arrays.py +++ b/ibis/backends/pandas/execution/arrays.py @@ -18,11 +18,9 @@ @execute_node.register(ops.Array, tuple) -def execute_array_column(op, cols, **kwargs): +def execute_array(op, cols, **kwargs): vals = [execute(arg, **kwargs) for arg in cols] - # At least one of the values will be a Series. - # Otherwise op would be an ArrayScalar, not an ArrayColumn. - length = next(len(v) for v in vals if isinstance(v, pd.Series)) + length = next((len(v) for v in vals if isinstance(v, pd.Series)), None) def ensure_series(v): if isinstance(v, pd.Series): @@ -30,6 +28,8 @@ def ensure_series(v): else: return pd.Series(v, index=range(length)) + if length is None: + return vals # pd.concat() can only handle array-likes. # If we're given a scalar, we need to broadcast it as a Series. df = pd.concat([ensure_series(v) for v in vals], axis=1) @@ -41,7 +41,7 @@ def execute_array_length(op, data, **kwargs): return data.apply(len) -@execute_node.register(ops.ArrayLength, np.ndarray) +@execute_node.register(ops.ArrayLength, (list, np.ndarray)) def execute_array_length_scalar(op, data, **kwargs): return len(data) @@ -51,7 +51,7 @@ def execute_array_slice(op, data, start, stop, **kwargs): return data.apply(operator.itemgetter(slice(start, stop))) -@execute_node.register(ops.ArraySlice, np.ndarray, int, (int, type(None))) +@execute_node.register(ops.ArraySlice, (list, np.ndarray), int, (int, type(None))) def execute_array_slice_scalar(op, data, start, stop, **kwargs): return data[start:stop] @@ -65,7 +65,7 @@ def execute_array_index(op, data, index, **kwargs): ) -@execute_node.register(ops.ArrayIndex, np.ndarray, int) +@execute_node.register(ops.ArrayIndex, (list, np.ndarray), int) def execute_array_index_scalar(op, data, index, **kwargs): try: return data[index] @@ -73,7 +73,7 @@ def execute_array_index_scalar(op, data, index, **kwargs): return None -@execute_node.register(ops.ArrayContains, np.ndarray, object) +@execute_node.register(ops.ArrayContains, (list, np.ndarray), object) def execute_node_contains_value_array(op, haystack, needle, **kwargs): return needle in haystack @@ -103,7 +103,7 @@ def execute_array_concat_series(op, first, second, *args, **kwargs): @execute_node.register( - ops.ArrayConcat, np.ndarray, pd.Series, [(pd.Series, np.ndarray)] + ops.ArrayConcat, (list, np.ndarray), pd.Series, [(pd.Series, list, np.ndarray)] ) def execute_array_concat_mixed_left(op, left, right, *args, **kwargs): # ArrayConcat given a column (pd.Series) and a scalar (np.ndarray). @@ -114,7 +114,7 @@ def execute_array_concat_mixed_left(op, left, right, *args, **kwargs): @execute_node.register( - ops.ArrayConcat, pd.Series, np.ndarray, [(pd.Series, np.ndarray)] + ops.ArrayConcat, pd.Series, (list, np.ndarray), [(pd.Series, list, np.ndarray)] ) def execute_array_concat_mixed_right(op, left, right, *args, **kwargs): # Broadcast `right` to the length of `left` @@ -122,7 +122,9 @@ def execute_array_concat_mixed_right(op, left, right, *args, **kwargs): return _concat_iterables_to_series(left, right) -@execute_node.register(ops.ArrayConcat, np.ndarray, np.ndarray, [np.ndarray]) +@execute_node.register( + ops.ArrayConcat, (list, np.ndarray), (list, np.ndarray), [(list, np.ndarray)] +) def execute_array_concat_scalar(op, left, right, *args, **kwargs): return np.concatenate([left, right, *args]) @@ -134,7 +136,7 @@ def execute_array_repeat(op, data, n, **kwargs): return pd.Series(np.tile(arr, n) for arr in data) -@execute_node.register(ops.ArrayRepeat, np.ndarray, int) +@execute_node.register(ops.ArrayRepeat, (list, np.ndarray), int) def execute_array_repeat_scalar(op, data, n, **kwargs): # Negative n will be treated as 0 (repeat will produce empty array) return np.tile(data, max(n, 0)) diff --git a/ibis/backends/pandas/execution/generic.py b/ibis/backends/pandas/execution/generic.py index 2e30dd33c6b4..7c8b53cc2f79 100644 --- a/ibis/backends/pandas/execution/generic.py +++ b/ibis/backends/pandas/execution/generic.py @@ -145,6 +145,28 @@ def cast_to_array(array, numpy_type=numpy_type): return data.map(cast_to_array) +@execute_node.register(ops.Cast, list, dt.Array) +def execute_cast_list_array(op, data, type, **kwargs): + value_type = type.value_type + numpy_type = constants.IBIS_TYPE_TO_PANDAS_TYPE.get(value_type, None) + if numpy_type is None: + raise ValueError( + "Array value type must be a primitive type " + "(e.g., number, string, or timestamp)" + ) + + def cast_to_array(array, numpy_type=numpy_type): + elems = [ + el if el is None else np.array(el, dtype=numpy_type).item() for el in array + ] + try: + return np.array(elems, dtype=numpy_type) + except TypeError: + return np.array(elems) + + return cast_to_array(data) + + @execute_node.register(ops.Cast, pd.Series, dt.Timestamp) def execute_cast_series_timestamp(op, data, type, **kwargs): arg = op.arg diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 778fef4acc72..992b24f28e81 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -1060,9 +1060,6 @@ def test_unnest_range(con): @pytest.mark.notyet(["flink"], raises=com.OperationNotDefinedError) -@pytest.mark.broken( - ["pandas"], reason="expression input not supported", raises=TypeError -) @pytest.mark.broken( ["dask"], reason="expression input not supported", raises=AttributeError ) diff --git a/ibis/backends/tests/test_param.py b/ibis/backends/tests/test_param.py index dc7a898be44e..8fc7ac518ae0 100644 --- a/ibis/backends/tests/test_param.py +++ b/ibis/backends/tests/test_param.py @@ -60,9 +60,7 @@ def test_timestamp_accepts_date_literals(alltypes): assert expr.compile(params=params) is not None -@pytest.mark.notimpl( - ["dask", "impala", "pandas", "pyspark", "druid", "oracle", "exasol"] -) +@pytest.mark.notimpl(["dask", "impala", "pyspark", "druid", "oracle", "exasol"]) @pytest.mark.never( ["mysql", "sqlite", "mssql"], reason="backend will never implement array types" ) From 083a346a77babe733009c2f488fe59b5ef5a7f24 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sat, 20 Jan 2024 09:45:06 -0500 Subject: [PATCH 10/11] feat(dask): enable more array operations on scalar lists --- ibis/backends/dask/execution/arrays.py | 8 ++++---- ibis/backends/tests/test_array.py | 4 +--- ibis/backends/tests/test_param.py | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/ibis/backends/dask/execution/arrays.py b/ibis/backends/dask/execution/arrays.py index 133cd15b5345..495b74065d24 100644 --- a/ibis/backends/dask/execution/arrays.py +++ b/ibis/backends/dask/execution/arrays.py @@ -38,10 +38,8 @@ @execute_node.register(ops.Array, tuple) def execute_array_column(op, cols, **kwargs): vals = [execute(arg, **kwargs) for arg in cols] - # At least one of the values will be a Series. - # Otherwise op would be an ArrayScalar, not an ArrayColumn. - length = next(len(v) for v in vals if isinstance(v, dd.Series)) - n_partitions = next(v.npartitions for v in vals if isinstance(v, dd.Series)) + length = next((len(v) for v in vals if isinstance(v, dd.Series)), None) + n_partitions = next((v.npartitions for v in vals if isinstance(v, dd.Series)), None) def ensure_series(v): if isinstance(v, dd.Series): @@ -49,6 +47,8 @@ def ensure_series(v): else: return dd.from_pandas(pd.Series([v] * length), npartitions=n_partitions) + if length is None: + return vals # dd.concat() can only handle array-likes. # If we're given a scalar, we need to broadcast it as a Series. df = dd.concat([ensure_series(v) for v in vals], axis=1) diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 992b24f28e81..985d0911371b 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -54,6 +54,7 @@ def test_array_column(backend, alltypes, df): backend.assert_series_equal(result, expected, check_names=False) +@pytest.mark.notimpl(["flink"], raises=com.OperationNotDefinedError) def test_array_scalar(con): expr = ibis.array([1.0, 2.0, 3.0]) assert isinstance(expr, ir.ArrayScalar) @@ -1060,9 +1061,6 @@ def test_unnest_range(con): @pytest.mark.notyet(["flink"], raises=com.OperationNotDefinedError) -@pytest.mark.broken( - ["dask"], reason="expression input not supported", raises=AttributeError -) @pytest.mark.parametrize( ("input", "expected"), [ diff --git a/ibis/backends/tests/test_param.py b/ibis/backends/tests/test_param.py index 8fc7ac518ae0..83b1f11d57e4 100644 --- a/ibis/backends/tests/test_param.py +++ b/ibis/backends/tests/test_param.py @@ -60,7 +60,7 @@ def test_timestamp_accepts_date_literals(alltypes): assert expr.compile(params=params) is not None -@pytest.mark.notimpl(["dask", "impala", "pyspark", "druid", "oracle", "exasol"]) +@pytest.mark.notimpl(["impala", "pyspark", "druid", "oracle", "exasol"]) @pytest.mark.never( ["mysql", "sqlite", "mssql"], reason="backend will never implement array types" ) From b9f9bbed57b062457c5452041d44b07ce5f303b7 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 21 Jan 2024 06:26:53 -0500 Subject: [PATCH 11/11] chore: move length check up --- ibis/backends/dask/execution/arrays.py | 6 ++++-- ibis/backends/pandas/execution/arrays.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/ibis/backends/dask/execution/arrays.py b/ibis/backends/dask/execution/arrays.py index 495b74065d24..ef3dd7889298 100644 --- a/ibis/backends/dask/execution/arrays.py +++ b/ibis/backends/dask/execution/arrays.py @@ -38,7 +38,11 @@ @execute_node.register(ops.Array, tuple) def execute_array_column(op, cols, **kwargs): vals = [execute(arg, **kwargs) for arg in cols] + length = next((len(v) for v in vals if isinstance(v, dd.Series)), None) + if length is None: + return vals + n_partitions = next((v.npartitions for v in vals if isinstance(v, dd.Series)), None) def ensure_series(v): @@ -47,8 +51,6 @@ def ensure_series(v): else: return dd.from_pandas(pd.Series([v] * length), npartitions=n_partitions) - if length is None: - return vals # dd.concat() can only handle array-likes. # If we're given a scalar, we need to broadcast it as a Series. df = dd.concat([ensure_series(v) for v in vals], axis=1) diff --git a/ibis/backends/pandas/execution/arrays.py b/ibis/backends/pandas/execution/arrays.py index b1adf4d26e54..20461f022241 100644 --- a/ibis/backends/pandas/execution/arrays.py +++ b/ibis/backends/pandas/execution/arrays.py @@ -22,14 +22,15 @@ def execute_array(op, cols, **kwargs): vals = [execute(arg, **kwargs) for arg in cols] length = next((len(v) for v in vals if isinstance(v, pd.Series)), None) + if length is None: + return vals + def ensure_series(v): if isinstance(v, pd.Series): return v else: return pd.Series(v, index=range(length)) - if length is None: - return vals # pd.concat() can only handle array-likes. # If we're given a scalar, we need to broadcast it as a Series. df = pd.concat([ensure_series(v) for v in vals], axis=1)