From 3352a84ce4e28a5186ef0d1a1ec8c21f49da7209 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Mon, 1 Jul 2024 10:22:26 -0400 Subject: [PATCH] feat(api): add `TableUnnest` operation to support cross-join unnest semantics as well as `offset` (#9423) --- ibis/backends/bigquery/compiler.py | 44 ++++++ ibis/backends/clickhouse/compiler.py | 57 ++++++++ ibis/backends/duckdb/compiler.py | 53 +++++++ ibis/backends/postgres/compiler.py | 57 ++++++++ ibis/backends/pyspark/compiler.py | 65 +++++++++ ibis/backends/snowflake/compiler.py | 67 +++++++++ ibis/backends/snowflake/tests/test_client.py | 9 ++ ibis/backends/tests/test_array.py | 99 ++++++++++++- ibis/backends/trino/compiler.py | 57 ++++++++ ibis/expr/operations/relations.py | 23 +++ ibis/expr/types/arrays.py | 20 ++- ibis/expr/types/relations.py | 143 +++++++++++++++++++ 12 files changed, 686 insertions(+), 8 deletions(-) diff --git a/ibis/backends/bigquery/compiler.py b/ibis/backends/bigquery/compiler.py index dd5aadfc1dad..7169f258b34e 100644 --- a/ibis/backends/bigquery/compiler.py +++ b/ibis/backends/bigquery/compiler.py @@ -690,3 +690,47 @@ def visit_DropColumns(self, op, *, parent, columns_to_drop): table = sg.to_identifier(parent.alias_or_name, quoted=quoted) column = sge.Column(this=star, table=table) return sg.select(column).from_(parent) + + def visit_TableUnnest( + self, op, *, parent, column, offset: str | None, keep_empty: bool + ): + quoted = self.quoted + + column_alias = sg.to_identifier( + util.gen_name("table_unnest_column"), quoted=quoted + ) + + selcols = [] + + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + + opname = op.column.name + overlaps_with_parent = opname in op.parent.schema + computed_column = column_alias.as_(opname, quoted=quoted) + + # replace the existing column if the unnested column hasn't been + # renamed + # + # e.g., table.unnest("x") + if overlaps_with_parent: + selcols.append( + 
sge.Column(this=sge.Star(replace=[computed_column]), table=table) + ) + else: + selcols.append(sge.Column(this=STAR, table=table)) + selcols.append(computed_column) + + if offset is not None: + offset = sg.to_identifier(offset, quoted=quoted) + selcols.append(offset) + + unnest = sge.Unnest( + expressions=[column], + alias=sge.TableAlias(columns=[column_alias]), + offset=offset, + ) + return ( + sg.select(*selcols) + .from_(parent) + .join(unnest, join_type="CROSS" if not keep_empty else "LEFT") + ) diff --git a/ibis/backends/clickhouse/compiler.py b/ibis/backends/clickhouse/compiler.py index c4283a6c3500..7ad80aea2eaa 100644 --- a/ibis/backends/clickhouse/compiler.py +++ b/ibis/backends/clickhouse/compiler.py @@ -648,3 +648,60 @@ def visit_DropColumns(self, op, *, parent, columns_to_drop): table = sg.to_identifier(parent.alias_or_name, quoted=quoted) column = sge.Column(this=star, table=table) return sg.select(column).from_(parent) + + def visit_TableUnnest( + self, op, *, parent, column, offset: str | None, keep_empty: bool + ): + quoted = self.quoted + + column_alias = sg.to_identifier( + util.gen_name("table_unnest_column"), quoted=quoted + ) + + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + + selcols = [] + + opname = op.column.name + overlaps_with_parent = opname in op.parent.schema + computed_column = column_alias.as_(opname, quoted=quoted) + + if offset is not None: + if overlaps_with_parent: + selcols.append( + sge.Column(this=sge.Star(replace=[computed_column]), table=table) + ) + else: + selcols.append(sge.Column(this=STAR, table=table)) + selcols.append(computed_column) + + offset = sg.to_identifier(offset, quoted=quoted) + selcols.append(offset) + elif overlaps_with_parent: + selcols.append( + sge.Column(this=sge.Star(replace=[computed_column]), table=table) + ) + else: + selcols.append(sge.Column(this=STAR, table=table)) + selcols.append(computed_column) + + select = ( + sg.select(*selcols) + .from_(parent) + .join( + sge.Join( + 
this=column.as_(column_alias, quoted=quoted), + kind="ARRAY", + side=None if not keep_empty else "LEFT", + ) + ) + ) + + if offset is not None: + param = sg.to_identifier(util.gen_name("arr_enum")) + func = sge.Lambda(this=param - 1, expressions=[param]) + return select.join( + self.f.arrayMap(func, self.f.arrayEnumerate(column_alias)).as_(offset) + ) + + return select diff --git a/ibis/backends/duckdb/compiler.py b/ibis/backends/duckdb/compiler.py index 300a9e4fc215..f2c38a17dd8c 100644 --- a/ibis/backends/duckdb/compiler.py +++ b/ibis/backends/duckdb/compiler.py @@ -14,6 +14,7 @@ from ibis.backends.sql.compiler import NULL, STAR, AggGen, SQLGlotCompiler from ibis.backends.sql.datatypes import DuckDBType from ibis.backends.sql.rewrites import exclude_nulls_from_array_collect +from ibis.util import gen_name _INTERVAL_SUFFIXES = { "ms": "milliseconds", @@ -547,3 +548,55 @@ def visit_DropColumns(self, op, *, parent, columns_to_drop): table = sg.to_identifier(parent.alias_or_name, quoted=quoted) column = sge.Column(this=star, table=table) return sg.select(column).from_(parent) + + def visit_TableUnnest( + self, op, *, parent, column, offset: str | None, keep_empty: bool + ): + quoted = self.quoted + + column_alias = sg.to_identifier(gen_name("table_unnest_column"), quoted=quoted) + + opname = op.column.name + overlaps_with_parent = opname in op.parent.schema + computed_column = column_alias.as_(opname, quoted=quoted) + + selcols = [] + + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + + if offset is not None: + # TODO: clean this up once WITH ORDINALITY is supported in DuckDB + # no need for struct_extract once that's upstream + column = self.f.list_zip(column, self.f.range(self.f.len(column))) + extract = self.f.struct_extract(column_alias, 1).as_(opname, quoted=quoted) + + if overlaps_with_parent: + replace = sge.Column(this=sge.Star(replace=[extract]), table=table) + selcols.append(replace) + else: + selcols.append(sge.Column(this=STAR, 
table=table)) + selcols.append(extract) + + selcols.append( + self.f.struct_extract(column_alias, 2).as_(offset, quoted=quoted) + ) + elif overlaps_with_parent: + selcols.append( + sge.Column(this=sge.Star(replace=[computed_column]), table=table) + ) + else: + selcols.append(sge.Column(this=STAR, table=table)) + selcols.append(computed_column) + + unnest = sge.Unnest( + expressions=[column], + alias=sge.TableAlias( + this=sg.to_identifier(gen_name("table_unnest"), quoted=quoted), + columns=[column_alias], + ), + ) + return ( + sg.select(*selcols) + .from_(parent) + .join(unnest, join_type="CROSS" if not keep_empty else "LEFT") + ) diff --git a/ibis/backends/postgres/compiler.py b/ibis/backends/postgres/compiler.py index dfa2b05f019e..cb959d2bd199 100644 --- a/ibis/backends/postgres/compiler.py +++ b/ibis/backends/postgres/compiler.py @@ -15,6 +15,7 @@ from ibis.backends.sql.datatypes import PostgresType from ibis.backends.sql.dialects import Postgres from ibis.backends.sql.rewrites import exclude_nulls_from_array_collect +from ibis.util import gen_name class PostgresUDFNode(ops.Value): @@ -611,3 +612,59 @@ def visit_Hash(self, op, *, arg): f"Hash({arg_dtype!r}) operation is not supported in the " f"{self.dialect} backend" ) + + def visit_TableUnnest( + self, op, *, parent, column, offset: str | None, keep_empty: bool + ): + quoted = self.quoted + + column_alias = sg.to_identifier(gen_name("table_unnest_column"), quoted=quoted) + + parent_alias = parent.alias_or_name + + opname = op.column.name + parent_schema = op.parent.schema + overlaps_with_parent = opname in parent_schema + computed_column = column_alias.as_(opname, quoted=quoted) + + selcols = [] + + if overlaps_with_parent: + column_alias_or_name = column.alias_or_name + selcols.extend( + sg.column(col, table=parent_alias, quoted=quoted) + if col != column_alias_or_name + else computed_column + for col in parent_schema.names + ) + else: + selcols.append( + sge.Column( + this=STAR, 
table=sg.to_identifier(parent_alias, quoted=quoted) + ) + ) + selcols.append(computed_column) + + if offset is not None: + offset_name = offset + offset = sg.to_identifier(offset_name, quoted=quoted) + selcols.append((offset - 1).as_(offset_name, quoted=quoted)) + + unnest = sge.Unnest( + expressions=[column], + alias=sge.TableAlias( + this=sg.to_identifier(gen_name("table_unnest"), quoted=quoted), + columns=[column_alias], + ), + offset=offset, + ) + + return ( + sg.select(*selcols) + .from_(parent) + .join( + unnest, + on=None if not keep_empty else sge.convert(True), + join_type="CROSS" if not keep_empty else "LEFT", + ) + ) diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py index a28faedde83c..6cbd7f0796a9 100644 --- a/ibis/backends/pyspark/compiler.py +++ b/ibis/backends/pyspark/compiler.py @@ -452,3 +452,68 @@ def visit_HexDigest(self, op, *, arg, how): return self.f.sha2(arg, int(how[-3:])) else: raise NotImplementedError(f"No available hashing function for {how}") + + def visit_TableUnnest( + self, op, *, parent, column, offset: str | None, keep_empty: bool + ): + quoted = self.quoted + + column_alias = sg.to_identifier(gen_name("table_unnest_column"), quoted=quoted) + + opname = op.column.name + parent_schema = op.parent.schema + overlaps_with_parent = opname in parent_schema + computed_column = column_alias.as_(opname, quoted=quoted) + + parent_alias = parent.alias_or_name + + selcols = [] + + if overlaps_with_parent: + column_alias_or_name = column.alias_or_name + selcols.extend( + sg.column(col, table=parent_alias, quoted=quoted) + if col != column_alias_or_name + else computed_column + for col in parent_schema.names + ) + else: + selcols.append( + sge.Column( + this=STAR, table=sg.to_identifier(parent_alias, quoted=quoted) + ) + ) + selcols.append(computed_column) + + alias_columns = [] + + if offset is not None: + offset = sg.column(offset, quoted=quoted) + selcols.append(offset) + alias_columns.append(offset) + + 
alias_columns.append(column_alias) + + # four possible functions + # + # explode: unnest + # explode_outer: unnest preserving empties and nulls + # posexplode: unnest with index + # posexplode_outer: unnest with index preserving empties and nulls + funcname = ( + ("pos" if offset is not None else "") + + "explode" + + ("_outer" if keep_empty else "") + ) + + return ( + sg.select(*selcols) + .from_(parent) + .lateral( + sge.Lateral( + this=self.f[funcname](column), + view=True, + alias=sge.TableAlias(columns=alias_columns), + ) + ) + ) diff --git a/ibis/backends/snowflake/compiler.py b/ibis/backends/snowflake/compiler.py index aa2faca6a7e9..927a5922f032 100644 --- a/ibis/backends/snowflake/compiler.py +++ b/ibis/backends/snowflake/compiler.py @@ -662,3 +662,70 @@ def visit_DropColumns(self, op, *, parent, columns_to_drop): table = sg.to_identifier(parent.alias_or_name, quoted=quoted) column = sge.Column(this=star, table=table) return sg.select(column).from_(parent) + + def visit_TableUnnest( + self, op, *, parent, column, offset: str | None, keep_empty: bool + ): + quoted = self.quoted + + column_alias = sg.to_identifier( + util.gen_name("table_unnest_column"), quoted=quoted + ) + + sep = sge.convert(util.guid()) + null_sentinel = sge.convert(util.guid()) + + table = sg.to_identifier(parent.alias_or_name, quoted=quoted) + + selcols = [] + + opcol = op.column + opname = opcol.name + overlaps_with_parent = opname in op.parent.schema + computed_column = self.cast( + self.f.nullif(column_alias, null_sentinel), opcol.dtype.value_type + ).as_(opname, quoted=quoted) + + if overlaps_with_parent: + selcols.append( + sge.Column(this=sge.Star(replace=[computed_column]), table=table) + ) + else: + selcols.append(sge.Column(this=STAR, table=table)) + selcols.append(computed_column) + + if offset is not None: + offset = sg.to_identifier(offset, quoted=quoted) + selcols.append(offset) + + alias = sge.TableAlias( + this=sg.to_identifier(util.gen_name("table_unnest"), 
quoted=quoted), + columns=[column_alias], + ) + + # there has to be a better way + param = sg.to_identifier(util.gen_name("table_unnest_param")) + column = self.f.transform( + column, + sge.Lambda( + this=self.f.coalesce(self.cast(param, dt.string), null_sentinel), + expressions=[param], + ), + ) + empty_array = self.f.array() + split = self.f.coalesce( + self.f.nullif( + self.f.split( + self.f.array_to_string(self.f.nullif(column, empty_array), sep), sep + ), + empty_array, + ), + self.f.array(null_sentinel), + ) + + unnest = sge.Unnest(expressions=[split], alias=alias, offset=offset) + return ( + sg.select(*selcols) + .from_(parent) + .join(unnest, join_type="CROSS" if not keep_empty else "LEFT") + ) diff --git a/ibis/backends/snowflake/tests/test_client.py b/ibis/backends/snowflake/tests/test_client.py index a904b1054197..e50b4b716f31 100644 --- a/ibis/backends/snowflake/tests/test_client.py +++ b/ibis/backends/snowflake/tests/test_client.py @@ -2,6 +2,7 @@ import json import os +from collections import Counter import pandas as pd import pandas.testing as tm @@ -429,3 +430,11 @@ def test_connect_without_snowflake_url(): ) assert nonurlcon.list_tables() + + +def test_table_unnest_with_empty_strings(con): + t = ibis.memtable({"x": [["", ""], [""], [], None]}) + expected = Counter(["", "", "", None, None]) + expr = t.unnest(t.x)["x"] + result = con.execute(expr) + assert Counter(result.values) == expected diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 8b55c189e9d6..1d3633a0e4ea 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -1384,7 +1384,104 @@ def test_zip_unnest_lift(con): t = ibis.memtable(data) zipped = t.mutate(zipped=t.array1.zip(t.array2)) unnested = zipped.mutate(unnest=zipped.zipped.unnest()) - lifted = unnested.unnest.lift() + lifted = unnested["unnest"].lift() result = con.execute(lifted) expected = pd.DataFrame({"f1": [1, 2, 3], "f2": [4, 5, 6]}) 
tm.assert_frame_equal(result, expected) + + +@pytest.mark.notimpl( + ["datafusion", "pandas", "polars", "dask", "flink"], + raises=com.OperationNotDefinedError, +) +@pytest.mark.parametrize( + "colspec", + ["y", lambda t: t.y, ibis._.y], + ids=["string", "lambda", "deferred"], +) +def test_table_unnest(backend, colspec): + t = backend.array_types + expr = t.unnest(colspec) + result = expr.execute() + assert set(result["y"].values) == set(t[["y"]].execute().explode("y")["y"].values) + + +@pytest.mark.notimpl( + ["datafusion", "pandas", "polars", "dask", "flink"], + raises=com.OperationNotDefinedError, +) +def test_table_unnest_with_offset(backend): + t = backend.array_types + col = "y" + df = ( + t[[col]] + .execute() + .assign(idx=lambda df: df[col].map(lambda v: list(range(len(v)))))[[col, "idx"]] + .explode("idx") + .assign(idx=lambda df: df["idx"].astype("int64")) + ) + idx = iter(df.idx.values) + expected = ( + df.assign(**{col: df[col].map(lambda v: v[next(idx)])}) + .sort_values(["idx", col]) + .reset_index(drop=True)[["idx", col]] + ) + + expr = t.unnest(col, offset="idx")[["idx", col]].order_by("idx", col) + result = expr.execute() + tm.assert_frame_equal(result, expected) + + +@pytest.mark.notimpl( + ["datafusion", "pandas", "polars", "dask", "flink"], + raises=com.OperationNotDefinedError, +) +def test_table_unnest_with_keep_empty(con): + t = ibis.memtable(pd.DataFrame({"y": [[], None, ["a"]]})) + expr = t.unnest("y", keep_empty=True)["y"] + result = con.execute(expr) + assert Counter(result.values) == Counter(["a", None, None]) + + +@pytest.mark.notimpl( + ["datafusion", "pandas", "polars", "dask", "flink"], + raises=com.OperationNotDefinedError, +) +@pytest.mark.notyet( + ["risingwave"], raises=PsycoPg2InternalError, reason="not supported in risingwave" +) +def test_table_unnest_column_expr(backend): + t = backend.array_types + expr = t.unnest(t.y.map(lambda v: v.cast("str") + "'s").name("plural")) + result = expr.execute()["plural"] + expected = 
t["y"].execute().explode("y") + "'s" + assert set(result.values) == set(expected.replace({np.nan: None}).values) + + +@pytest.mark.notimpl( + ["datafusion", "pandas", "polars", "dask", "flink"], + raises=com.OperationNotDefinedError, +) +@pytest.mark.notimpl(["trino"], raises=TrinoUserError) +@pytest.mark.notimpl(["postgres"], raises=PsycoPg2SyntaxError) +@pytest.mark.notimpl(["risingwave"], raises=PsycoPg2ProgrammingError) +@pytest.mark.notyet( + ["risingwave"], raises=PsycoPg2InternalError, reason="not supported in risingwave" +) +def test_table_unnest_array_of_struct_of_array(con): + t = ibis.memtable( + { + "a": [ + [{"x": [1, 2, 3]}, {"x": [1, 2]}], + [], + None, + [{"x": [3, 1, 2, 3]}], + ] + }, + schema={"a": "array<struct<x: array<int64>>>"}, + ) + # two different unnests + expr = t.unnest("a").a.x.unnest().name("x").as_table().order_by("x") + result = con.execute(expr) + expected = pd.DataFrame({"x": [1, 1, 1, 2, 2, 2, 3, 3, 3]}) + tm.assert_frame_equal(result, expected) diff --git a/ibis/backends/trino/compiler.py b/ibis/backends/trino/compiler.py index 18bbfc913c98..f98aa48b4ab3 100644 --- a/ibis/backends/trino/compiler.py +++ b/ibis/backends/trino/compiler.py @@ -17,6 +17,7 @@ exclude_nulls_from_array_collect, exclude_unsupported_window_frame_from_ops, ) +from ibis.util import gen_name class TrinoCompiler(SQLGlotCompiler): @@ -511,3 +512,59 @@ def visit_ToJSONArray(self, op, *, arg): ), dt.Array(dt.json), ) + + def visit_TableUnnest( + self, op, *, parent, column, offset: str | None, keep_empty: bool + ): + quoted = self.quoted + + column_alias = sg.to_identifier(gen_name("table_unnest_column"), quoted=quoted) + + opname = op.column.name + parent_schema = op.parent.schema + overlaps_with_parent = opname in parent_schema + computed_column = column_alias.as_(opname, quoted=quoted) + + parent_alias_or_name = parent.alias_or_name + + selcols = [] + + if overlaps_with_parent: + column_alias_or_name = column.alias_or_name + selcols.extend( + sg.column(col, 
table=parent_alias_or_name, quoted=quoted) + if col != column_alias_or_name + else computed_column + for col in parent_schema.names + ) + else: + selcols.append( + sge.Column( + this=STAR, + table=sg.to_identifier(parent_alias_or_name, quoted=quoted), + ) + ) + selcols.append(computed_column) + + if offset is not None: + offset_name = offset + offset = sg.to_identifier(offset_name, quoted=quoted) + selcols.append((offset - 1).as_(offset_name, quoted=quoted)) + + unnest = sge.Unnest( + expressions=[column], + alias=sge.TableAlias( + this=sg.to_identifier(gen_name("table_unnest"), quoted=quoted), + columns=[column_alias], + ), + offset=offset, + ) + return ( + sg.select(*selcols) + .from_(parent) + .join( + unnest, + on=None if not keep_empty else sge.convert(True), + join_type="CROSS" if not keep_empty else "LEFT", + ) + ) diff --git a/ibis/expr/operations/relations.py b/ibis/expr/operations/relations.py index 0e26700b3967..5402ad698df1 100644 --- a/ibis/expr/operations/relations.py +++ b/ibis/expr/operations/relations.py @@ -488,4 +488,27 @@ class Distinct(Simple): """Compute the distinct rows of a table.""" +@public +class TableUnnest(Simple): + """Cross join unnest operation.""" + + column: Value[dt.Array] + offset: typing.Union[str, None] + keep_empty: bool + + @attribute + def schema(self): + column = self.column + offset = self.offset + + base = self.parent.schema.fields.copy() + + base[column.name] = column.dtype.value_type + + if offset is not None: + base[offset] = dt.int64 + + return Schema(base) + + # TODO(kszucs): support t.select(*t) syntax by implementing Table.__iter__() diff --git a/ibis/expr/types/arrays.py b/ibis/expr/types/arrays.py index 2d9e5a8f5b3a..2053bc47d87d 100644 --- a/ibis/expr/types/arrays.py +++ b/ibis/expr/types/arrays.py @@ -286,12 +286,23 @@ def repeat(self, n: int | ir.IntegerValue) -> ArrayValue: __mul__ = __rmul__ = repeat def unnest(self) -> ir.Value: - """Flatten an array into a column. + """Unnest an array into a column. 
::: {.callout-note} - ## Rows with empty arrays are dropped in the output. + ## Empty arrays and `NULL`s are dropped in the output. + To preserve empty arrays as `NULL`s as well as existing `NULL` values, + use [`Table.unnest`](./expression-tables.qmd#ibis.expr.types.relations.Table.unnest). ::: + Returns + ------- + ir.Value + Unnested array + + See Also + -------- + [`Table.unnest`](./expression-tables.qmd#ibis.expr.types.relations.Table.unnest) + Examples -------- >>> import ibis @@ -318,11 +329,6 @@ def unnest(self) -> ir.Value: │ 3 │ │ 3 │ └───────┘ - - Returns - ------- - ir.Value - Unnested array """ expr = ops.Unnest(self).to_expr() try: diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index 4d80735888e8..b25dd193f365 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -4681,6 +4681,149 @@ def value_counts(self) -> ir.Table: lambda t: t.count().name("_".join(columns) + "_count") ) + def unnest( + self, column, offset: str | None = None, keep_empty: bool = False + ) -> Table: + """Unnest an array `column` from a table. + + When unnesting an existing column the newly unnested column replaces + the existing column. + + Parameters + ---------- + column + Array column to unnest. + offset + Name of the resulting index column. + keep_empty + Keep empty array values as `NULL` in the output table, as well as + existing `NULL` values. + + Returns + ------- + Table + Table with the array column `column` unnested. + + See Also + -------- + [`ArrayValue.unnest`](./expression-collections.qmd#ibis.expr.types.arrays.ArrayValue.unnest) + + Examples + -------- + >>> import ibis + >>> from ibis import _ + >>> ibis.options.interactive = True + + Construct a table expression with an array column. 
+ + >>> t = ibis.memtable({"x": [[1, 2], [], None, [3, 4, 5]], "y": [1, 2, 3, 4]}) + >>> t + ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┓ + ┃ x ┃ y ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━┩ + │ array<int64> │ int64 │ + ├──────────────────────┼───────┤ + │ [1, 2] │ 1 │ + │ [] │ 2 │ + │ NULL │ 3 │ + │ [3, 4, ... +1] │ 4 │ + └──────────────────────┴───────┘ + + Unnest the array column `x`, replacing the **existing** `x` column. + + >>> t.unnest("x") + ┏━━━━━━━┳━━━━━━━┓ + ┃ x ┃ y ┃ + ┡━━━━━━━╇━━━━━━━┩ + │ int64 │ int64 │ + ├───────┼───────┤ + │ 1 │ 1 │ + │ 2 │ 1 │ + │ 3 │ 4 │ + │ 4 │ 4 │ + │ 5 │ 4 │ + └───────┴───────┘ + + Unnest the array column `x` with an offset. The `offset` parameter is + the name of the resulting index column. + + >>> t.unnest(t.x, offset="idx") + ┏━━━━━━━┳━━━━━━━┳━━━━━━━┓ + ┃ x ┃ y ┃ idx ┃ + ┡━━━━━━━╇━━━━━━━╇━━━━━━━┩ + │ int64 │ int64 │ int64 │ + ├───────┼───────┼───────┤ + │ 1 │ 1 │ 0 │ + │ 2 │ 1 │ 1 │ + │ 3 │ 4 │ 0 │ + │ 4 │ 4 │ 1 │ + │ 5 │ 4 │ 2 │ + └───────┴───────┴───────┘ + + Unnest the array column `x`, keeping empty array values as `NULL` in the + output table. + + >>> t.unnest(_.x, offset="idx", keep_empty=True) + ┏━━━━━━━┳━━━━━━━┳━━━━━━━┓ + ┃ x ┃ y ┃ idx ┃ + ┡━━━━━━━╇━━━━━━━╇━━━━━━━┩ + │ int64 │ int64 │ int64 │ + ├───────┼───────┼───────┤ + │ 1 │ 1 │ 0 │ + │ 2 │ 1 │ 1 │ + │ 3 │ 4 │ 0 │ + │ 4 │ 4 │ 1 │ + │ 5 │ 4 │ 2 │ + │ NULL │ 2 │ NULL │ + │ NULL │ 3 │ NULL │ + └───────┴───────┴───────┘ + + If you need to preserve the row order of the preserved empty arrays or + null values use + [`row_number`](./expression-tables.qmd#ibis.row_number) to + create an index column before calling `unnest`. + + >>> ( + ... t.mutate(original_row=ibis.row_number()) + ... .unnest("x", offset="idx", keep_empty=True) + ... .relocate("original_row") + ... .order_by("original_row") + ... 
) + ┏━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓ + ┃ original_row ┃ x ┃ y ┃ idx ┃ + ┡━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━╇━━━━━━━┩ + │ int64 │ int64 │ int64 │ int64 │ + ├──────────────┼───────┼───────┼───────┤ + │ 0 │ 1 │ 1 │ 0 │ + │ 0 │ 2 │ 1 │ 1 │ + │ 1 │ NULL │ 2 │ NULL │ + │ 2 │ NULL │ 3 │ NULL │ + │ 3 │ 3 │ 4 │ 0 │ + │ 3 │ 4 │ 4 │ 1 │ + │ 3 │ 5 │ 4 │ 2 │ + └──────────────┴───────┴───────┴───────┘ + + You can also unnest more complex expressions, and the resulting column + will be projected as the last expression in the result. + + >>> t.unnest(_.x.map(lambda v: v + 1).name("plus_one")) + ┏━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━━┓ + ┃ x ┃ y ┃ plus_one ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━━┩ + │ array<int64> │ int64 │ int64 │ + ├──────────────────────┼───────┼──────────┤ + │ [1, 2] │ 1 │ 2 │ + │ [1, 2] │ 1 │ 3 │ + │ [3, 4, ... +1] │ 4 │ 4 │ + │ [3, 4, ... +1] │ 4 │ 5 │ + │ [3, 4, ... +1] │ 4 │ 6 │ + └──────────────────────┴───────┴──────────┘ + """ + (column,) = self.bind(column) + return ops.TableUnnest( + parent=self, column=column, offset=offset, keep_empty=keep_empty + ).to_expr() + @public class CachedTable(Table):