Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(api): add TableUnnest operation to support cross-join unnest semantics as well as offset #9423

Merged
merged 8 commits into from
Jul 1, 2024
44 changes: 44 additions & 0 deletions ibis/backends/bigquery/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,3 +690,47 @@
table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
column = sge.Column(this=star, table=table)
return sg.select(column).from_(parent)

def visit_TableUnnest(
self, op, *, parent, column, offset: str | None, keep_empty: bool
):
quoted = self.quoted

Check warning on line 697 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L697

Added line #L697 was not covered by tests
cpcloud marked this conversation as resolved.
Show resolved Hide resolved

column_alias = sg.to_identifier(

Check warning on line 699 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L699

Added line #L699 was not covered by tests
util.gen_name("table_unnest_column"), quoted=quoted
)

selcols = []

Check warning on line 703 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L703

Added line #L703 was not covered by tests

table = sg.to_identifier(parent.alias_or_name, quoted=quoted)

Check warning on line 705 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L705

Added line #L705 was not covered by tests

opname = op.column.name
overlaps_with_parent = opname in op.parent.schema
computed_column = column_alias.as_(opname, quoted=quoted)

Check warning on line 709 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L707-L709

Added lines #L707 - L709 were not covered by tests

# replace the existing column if the unnested column hasn't been
# renamed
#
# e.g., table.unnest("x")
if overlaps_with_parent:
selcols.append(

Check warning on line 716 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L716

Added line #L716 was not covered by tests
sge.Column(this=sge.Star(replace=[computed_column]), table=table)
)
else:
selcols.append(sge.Column(this=STAR, table=table))
selcols.append(computed_column)

Check warning on line 721 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L720-L721

Added lines #L720 - L721 were not covered by tests

if offset is not None:
offset = sg.to_identifier(offset, quoted=quoted)
selcols.append(offset)

Check warning on line 725 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L724-L725

Added lines #L724 - L725 were not covered by tests

unnest = sge.Unnest(

Check warning on line 727 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L727

Added line #L727 was not covered by tests
expressions=[column],
alias=sge.TableAlias(columns=[column_alias]),
offset=offset,
)
return (

Check warning on line 732 in ibis/backends/bigquery/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/bigquery/compiler.py#L732

Added line #L732 was not covered by tests
sg.select(*selcols)
.from_(parent)
.join(unnest, join_type="CROSS" if not keep_empty else "LEFT")
)
57 changes: 57 additions & 0 deletions ibis/backends/clickhouse/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,3 +648,60 @@
table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
column = sge.Column(this=star, table=table)
return sg.select(column).from_(parent)

def visit_TableUnnest(
self, op, *, parent, column, offset: str | None, keep_empty: bool
):
quoted = self.quoted

column_alias = sg.to_identifier(
util.gen_name("table_unnest_column"), quoted=quoted
)

table = sg.to_identifier(parent.alias_or_name, quoted=quoted)

selcols = []

opname = op.column.name
overlaps_with_parent = opname in op.parent.schema
computed_column = column_alias.as_(opname, quoted=quoted)

if offset is not None:
if overlaps_with_parent:
selcols.append(
sge.Column(this=sge.Star(replace=[computed_column]), table=table)
)
else:
selcols.append(sge.Column(this=STAR, table=table))
selcols.append(computed_column)

Check warning on line 676 in ibis/backends/clickhouse/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/clickhouse/compiler.py#L675-L676

Added lines #L675 - L676 were not covered by tests

offset = sg.to_identifier(offset, quoted=quoted)
selcols.append(offset)
elif overlaps_with_parent:
selcols.append(
sge.Column(this=sge.Star(replace=[computed_column]), table=table)
)
else:
selcols.append(sge.Column(this=STAR, table=table))
selcols.append(computed_column)

select = (
sg.select(*selcols)
.from_(parent)
.join(
sge.Join(
this=column.as_(column_alias, quoted=quoted),
kind="ARRAY",
side=None if not keep_empty else "LEFT",
)
)
)

if offset is not None:
param = sg.to_identifier(util.gen_name("arr_enum"))
func = sge.Lambda(this=param - 1, expressions=[param])
return select.join(
self.f.arrayMap(func, self.f.arrayEnumerate(column_alias)).as_(offset)
)

return select
53 changes: 53 additions & 0 deletions ibis/backends/duckdb/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from ibis.backends.sql.compiler import NULL, STAR, AggGen, SQLGlotCompiler
from ibis.backends.sql.datatypes import DuckDBType
from ibis.backends.sql.rewrites import exclude_nulls_from_array_collect
from ibis.util import gen_name

_INTERVAL_SUFFIXES = {
"ms": "milliseconds",
Expand Down Expand Up @@ -547,3 +548,55 @@
table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
column = sge.Column(this=star, table=table)
return sg.select(column).from_(parent)

def visit_TableUnnest(
self, op, *, parent, column, offset: str | None, keep_empty: bool
):
quoted = self.quoted

column_alias = sg.to_identifier(gen_name("table_unnest_column"), quoted=quoted)

opname = op.column.name
overlaps_with_parent = opname in op.parent.schema
computed_column = column_alias.as_(opname, quoted=quoted)

selcols = []

table = sg.to_identifier(parent.alias_or_name, quoted=quoted)

if offset is not None:
# TODO: clean this up once WITH ORDINALITY is supported in DuckDB
# no need for struct_extract once that's upstream
column = self.f.list_zip(column, self.f.range(self.f.len(column)))
extract = self.f.struct_extract(column_alias, 1).as_(opname, quoted=quoted)

if overlaps_with_parent:
replace = sge.Column(this=sge.Star(replace=[extract]), table=table)
selcols.append(replace)
else:
selcols.append(sge.Column(this=STAR, table=table))
selcols.append(extract)

Check warning on line 578 in ibis/backends/duckdb/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/duckdb/compiler.py#L577-L578

Added lines #L577 - L578 were not covered by tests

selcols.append(
self.f.struct_extract(column_alias, 2).as_(offset, quoted=quoted)
)
elif overlaps_with_parent:
selcols.append(
sge.Column(this=sge.Star(replace=[computed_column]), table=table)
)
else:
selcols.append(sge.Column(this=STAR, table=table))
selcols.append(computed_column)

unnest = sge.Unnest(
expressions=[column],
alias=sge.TableAlias(
this=sg.to_identifier(gen_name("table_unnest"), quoted=quoted),
columns=[column_alias],
),
)
return (
sg.select(*selcols)
.from_(parent)
.join(unnest, join_type="CROSS" if not keep_empty else "LEFT")
)
57 changes: 57 additions & 0 deletions ibis/backends/postgres/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from ibis.backends.sql.datatypes import PostgresType
from ibis.backends.sql.dialects import Postgres
from ibis.backends.sql.rewrites import exclude_nulls_from_array_collect
from ibis.util import gen_name


class PostgresUDFNode(ops.Value):
Expand Down Expand Up @@ -611,3 +612,59 @@ def visit_Hash(self, op, *, arg):
f"Hash({arg_dtype!r}) operation is not supported in the "
f"{self.dialect} backend"
)

def visit_TableUnnest(
self, op, *, parent, column, offset: str | None, keep_empty: bool
):
quoted = self.quoted

column_alias = sg.to_identifier(gen_name("table_unnest_column"), quoted=quoted)

parent_alias = parent.alias_or_name

opname = op.column.name
parent_schema = op.parent.schema
overlaps_with_parent = opname in parent_schema
computed_column = column_alias.as_(opname, quoted=quoted)

selcols = []

if overlaps_with_parent:
column_alias_or_name = column.alias_or_name
selcols.extend(
sg.column(col, table=parent_alias, quoted=quoted)
if col != column_alias_or_name
else computed_column
for col in parent_schema.names
)
else:
selcols.append(
sge.Column(
this=STAR, table=sg.to_identifier(parent_alias, quoted=quoted)
)
)
selcols.append(computed_column)

if offset is not None:
offset_name = offset
offset = sg.to_identifier(offset_name, quoted=quoted)
selcols.append((offset - 1).as_(offset_name, quoted=quoted))

unnest = sge.Unnest(
expressions=[column],
alias=sge.TableAlias(
this=sg.to_identifier(gen_name("table_unnest"), quoted=quoted),
columns=[column_alias],
),
offset=offset,
)

return (
sg.select(*selcols)
.from_(parent)
.join(
unnest,
on=None if not keep_empty else sge.convert(True),
join_type="CROSS" if not keep_empty else "LEFT",
)
)
65 changes: 65 additions & 0 deletions ibis/backends/pyspark/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,3 +452,68 @@
return self.f.sha2(arg, int(how[-3:]))
else:
raise NotImplementedError(f"No available hashing function for {how}")

def visit_TableUnnest(
self, op, *, parent, column, offset: str | None, keep_empty: bool
):
quoted = self.quoted

Check warning on line 459 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L459

Added line #L459 was not covered by tests

column_alias = sg.to_identifier(gen_name("table_unnest_column"), quoted=quoted)

Check warning on line 461 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L461

Added line #L461 was not covered by tests

opname = op.column.name
parent_schema = op.parent.schema
overlaps_with_parent = opname in parent_schema
computed_column = column_alias.as_(opname, quoted=quoted)

Check warning on line 466 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L463-L466

Added lines #L463 - L466 were not covered by tests

parent_alias = parent.alias_or_name

Check warning on line 468 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L468

Added line #L468 was not covered by tests

selcols = []

Check warning on line 470 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L470

Added line #L470 was not covered by tests

if overlaps_with_parent:
column_alias_or_name = column.alias_or_name

Check warning on line 473 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L473

Added line #L473 was not covered by tests
selcols.extend(
sg.column(col, table=parent_alias, quoted=quoted)
if col != column_alias_or_name
else computed_column
for col in parent_schema.names
)
else:
selcols.append(

Check warning on line 481 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L481

Added line #L481 was not covered by tests
sge.Column(
this=STAR, table=sg.to_identifier(parent_alias, quoted=quoted)
)
)
selcols.append(computed_column)

Check warning on line 486 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L486

Added line #L486 was not covered by tests

alias_columns = []

Check warning on line 488 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L488

Added line #L488 was not covered by tests

if offset is not None:
offset = sg.column(offset, quoted=quoted)
selcols.append(offset)
alias_columns.append(offset)

Check warning on line 493 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L491-L493

Added lines #L491 - L493 were not covered by tests

alias_columns.append(column_alias)

Check warning on line 495 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L495

Added line #L495 was not covered by tests

# four possible functions
#
# explode: unnest
# explode_outer: unnest preserving empties and nulls
# posexplode: unnest with index
# posexplode_outer: unnest with index preserving empties and nulls
funcname = (

Check warning on line 503 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L503

Added line #L503 was not covered by tests
("pos" if offset is not None else "")
+ "explode"
+ ("_outer" if keep_empty else "")
)

return (

Check warning on line 509 in ibis/backends/pyspark/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/pyspark/compiler.py#L509

Added line #L509 was not covered by tests
sg.select(*selcols)
.from_(parent)
.lateral(
sge.Lateral(
this=self.f[funcname](column),
view=True,
alias=sge.TableAlias(columns=alias_columns),
)
)
)
67 changes: 67 additions & 0 deletions ibis/backends/snowflake/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,3 +662,70 @@
table = sg.to_identifier(parent.alias_or_name, quoted=quoted)
column = sge.Column(this=star, table=table)
return sg.select(column).from_(parent)

def visit_TableUnnest(
self, op, *, parent, column, offset: str | None, keep_empty: bool
):
quoted = self.quoted

Check warning on line 669 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L669

Added line #L669 was not covered by tests

column_alias = sg.to_identifier(

Check warning on line 671 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L671

Added line #L671 was not covered by tests
util.gen_name("table_unnest_column"), quoted=quoted
)

sep = sge.convert(util.guid())
null_sentinel = sge.convert(util.guid())

Check warning on line 676 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L675-L676

Added lines #L675 - L676 were not covered by tests

table = sg.to_identifier(parent.alias_or_name, quoted=quoted)

Check warning on line 678 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L678

Added line #L678 was not covered by tests

selcols = []

Check warning on line 680 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L680

Added line #L680 was not covered by tests

opcol = op.column
opname = opcol.name
overlaps_with_parent = opname in op.parent.schema
computed_column = self.cast(

Check warning on line 685 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L682-L685

Added lines #L682 - L685 were not covered by tests
self.f.nullif(column_alias, null_sentinel), opcol.dtype.value_type
).as_(opname, quoted=quoted)

if overlaps_with_parent:
selcols.append(

Check warning on line 690 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L690

Added line #L690 was not covered by tests
sge.Column(this=sge.Star(replace=[computed_column]), table=table)
)
else:
selcols.append(sge.Column(this=STAR, table=table))
selcols.append(computed_column)

Check warning on line 695 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L694-L695

Added lines #L694 - L695 were not covered by tests

if offset is not None:
offset = sg.to_identifier(offset, quoted=quoted)
selcols.append(offset)

Check warning on line 699 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L698-L699

Added lines #L698 - L699 were not covered by tests

alias = sge.TableAlias(

Check warning on line 701 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L701

Added line #L701 was not covered by tests
this=sg.to_identifier(util.gen_name("table_unnest"), quoted=quoted),
columns=[column_alias],
)

# there has to be a better way
param = sg.to_identifier(util.gen_name("table_unnest_param"))
column = self.f.transform(

Check warning on line 708 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L707-L708

Added lines #L707 - L708 were not covered by tests
column,
sge.Lambda(
this=self.f.coalesce(self.cast(param, dt.string), null_sentinel),
expressions=[param],
),
)
empty_array = self.f.array()
split = self.f.coalesce(

Check warning on line 716 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L715-L716

Added lines #L715 - L716 were not covered by tests
self.f.nullif(
self.f.split(
self.f.array_to_string(self.f.nullif(column, empty_array), sep), sep
),
empty_array,
),
self.f.array(null_sentinel),
)

unnest = sge.Unnest(expressions=[split], alias=alias, offset=offset)
return (

Check warning on line 727 in ibis/backends/snowflake/compiler.py

View check run for this annotation

Codecov / codecov/patch

ibis/backends/snowflake/compiler.py#L726-L727

Added lines #L726 - L727 were not covered by tests
sg.select(*selcols)
.from_(parent)
.join(unnest, join_type="CROSS" if not keep_empty else "LEFT")
)
Loading