Skip to content

Commit

Permalink
fix: avoid renaming known equal columns for inner joins with equality…
Browse files Browse the repository at this point in the history
… predicates

BREAKING CHANGE: Previously all column names that collided between
`left` and `right` tables were renamed with an appended suffix. Now for
the case of inner joins with only equality predicates, colliding columns
that are known to be equal due to the join predicates aren't renamed.
  • Loading branch information
jcrist authored and cpcloud committed Oct 26, 2022
1 parent fbd7134 commit 5d4b0ed
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 19 deletions.
30 changes: 19 additions & 11 deletions ibis/backends/tests/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,19 +86,27 @@ def test_mutating_join(backend, batting, awards_players, how):
result_order = ['playerID', 'yearID', 'lgID', 'stint']

expr = left.join(right, predicate, how=how)
result = (
expr.execute()
.fillna(np.nan)
.assign(
playerID=lambda df: df.playerID_x.where(
df.playerID_x.notnull(),
df.playerID_y,
if how == "inner":
result = (
expr.execute()
.fillna(np.nan)[left.columns]
.sort_values(result_order)
.reset_index(drop=True)
)
else:
result = (
expr.execute()
.fillna(np.nan)
.assign(
playerID=lambda df: df.playerID_x.where(
df.playerID_x.notnull(),
df.playerID_y,
)
)
.drop(['playerID_x', 'playerID_y'], axis=1)[left.columns]
.sort_values(result_order)
.reset_index(drop=True)
)
.drop(['playerID_x', 'playerID_y'], axis=1)[left.columns]
.sort_values(result_order)
.reset_index(drop=True)
)

expected = (
check_eq(
Expand Down
27 changes: 25 additions & 2 deletions ibis/expr/operations/relations.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
import ibis.util as util
from ibis.common.annotations import attribute
from ibis.expr.operations.core import Named, Node, Value
from ibis.expr.operations.logical import ExistsSubquery, NotExistsSubquery
from ibis.expr.operations.generic import TableColumn
from ibis.expr.operations.logical import Equals, ExistsSubquery, NotExistsSubquery

_table_names = (f'unbound_table_{i:d}' for i in itertools.count())

Expand Down Expand Up @@ -582,24 +583,46 @@ def _dedup_join_columns(expr, suffixes: tuple[str, str]):

right_columns = frozenset(right.columns)
overlap = frozenset(column for column in left.columns if column in right_columns)
equal = set()

if isinstance(op, InnerJoin) and util.all_of(op.predicates, Equals):
# For inner joins composed exclusively of equality predicates, we can
# avoid renaming columns with colliding names if their values are
# guaranteed to be equal due to the predicate. Here we collect a set of
# colliding column names that are known to have equal values between
# the left and right tables in the join.
tables = {op.left, op.right}
for pred in op.predicates:
if (
isinstance(pred.left, TableColumn)
and isinstance(pred.right, TableColumn)
and {pred.left.table, pred.right.table} == tables
and pred.left.name == pred.right.name
):
equal.add(pred.left.name)

if not overlap:
return expr

left_suffix, right_suffix = suffixes

# Rename columns in the left table that overlap, unless they're known to be
# equal to a column in the right
left_projections = [
left[column].name(f"{column}{left_suffix}")
if column in overlap
if column in overlap and column not in equal
else left[column]
for column in left.columns
]

# Rename columns in the right table that overlap, dropping any columns that
# are known to be equal to those in the left table
right_projections = [
right[column].name(f"{column}{right_suffix}")
if column in overlap
else right[column]
for column in right.columns
if column not in equal
]
return expr.projection(left_projections + right_projections)

Expand Down
21 changes: 15 additions & 6 deletions ibis/tests/expr/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -937,10 +937,9 @@ def test_self_join_no_view_convenience(table):
# column names to join on rather than referentially-valid expressions

result = table.join(table, [('g', 'g')])

assert result.columns == [f"{column}_x" for column in table.columns] + [
f"{column}_y" for column in table.columns
]
expected_cols = [f"{c}_x" if c != 'g' else 'g' for c in table.columns]
expected_cols.extend(f"{c}_y" for c in table.columns if c != 'g')
assert result.columns == expected_cols


def test_join_reference_bug(con):
Expand Down Expand Up @@ -1017,7 +1016,7 @@ def test_cross_join_multiple(table):
assert joined.equals(expected)


def test_filter_join(table):
def test_filter_join():
table1 = ibis.table({'key1': 'string', 'key2': 'string', 'value1': 'double'})
table2 = ibis.table({'key3': 'string', 'value2': 'double'})

Expand All @@ -1027,17 +1026,27 @@ def test_filter_join(table):
repr(filtered)


def test_join_overlapping_column_names(table):
def test_inner_join_overlapping_column_names():
t1 = ibis.table([('foo', 'string'), ('bar', 'string'), ('value1', 'double')])
t2 = ibis.table([('foo', 'string'), ('bar', 'string'), ('value2', 'double')])

joined = t1.join(t2, 'foo')
expected = t1.join(t2, t1.foo == t2.foo)
assert_equal(joined, expected)
assert joined.columns == ["foo", "bar_x", "value1", "bar_y", "value2"]

joined = t1.join(t2, ['foo', 'bar'])
expected = t1.join(t2, [t1.foo == t2.foo, t1.bar == t2.bar])
assert_equal(joined, expected)
assert joined.columns == ["foo", "bar", "value1", "value2"]

# Equality predicates don't have same name, need to rename
joined = t1.join(t2, t1.foo == t2.bar)
assert joined.columns == ["foo_x", "bar_x", "value1", "foo_y", "bar_y", "value2"]

# Not all predicates are equality, still need to rename
joined = t1.join(t2, ["foo", t1.value1 < t2.value2])
assert joined.columns == ["foo_x", "bar_x", "value1", "foo_y", "bar_y", "value2"]


def test_join_key_alternatives(con):
Expand Down

0 comments on commit 5d4b0ed

Please sign in to comment.