Skip to content

Commit

Permalink
perf(duckdb): reduce branching factor for ArrayDistinct
Browse files Browse the repository at this point in the history
If I do something like `some_really_complex_array_expression.unique()`, then the SQL for `some_really_complex_array_expression` was duplicated 4 times. Now it is only duplicated 3 times.

This is important for perf, because duckdb sometimes appears to not be smart, and actually does a computation for each time a subexpression appears.
See duckdb/duckdb#14649.
So this can reduce the computation time to 3/4 of what it was.

I want to go through our other compilation steps and do similar optimizations whenever possible.
  • Loading branch information
NickCrews authored and cpcloud committed Nov 2, 2024
1 parent a28ceb1 commit 9b7a377
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 17 deletions.
23 changes: 8 additions & 15 deletions ibis/backends/sql/compilers/duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,15 +143,17 @@ def visit_StructColumn(self, op, *, names, values):
)

def visit_ArrayDistinct(self, op, *, arg):
return self._array_distinct(arg)

def _array_distinct(self, arg: sge.Expression) -> sge.Expression:
x = sg.to_identifier("x")
is_null = sge.Lambda(this=x.is_(NULL), expressions=[x])
contains_null = self.f.list_bool_or(self.f.list_apply(arg, is_null))
return self.if_(
arg.is_(NULL),
NULL,
self.f.list_distinct(arg)
+ self.if_(
self.f.list_count(arg) < self.f.len(arg),
self.f.array(NULL),
self.f.array(),
),
+ self.if_(contains_null, self.f.array(NULL), self.f.array()),
)

def visit_ArrayPosition(self, op, *, arg, other):
Expand Down Expand Up @@ -223,16 +225,7 @@ def visit_ArrayRemove(self, op, *, arg, other):

def visit_ArrayUnion(self, op, *, left, right):
arg = self.f.list_concat(left, right)
return self.if_(
arg.is_(NULL),
NULL,
self.f.list_distinct(arg)
+ self.if_(
self.f.list_count(arg) < self.f.len(arg),
self.f.array(NULL),
self.f.array(),
),
)
return self._array_distinct(arg)

def visit_ArrayZip(self, op, *, arg):
i = sg.to_identifier("i")
Expand Down
4 changes: 2 additions & 2 deletions ibis/backends/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,8 +779,8 @@ def test_array_remove(con, input, expected):
("input", "expected"),
[
param(
{"a": [[1, 3, 3], [], [42, 42], [], [None], None]},
[{3, 1}, set(), {42}, set(), {None}, None],
{"a": [[1, 3, 3], [1, 3, None, 3], [42, 42], [], [None], None]},
[{3, 1}, {1, 3, None}, {42}, set(), {None}, None],
id="null",
marks=[
pytest.mark.notyet(
Expand Down

0 comments on commit 9b7a377

Please sign in to comment.