From 7878d8c0a2b60847c6f2708429c4f3b7ebe5d67a Mon Sep 17 00:00:00 2001
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
Date: Fri, 21 Jun 2024 14:01:14 -0400
Subject: [PATCH] perf(duckdb): speed up memtable registration (#9419)

---
Review notes (commentary only; placed after the three-dash separator, where
format-patch notes conventionally go, so it is not part of the commit message
or of any hunk):

* ibis/backends/duckdb/__init__.py
  `_register_in_memory_table` no longer tests `op.name` against
  `self.list_tables()`. It now calls `self.con.table(name)` and registers the
  PyArrow data only when that call raises `duckdb.CatalogException` or
  `duckdb.InvalidInputException`. Per the in-code comment, the lookup is
  meant to cover both tables and views.

* ibis/tests/benchmarks/test_benchmarks.py
  Adds `import math`, a session-scoped `lots_of_tables` fixture, and the
  `test_memtable_register` benchmark. The fixture creates 100,000
  single-column (`x TINYINT`) tables named `t00000`..`t99999` in a DuckDB
  file under a pytest temporary directory, then returns an ibis DuckDB
  connection to that file. The benchmark times
  `lots_of_tables.execute(ibis.memtable(...))` for a three-row memtable and
  asserts that the result has three rows.

 ibis/backends/duckdb/__init__.py         |  8 ++++++--
 ibis/tests/benchmarks/test_benchmarks.py | 19 +++++++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/ibis/backends/duckdb/__init__.py b/ibis/backends/duckdb/__init__.py
index b8e4dfdc08d9..94f005ececc6 100644
--- a/ibis/backends/duckdb/__init__.py
+++ b/ibis/backends/duckdb/__init__.py
@@ -1552,8 +1552,12 @@ def _get_schema_using_query(self, query: str) -> sch.Schema:
         )
 
     def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
-        # only register if we haven't already done so
-        if (name := op.name) not in self.list_tables():
+        name = op.name
+        try:
+            # this handles tables _and_ views
+            self.con.table(name)
+        except (duckdb.CatalogException, duckdb.InvalidInputException):
+            # only register if we haven't already done so
             self.con.register(name, op.data.to_pyarrow(op.schema))
 
     def _register_udfs(self, expr: ir.Expr) -> None:
diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py
index 770d0f0d5411..03e6d1df16a5 100644
--- a/ibis/tests/benchmarks/test_benchmarks.py
+++ b/ibis/tests/benchmarks/test_benchmarks.py
@@ -4,6 +4,7 @@
 import functools
 import inspect
 import itertools
+import math
 import os
 import string
 from operator import attrgetter, itemgetter
@@ -865,3 +866,21 @@ def test_large_union_construct(benchmark, many_tables):
 def test_large_union_compile(benchmark, many_tables):
     expr = ibis.union(*many_tables)
     assert benchmark(ibis.to_sql, expr) is not None
+
+
+@pytest.fixture(scope="session")
+def lots_of_tables(tmp_path_factory):
+    duckdb = pytest.importorskip("duckdb")
+    db = str(tmp_path_factory.mktemp("data") / "lots_of_tables.ddb")
+    n = 100_000
+    d = int(math.log10(n))
+    sql = ";".join(f"CREATE TABLE t{i:0>{d}} (x TINYINT)" for i in range(n))
+    with duckdb.connect(db) as con:
+        con.execute(sql)
+    return ibis.duckdb.connect(db)
+
+
+def test_memtable_register(lots_of_tables, benchmark):
+    t = ibis.memtable({"x": [1, 2, 3]})
+    result = benchmark(lots_of_tables.execute, t)
+    assert len(result) == 3