Skip to content

Commit

Permalink
perf(duckdb): speed up metadata access to support the many-columns use case
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored and kszucs committed Aug 14, 2023
1 parent 9331186 commit 2854143
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 21 deletions.
13 changes: 6 additions & 7 deletions ibis/backends/base/sql/alchemy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -700,13 +700,12 @@ def insert(

from_table_expr = obj

with self.begin() as bind:
if from_table_expr is not None:
compiled = from_table_expr.compile()
columns = [
self.con.dialect.normalize_name(c)
for c in from_table_expr.columns
]
if from_table_expr is not None:
compiled = from_table_expr.compile()
columns = [
self.con.dialect.normalize_name(c) for c in from_table_expr.columns
]
with self.begin() as bind:
bind.execute(to_table.insert().from_select(columns, compiled))
elif isinstance(obj, (list, dict)):
to_table = self._get_sqla_table(table_name, schema=database)
Expand Down
62 changes: 51 additions & 11 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,57 @@ def read_csv(
con.exec_driver_sql(view)
return self.table(table_name)

def _get_sqla_table(
    self,
    name: str,
    schema: str | None = None,
    database: str | None = None,
    **_: Any,
) -> sa.Table:
    """Build a SQLAlchemy table for `name` from DuckDB's ``DESCRIBE`` output.

    Column names, types and nullability are read from
    ``DESCRIBE SELECT * FROM <ident>.<name>`` rather than obtained through
    reflection.

    Parameters
    ----------
    name
        Name of the table to describe.
    schema
        Schema holding the table; falls back to ``self.current_schema`` when
        ``None``. May be dotted, in which case everything before the last
        dot is taken as the database.
    database
        Database to use when `schema` carries no database prefix.
    **_
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    sa.Table
        Table whose columns come from ``self._columns_from_schema``.
    """
    active_db = self.current_database
    active_schema = self.current_schema

    qualified = active_schema if schema is None else schema
    *db_parts, schema_part = qualified.split(".")
    db_part = "".join(db_parts) or database

    # The database qualifier is dropped when it is the current database.
    if db_part == active_db:
        db_part = None
    ident = ".".join(
        self._quote(part) for part in (db_part, schema_part) if part
    )

    query = f"DESCRIBE SELECT * FROM {ident}.{self._quote(name)}"

    with self.begin() as con:
        # fetch metadata with pyarrow, it's much faster for tables with "lots"
        # of columns
        meta = con.exec_driver_sql(query).cursor.fetch_arrow_table()

    col_names = meta["column_name"].to_pylist()
    col_types = meta["column_type"].to_pylist()
    col_nullables = pa.compute.equal(meta["null"], "YES").to_pylist()

    fields = {}
    for col_name, col_type, is_nullable in zip(
        col_names, col_types, col_nullables
    ):
        fields[col_name] = parse(col_type).copy(nullable=is_nullable)

    ibis_schema = sch.Schema(fields)
    columns = self._columns_from_schema(name, ibis_schema)
    return sa.table(name, *columns, schema=ident)

def drop_table(
    self, name: str, database: str | None = None, force: bool = False
) -> None:
    """Issue a ``DROP TABLE`` statement for `name`.

    Parameters
    ----------
    name
        Table name; it is passed through ``self._quote``.
    database
        Optional prefix placed before the quoted table name. It is
        interpolated as-is, without quoting.
    force
        When true, ``IF EXISTS`` is added to the statement.
    """
    target = self._quote(name)
    # TODO: handle database quoting
    if database is not None:
        target = f"{database}.{target}"

    # Multiplying by the flag yields "" when `force` is False.
    if_exists = " IF EXISTS" * force
    statement = "DROP TABLE" + if_exists + f" {target}"

    with self.begin() as con:
        con.exec_driver_sql(statement)

def read_parquet(
self,
source_list: str | Iterable[str],
Expand Down Expand Up @@ -992,17 +1043,6 @@ def _register(name, table):
except duckdb.NotImplementedException:
_register(name, data.to_pyarrow(schema))

def _get_sqla_table(
    self, name: str, schema: str | None = None, **kwargs: Any
) -> sa.Table:
    """Delegate to the parent ``_get_sqla_table`` with one warning silenced.

    The duckdb-engine warning about index reflection is ignored for the
    duration of the parent call.
    """
    ignored_message = "duckdb-engine doesn't yet support reflection on indices"
    with warnings.catch_warnings():
        # We don't rely on index reflection, ignore this warning
        warnings.filterwarnings("ignore", message=ignored_message)
        table = super()._get_sqla_table(name, schema, **kwargs)
    return table

def _get_temp_view_definition(
self, name: str, definition: sa.sql.compiler.Compiled
) -> str:
Expand Down
6 changes: 3 additions & 3 deletions ibis/backends/duckdb/tests/test_datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,17 @@ def test_null_dtype():
def test_parse_quoted_struct_field():
import ibis.backends.duckdb.datatypes as ddt

assert ddt.parse('STRUCT("a" INT, "a b c" INT)') == dt.Struct(
assert ddt.parse('STRUCT("a" INTEGER, "a b c" INTEGER)') == dt.Struct(
{"a": dt.int32, "a b c": dt.int32}
)


def test_generate_quoted_struct():
typ = sat.StructType(
{"in come": sa.TEXT(), "my count": sa.BIGINT(), "thing": sa.INT()}
{"in come": sa.VARCHAR(), "my count": sa.BIGINT(), "thing": sa.INTEGER()}
)
result = typ.compile(dialect=duckdb_engine.Dialect())
expected = 'STRUCT("in come" TEXT, "my count" BIGINT, thing INTEGER)'
expected = 'STRUCT("in come" VARCHAR, "my count" BIGINT, thing INTEGER)'
assert result == expected


Expand Down
10 changes: 10 additions & 0 deletions ibis/tests/benchmarks/test_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,3 +743,13 @@ def test_snowflake_medium_sized_to_pandas(benchmark):
)

benchmark.pedantic(lineitem.to_pandas, rounds=5, iterations=1, warmup_rounds=1)


def test_parse_many_duckdb_types(benchmark):
    """Benchmark parsing a batch of 4000 DuckDB type strings."""
    datatypes = pytest.importorskip("ibis.backends.duckdb.datatypes")
    parse = datatypes.parse

    def parse_many(types):
        # Parse every type string; the results are built and discarded.
        parsed = []
        for type_string in types:
            parsed.append(parse(type_string))

    base_types = ["VARCHAR", "INTEGER", "DOUBLE", "BIGINT"]
    types = base_types * 1000
    benchmark(parse_many, types)

0 comments on commit 2854143

Please sign in to comment.