Skip to content

Commit

Permalink
fix(duckdb): branch to avoid unnecessary dataframe construction
Browse files Browse the repository at this point in the history
This and the associated revert take repr performance back to a
reasonable place.
  • Loading branch information
gforsyth authored and cpcloud committed Apr 11, 2023
1 parent 4544dab commit 9d5d943
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 5 deletions.
37 changes: 32 additions & 5 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ibis.backends.base.sql.alchemy import BaseAlchemyBackend
from ibis.backends.duckdb.compiler import DuckDBSQLCompiler
from ibis.backends.duckdb.datatypes import parse
from ibis.backends.pandas.client import DataFrameProxy

if TYPE_CHECKING:
import pandas as pd
Expand Down Expand Up @@ -418,7 +419,7 @@ def read_in_memory(
ir.Table
The just-registered table
"""
table_name = table_name or f"ibis_read_in_memory_{next(pd_n)}"
table_name = table_name or util.gen_name("read_in_memory")
with self.begin() as con:
con.connection.register(table_name, dataframe)

Expand Down Expand Up @@ -710,17 +711,43 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
# in theory we could use pandas dataframes, but when using dataframes
# with pyarrow datatypes later reads of this data segfault
import pandas as pd

schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
raise exc.IbisTypeError(
"DuckDB cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
)

name = op.name
table = op.data.to_pyarrow(schema)
with self.begin() as con:
con.connection.register(name, table)
# only register if we haven't already done so
if (name := op.name) not in self.list_tables():
if isinstance(data := op.data, DataFrameProxy):
table = data.to_frame()

# convert to object string dtypes because duckdb is either
# 1. extremely slow to register DataFrames with not-pyarrow
# string dtypes
# 2. broken for string[pyarrow] dtypes (segfault)
if conversions := {
colname: "str"
for colname, col in table.items()
if isinstance(col.dtype, pd.StringDtype)
}:
table = table.astype(conversions)
else:
table = data.to_pyarrow(schema)

# register creates a transaction, and we can't nest transactions so
# we create a function to encapsulate the whole shebang
def _register(name, table):
with self.begin() as con:
con.connection.register(name, table)

try:
_register(name, table)
except duckdb.NotImplementedException:
_register(name, data.to_pyarrow(schema))

def _get_sqla_table(
self, name: str, schema: str | None = None, **kwargs: Any
Expand Down
15 changes: 15 additions & 0 deletions ibis/backends/duckdb/tests/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import tempfile
from pathlib import Path

import numpy as np
import pandas as pd
import pandas.testing as tm
import pyarrow as pa
import pytest

Expand Down Expand Up @@ -211,3 +213,16 @@ def test_s3_403_fallback(con, httpserver, monkeypatch):
# pyarrow, which indicates the fallback worked
with pytest.raises(pa.lib.ArrowInvalid):
con.read_parquet(httpserver.url_for("/myfile"))


@pytest.mark.broken(
["duckdb"],
reason="""
the fix for this (issue #5879) caused a serious performance regression in the repr.
added this xfail in #5959, which also reverted the bugfix that caused the regression.
""",
)
def test_register_numpy_str(con):
data = pd.DataFrame({"a": [np.str_("xyz"), None]})
result = con.read_in_memory(data)
tm.assert_frame_equal(result.execute(), data)

0 comments on commit 9d5d943

Please sign in to comment.