Skip to content

Commit

Permalink
fix(duckdb): work around an ownership bug at the interaction of duckdb…
Browse files Browse the repository at this point in the history
…, pandas and pyarrow
  • Loading branch information
cpcloud committed Jan 29, 2023
1 parent 4f73953 commit 2819cff
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 22 deletions.
6 changes: 3 additions & 3 deletions ibis/backends/base/sql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,10 +248,10 @@ def execute(

return result

def _register_in_memory_table(self, table_op):
raise NotImplementedError
def _register_in_memory_table(self, _: ops.InMemoryTable) -> None:
    """Register a single in-memory table with the backend.

    Default implementation is a stub: backends that support cheap
    in-memory tables must override this. The backend name is included
    in the error so the failure message identifies which backend lacks
    support.
    """
    raise NotImplementedError(self.name)

def _register_in_memory_tables(self, expr):
def _register_in_memory_tables(self, expr: ir.Expr) -> None:
    """Register every in-memory table referenced by ``expr``.

    Walks the expression's operation tree for memtable nodes and
    registers each one via ``_register_in_memory_table``. Skipped
    entirely when the compiler does not support cheap in-memory
    tables.
    """
    if self.compiler.cheap_in_memory_tables:
        for memtable in an.find_memtables(expr.op()):
            self._register_in_memory_table(memtable)
Expand Down
4 changes: 2 additions & 2 deletions ibis/backends/clickhouse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ def __init__(self, *args, external_tables=None, **kwargs):
super().__init__(*args, **kwargs)
self._external_tables = external_tables or {}

def _register_in_memory_table(self, table_op):
self._external_tables[table_op.name] = table_op.data.to_frame()
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
    """Expose an in-memory table to ClickHouse as an external table.

    The table's data is materialized to a pandas DataFrame and stored
    under its name in ``_external_tables``; presumably these are passed
    along with queries — TODO confirm against the query path.
    """
    self._external_tables[op.name] = op.data.to_frame()

def _log(self, sql: str) -> None:
"""Log the SQL, usually to the standard output.
Expand Down
11 changes: 8 additions & 3 deletions ibis/backends/duckdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import pandas as pd
import pyarrow as pa

import ibis.expr.operations as ops

import ibis.expr.schema as sch
import ibis.expr.types as ir
from ibis.backends.base.sql.alchemy import BaseAlchemyBackend
Expand Down Expand Up @@ -568,10 +570,13 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
ibis_type = parse(type)
yield name, ibis_type.copy(nullable=null.lower() == "yes")

def _register_in_memory_table(self, table_op):
df = table_op.data.to_frame()
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
    """Register an in-memory table with the active DuckDB connection.

    Converts the table's data to a PyArrow Table before registering
    it, rather than handing DuckDB a pandas DataFrame.
    """
    # in theory we could use pandas dataframes, but when using dataframes
    # with pyarrow datatypes later reads of this data segfault
    name = op.name
    table = op.data.to_pyarrow()
    with self.begin() as con:
        # register the Arrow table directly on the raw DBAPI connection
        con.connection.register(name, table)

def _get_sqla_table(
self, name: str, schema: str | None = None, **kwargs: Any
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/duckdb/tests/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import tempfile
from pathlib import Path

import duckdb
import pandas as pd
import pytest
import sqlalchemy as sa
Expand Down Expand Up @@ -140,10 +139,6 @@ def test_memtable_with_nullable_pyarrow_string():
assert len(res) == len(data)


@pytest.mark.xfail(
raises=duckdb.NotImplementedException,
reason="DuckDB only supports the `string[pyarrow]` pandas dtype",
)
def test_memtable_with_nullable_pyarrow_not_string():
pytest.importorskip("pyarrow")

Expand Down
17 changes: 13 additions & 4 deletions ibis/backends/pandas/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -45,6 +46,9 @@
},
)

if TYPE_CHECKING:
import pyarrow as pa


@dt.dtype.register(DatetimeTZDtype)
def from_pandas_tzdtype(value):
Expand Down Expand Up @@ -196,20 +200,25 @@ def try_json(x):
class DataFrameProxy(Immutable, util.ToFrame):
__slots__ = ('_df', '_hash')

def __init__(self, df):
def __init__(self, df: pd.DataFrame) -> None:
    """Wrap ``df`` and precompute a cheap identity-based hash.

    ``object.__setattr__`` is used because normal attribute assignment
    is presumably blocked by the ``Immutable`` base — TODO confirm.
    The hash is based on ``id(df)`` (identity), not the frame's
    contents, so two equal-but-distinct frames hash differently.
    """
    object.__setattr__(self, "_df", df)
    object.__setattr__(self, "_hash", hash((type(df), id(df))))

def __hash__(self):
def __hash__(self) -> int:
    """Return the hash precomputed at construction time."""
    return self._hash

def __repr__(self):
def __repr__(self) -> str:
    """Render the class name followed by the indented DataFrame repr."""
    df_repr = util.indent(repr(self._df), spaces=2)
    return f"{self.__class__.__name__}:\n{df_repr}"

def to_frame(self):
def to_frame(self) -> pd.DataFrame:
    """Return the wrapped pandas DataFrame without copying."""
    return self._df

def to_pyarrow(self) -> pa.Table:
    """Convert the wrapped DataFrame to a PyArrow Table.

    pyarrow is imported locally so it remains an optional dependency
    until this conversion is actually requested.
    """
    import pyarrow as pa

    return pa.Table.from_pandas(self._df)


class PandasInMemoryTable(ops.InMemoryTable):
data = rlz.instance_of(DataFrameProxy)
Expand Down
5 changes: 2 additions & 3 deletions ibis/backends/pyspark/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,9 +468,8 @@ def create_table(

return self.raw_sql(statement.compile())

def _register_in_memory_table(self, table_op):
spark_df = self.compile(table_op.to_expr())
spark_df.createOrReplaceTempView(table_op.name)
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
    """Register an in-memory table as a Spark temporary view.

    Compiles the table operation to a Spark DataFrame, then publishes
    it under the table's name via ``createOrReplaceTempView`` (replacing
    any existing view of the same name).
    """
    self.compile(op.to_expr()).createOrReplaceTempView(op.name)

def create_view(
self,
Expand Down
12 changes: 10 additions & 2 deletions ibis/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from pathlib import Path

import pandas as pd
import pyarrow as pa

import ibis.expr.operations as ops

Expand Down Expand Up @@ -506,12 +507,19 @@ def experimental(func):


class ToFrame(abc.ABC):
"""Interface for in-memory objects that can be converted to a DataFrame."""
"""Interface for in-memory objects that can be converted to an in-memory structure.
Supports pandas DataFrames and PyArrow Tables.
"""

__slots__ = ()

@abc.abstractmethod
def to_frame(self) -> pd.DataFrame:
def to_frame(self) -> pd.DataFrame: # pragma: no cover
...

@abc.abstractmethod
def to_pyarrow(self) -> pa.Table:  # pragma: no cover
    """Convert this in-memory object to a PyArrow Table."""
    ...


Expand Down

0 comments on commit 2819cff

Please sign in to comment.