feat(api): upcast pandas DataFrames to memtables in rlz.table rule
cpcloud authored and kszucs committed Mar 13, 2023
1 parent c4254f6 commit 8dcfb8d
Showing 10 changed files with 75 additions and 9 deletions.
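In effect, any argument validated by the `rlz.table` rule (the right-hand side of a join, for instance) now accepts a plain `pandas.DataFrame`, which is upcast to an `ibis.memtable` on the fly instead of being rejected. A minimal sketch of the behavior exercised by the new `test_join_with_pandas` test below; the sample data and the use of the default in-memory backend are illustrative assumptions:

```python
import pandas as pd

import ibis

left = ibis.memtable({"yearID": [1871, 1871, 1872], "stint": [1, 2, 1]})

# A raw DataFrame on the right-hand side: before this commit the table rule
# raised IbisTypeError ("Argument is not a table"); now it is wrapped in a
# memtable automatically.
right = pd.DataFrame({"yearID": [1871, 1872], "lgID": ["NA", "AA"]})

expr = left.join(right, "yearID")
print(expr.execute())  # runs on the default in-memory backend
```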
9 changes: 8 additions & 1 deletion ibis/backends/duckdb/__init__.py
@@ -687,8 +687,15 @@ def _metadata(self, query: str) -> Iterator[tuple[str, dt.DataType]]:
def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
# in theory we could use pandas dataframes, but when using dataframes
# with pyarrow datatypes later reads of this data segfault
schema = op.schema
if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]:
raise exc.IbisTypeError(
"DuckDB cannot yet reliably handle `null` typed columns; "
f"got null typed columns: {null_columns}"
)

name = op.name
table = op.data.to_pyarrow()
table = op.data.to_pyarrow(schema)
with self.begin() as con:
con.connection.register(name, table)

16 changes: 16 additions & 0 deletions ibis/backends/duckdb/tests/test_datatypes.py
@@ -3,6 +3,7 @@
from packaging.version import parse as vparse
from pytest import param

import ibis.common.exceptions as exc
import ibis.expr.datatypes as dt
from ibis.backends.duckdb.datatypes import parse

@@ -95,3 +96,18 @@ def test_cast_uints(uint_type, snapshot):
snapshot.assert_match(
str(ibis.to_sql(t.a.cast(uint_type), dialect="duckdb")), "out.sql"
)


def test_null_dtype():
import ibis

con = ibis.connect("duckdb://:memory:")

t = ibis.memtable({"a": [None, None]})
assert t.schema() == ibis.schema(dict(a="null"))

with pytest.raises(
exc.IbisTypeError,
match="DuckDB cannot yet reliably handle `null` typed columns",
):
con.execute(t)
6 changes: 4 additions & 2 deletions ibis/backends/pandas/client.py
@@ -228,10 +228,12 @@ def __repr__(self) -> str:
def to_frame(self) -> pd.DataFrame:
return self._df

def to_pyarrow(self) -> pa.Table:
def to_pyarrow(self, schema: sch.Schema) -> pa.Table:
import pyarrow as pa

return pa.Table.from_pandas(self._df)
from ibis.backends.pyarrow.datatypes import ibis_to_pyarrow_schema

return pa.Table.from_pandas(self._df, schema=ibis_to_pyarrow_schema(schema))


class PandasInMemoryTable(ops.InMemoryTable):
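Passing the operation's ibis schema through to `pyarrow.Table.from_pandas` (via `ibis_to_pyarrow_schema`) pins the Arrow column types to what ibis inferred, including the newly supported `null` type, rather than letting pyarrow re-infer them from the data. A small standalone sketch of that conversion; the column names and values are illustrative:

```python
import pandas as pd
import pyarrow as pa

import ibis
from ibis.backends.pyarrow.datatypes import ibis_to_pyarrow_schema

df = pd.DataFrame({"a": [None, None], "b": [1.0, 2.0]})

# ibis infers the schema; with this commit the all-None column comes out
# as the `null` type.
t = ibis.memtable(df)
print(t.schema())  # expect a -> null, b -> float64

# The same conversion to_pyarrow now performs: build the Arrow schema from
# the ibis schema and hand it to from_pandas instead of re-inferring types.
arrow_schema = ibis_to_pyarrow_schema(t.schema())
table = pa.Table.from_pandas(df, schema=arrow_schema)
print(table.schema)
```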
2 changes: 1 addition & 1 deletion ibis/backends/pandas/tests/test_datatypes.py
@@ -208,7 +208,7 @@ def test_pandas_dtype(pandas_dtype, ibis_dtype):
# mixed
(pd.Series([b'1', '2', 3.0]), dt.binary),
# empty
(pd.Series([], dtype='object'), dt.binary),
(pd.Series([], dtype='object'), dt.null),
(pd.Series([], dtype="string"), dt.string),
],
)
2 changes: 2 additions & 0 deletions ibis/backends/pyarrow/datatypes.py
@@ -26,6 +26,8 @@
dt.Boolean: pa.bool_(),
dt.Timestamp: pa.timestamp('ns'),
dt.Date: pa.date64(),
dt.JSON: pa.string(),
dt.Null: pa.null(),
}


2 changes: 1 addition & 1 deletion ibis/backends/tests/test_generic.py
@@ -44,7 +44,7 @@


@pytest.mark.broken(["duckdb", "impala", "bigquery"], 'assert nan is None')
@pytest.mark.notimpl(["datafusion"], raises=NotImplementedError)
@pytest.mark.notimpl(["datafusion"])
def test_null_literal(con, backend):
expr = ibis.null()
result = con.execute(expr)
32 changes: 32 additions & 0 deletions ibis/backends/tests/test_join.py
@@ -6,6 +6,9 @@
from packaging.version import parse as vparse
from pytest import param

import ibis.common.exceptions as exc
import ibis.expr.schema as sch


def _pandas_semi_join(left, right, on, **_):
assert len(on) == 1, str(on)
@@ -194,3 +197,32 @@ def test_semi_join_topk(batting, awards_players):
left = batting.semi_join(batting.year.topk(5), "year").select("year", "RBI")
expr = left.join(awards_players, left.year == awards_players.yearID)
assert not expr.limit(5).execute().empty


@pytest.mark.notimpl(["dask", "datafusion", "druid", "pandas"])
@pytest.mark.broken(
["duckdb"],
raises=exc.IbisTypeError,
reason="DuckDB as of 0.7.1 occasionally segfaults when there are `null`-typed columns present",
)
def test_join_with_pandas(batting, awards_players):
batting_filt = batting[lambda t: t.yearID < 1900]
awards_players_filt = awards_players[lambda t: t.yearID < 1900].execute()
assert isinstance(awards_players_filt, pd.DataFrame)
expr = batting_filt.join(awards_players_filt, "yearID")
df = expr.execute()
assert df.yearID.nunique() == 7


@pytest.mark.notimpl(["dask", "datafusion", "pandas"])
def test_join_with_pandas_non_null_typed_columns(batting, awards_players):
batting_filt = batting[lambda t: t.yearID < 1900][["yearID"]]
awards_players_filt = awards_players[lambda t: t.yearID < 1900][
["yearID"]
].execute()
# ensure that none of the columns have type null
assert sch.infer(awards_players_filt) == sch.Schema(dict(yearID="int"))
assert isinstance(awards_players_filt, pd.DataFrame)
expr = batting_filt.join(awards_players_filt, "yearID")
df = expr.execute()
assert df.yearID.nunique() == 7
2 changes: 1 addition & 1 deletion ibis/expr/datatypes/value.py
@@ -195,7 +195,7 @@ def _infer_object_array_dtype(x):
'timedelta': dt.interval,
'time': dt.time,
'period': dt.binary,
'empty': dt.binary,
'empty': dt.null,
'unicode': dt.string,
}[classifier]

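The mapping change above is what makes empty (or all-NA) object columns infer as `null` rather than `binary`. A quick check using the same schema inference that `test_join_with_pandas_non_null_typed_columns` relies on; the column names are illustrative:

```python
import pandas as pd

import ibis.expr.schema as sch

df = pd.DataFrame(
    {
        "a": pd.Series([], dtype="object"),  # classifier "empty": now null, was binary
        "b": pd.Series([], dtype="string"),  # unchanged: string
    }
)
print(sch.infer(df))  # expect a -> null, b -> string
```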
6 changes: 6 additions & 0 deletions ibis/expr/rules.py
@@ -466,8 +466,14 @@ def table(arg, schema=None, **kwargs):
it must be of the specified type. The table may have extra columns not
specified in the schema.
"""
import pandas as pd

import ibis
import ibis.expr.operations as ops

if isinstance(arg, pd.DataFrame):
arg = ibis.memtable(arg).op()

if not isinstance(arg, ops.TableNode):
raise com.IbisTypeError(
f'Argument is not a table; got type {type(arg).__name__}'
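For a DataFrame argument, the rule now performs the wrapping shown below before the usual `TableNode` check, so downstream validation sees an ordinary in-memory table op. A tiny sketch of that step in isolation; the sample data is illustrative:

```python
import pandas as pd

import ibis
import ibis.expr.operations as ops

df = pd.DataFrame({"yearID": [1871, 1872]})

# Equivalent of the new branch in rlz.table: upcast the DataFrame to a
# memtable and work with its operation node.
node = ibis.memtable(df).op()
assert isinstance(node, ops.TableNode)
```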
7 changes: 4 additions & 3 deletions ibis/util.py
@@ -36,6 +36,7 @@
import pyarrow as pa

import ibis.expr.operations as ops
import ibis.expr.schema as sch

Graph = Mapping[ops.Node, Sequence[ops.Node]]

@@ -493,11 +494,11 @@ class ToFrame(abc.ABC):

@abc.abstractmethod
def to_frame(self) -> pd.DataFrame: # pragma: no cover
...
"""Convert this input to a pandas DataFrame."""

@abc.abstractmethod
def to_pyarrow(self) -> pa.Table: # pragma: no cover
...
def to_pyarrow(self, schema: sch.Schema) -> pa.Table: # pragma: no cover
"""Convert this input to a PyArrow Table."""


def backend_entry_points() -> list[importlib.metadata.EntryPoint]:
