Skip to content

Commit

Permalink
refactor(memtable): dedup table proxy code
Browse files Browse the repository at this point in the history
  • Loading branch information
cpcloud authored and kszucs committed Mar 22, 2023
1 parent 9d4fbbd commit 3bccec0
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 91 deletions.
7 changes: 1 addition & 6 deletions ibis/backends/base/sql/compiler/select_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -301,12 +301,7 @@ def _collect_Selection(self, op, toplevel=False):
self.table_set = table
self.filters = filters

def _collect_PandasInMemoryTable(self, node, toplevel=False):
if toplevel:
self.select_set = [node]
self.table_set = node

def _collect_PyArrowInMemoryTable(self, node, toplevel=False):
def _collect_InMemoryTable(self, node, toplevel=False):
if toplevel:
self.select_set = [node]
self.table_set = node
Expand Down
26 changes: 5 additions & 21 deletions ibis/backends/pandas/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@

import ibis.expr.datatypes as dt
import ibis.expr.operations as ops
import ibis.expr.rules as rlz
import ibis.expr.schema as sch
from ibis import util
from ibis.backends.base import Database
from ibis.common.grounds import Immutable
from ibis.expr.operations.relations import TableProxy

_ibis_dtypes = toolz.valmap(
np.dtype,
Expand Down Expand Up @@ -211,33 +210,18 @@ def try_json(x):
return pd.Series(list(map(try_json, col)), dtype="object")


class DataFrameProxy(Immutable, util.ToFrame):
__slots__ = ('_df', '_hash')

def __init__(self, df: pd.DataFrame) -> None:
object.__setattr__(self, "_df", df)
object.__setattr__(self, "_hash", hash((type(df), id(df))))

def __hash__(self) -> int:
return self._hash

def __repr__(self) -> str:
df_repr = util.indent(repr(self._df), spaces=2)
return f"{self.__class__.__name__}:\n{df_repr}"
class DataFrameProxy(TableProxy):
__slots__ = ()

def to_frame(self) -> pd.DataFrame:
return self._df
return self._data

def to_pyarrow(self, schema: sch.Schema) -> pa.Table:
import pyarrow as pa

from ibis.backends.pyarrow.datatypes import ibis_to_pyarrow_schema

return pa.Table.from_pandas(self._df, schema=ibis_to_pyarrow_schema(schema))


class PandasInMemoryTable(ops.InMemoryTable):
data = rlz.instance_of(DataFrameProxy)
return pa.Table.from_pandas(self._data, schema=ibis_to_pyarrow_schema(schema))


class PandasTable(ops.DatabaseTable):
Expand Down
28 changes: 5 additions & 23 deletions ibis/backends/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,39 +4,21 @@

import pyarrow as pa

import ibis.expr.operations as ops
import ibis.expr.rules as rlz
import ibis.expr.schema as sch
from ibis import util
from ibis.common.grounds import Immutable
from ibis.expr.operations.relations import TableProxy

if TYPE_CHECKING:
import pandas as pd


class PyArrowTableProxy(Immutable, util.ToFrame):
__slots__ = ('_t', '_hash')

def __init__(self, t: pa.Table) -> None:
object.__setattr__(self, "_t", t)
object.__setattr__(self, "_hash", hash((type(t), id(t))))

def __hash__(self) -> int:
return self._hash

def __repr__(self) -> str:
df_repr = util.indent(repr(self._t), spaces=2)
return f"{self.__class__.__name__}:\n{df_repr}"
class PyArrowTableProxy(TableProxy):
__slots__ = ()

def to_frame(self) -> pd.DataFrame:
return self._t.to_pandas()
return self._data.to_pandas()

def to_pyarrow(self, _: sch.Schema) -> pa.Table:
return self._t


class PyArrowInMemoryTable(ops.InMemoryTable):
data = rlz.instance_of(PyArrowTableProxy)
return self._data


@sch.infer.register(pa.Table)
Expand Down
5 changes: 1 addition & 4 deletions ibis/backends/pyspark/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,7 @@
import ibis.expr.types as ir
from ibis import interval
from ibis.backends.base.df.timecontext import adjust_context
from ibis.backends.pandas.client import PandasInMemoryTable
from ibis.backends.pandas.execution import execute
from ibis.backends.pyarrow import PyArrowInMemoryTable
from ibis.backends.pyspark.datatypes import spark_dtype
from ibis.backends.pyspark.timecontext import (
combine_time_context,
Expand Down Expand Up @@ -1863,8 +1861,7 @@ def compile_random(*args, **kwargs):
return F.rand()


@compiles(PandasInMemoryTable)
@compiles(PyArrowInMemoryTable)
@compiles(ops.InMemoryTable)
def compile_in_memory_table(t, op, session, **kwargs):
fields = [
pt.StructField(name, spark_dtype(dtype), dtype.nullable)
Expand Down
4 changes: 2 additions & 2 deletions ibis/backends/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ def test_array_slice(con, start, stop):
["dask", "pandas"],
raises=com.UnboundExpressionError,
reason=(
"Node of type 'PandasInMemoryTable' has no data bound to it. "
"Node of type 'InMemoryTable' has no data bound to it. "
"You probably tried to execute an expression without a data source."
),
)
Expand Down Expand Up @@ -512,7 +512,7 @@ def test_array_map(backend, con):
["dask", "pandas"],
raises=com.UnboundExpressionError,
reason=(
"Node of type 'PandasInMemoryTable' has no data bound to it. "
"Node of type 'InMemoryTable' has no data bound to it. "
"You probably tried to execute an expression without a data source."
),
)
Expand Down
16 changes: 8 additions & 8 deletions ibis/expr/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def memtable(
>>> import ibis
>>> t = ibis.memtable([{"a": 1}, {"a": 2}])
>>> t
PandasInMemoryTable
InMemoryTable
data:
DataFrameProxy:
a
Expand All @@ -356,7 +356,7 @@ def memtable(
>>> t = ibis.memtable([{"a": 1, "b": "foo"}, {"a": 2, "b": "baz"}])
>>> t
PandasInMemoryTable
InMemoryTable
data:
DataFrameProxy:
a b
Expand All @@ -368,7 +368,7 @@ def memtable(
>>> t = ibis.memtable([(1, "foo"), (2, "baz")], columns=["a", "b"])
>>> t
PandasInMemoryTable
InMemoryTable
data:
DataFrameProxy:
a b
Expand All @@ -380,7 +380,7 @@ def memtable(
>>> t = ibis.memtable([(1, "foo"), (2, "baz")])
>>> t
PandasInMemoryTable
InMemoryTable
data:
DataFrameProxy:
col0 col1
Expand Down Expand Up @@ -421,9 +421,9 @@ def _memtable_from_dataframe(
name: str | None = None,
schema: SupportsSchema | None = None,
) -> Table:
from ibis.backends.pandas.client import DataFrameProxy, PandasInMemoryTable
from ibis.backends.pandas.client import DataFrameProxy

op = PandasInMemoryTable(
op = ops.InMemoryTable(
name=name if name is not None else util.generate_unique_table_name("memtable"),
schema=sch.infer(df) if schema is None else schema,
data=DataFrameProxy(df),
Expand All @@ -434,9 +434,9 @@ def _memtable_from_dataframe(
def _memtable_from_pyarrow_table(
data: pa.Table, *, name: str | None = None, schema: SupportsSchema | None = None
):
from ibis.backends.pyarrow import PyArrowInMemoryTable, PyArrowTableProxy
from ibis.backends.pyarrow import PyArrowTableProxy

return PyArrowInMemoryTable(
return ops.InMemoryTable(
name=name if name is not None else util.generate_unique_table_name("memtable"),
schema=sch.infer(data) if schema is None else schema,
data=PyArrowTableProxy(data),
Expand Down
36 changes: 31 additions & 5 deletions ibis/expr/operations/relations.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from __future__ import annotations

import abc
import collections
import itertools
from abc import abstractmethod
from typing import TYPE_CHECKING

from public import public

Expand All @@ -14,11 +16,16 @@
from ibis import util
from ibis.common.annotations import attribute
from ibis.common.collections import frozendict
from ibis.common.grounds import Immutable
from ibis.expr.deferred import Deferred
from ibis.expr.operations.core import Named, Node, Value
from ibis.expr.operations.generic import TableColumn
from ibis.expr.operations.logical import Equals, ExistsSubquery, NotExistsSubquery

if TYPE_CHECKING:
import pandas as pd
import pyarrow as pa

_table_names = (f'unbound_table_{i:d}' for i in itertools.count())


Expand Down Expand Up @@ -70,15 +77,34 @@ class SQLQueryResult(TableNode):
source = rlz.client


class TableProxy(Immutable):
__slots__ = ('_data', '_hash')

def __init__(self, data) -> None:
object.__setattr__(self, "_data", data)
object.__setattr__(self, "_hash", hash((type(data), id(data))))

def __hash__(self) -> int:
return self._hash

def __repr__(self) -> str:
data_repr = util.indent(repr(self._data), spaces=2)
return f"{self.__class__.__name__}:\n{data_repr}"

@abc.abstractmethod
def to_frame(self) -> pd.DataFrame: # pragma: no cover
"""Convert this input to a pandas DataFrame."""

@abc.abstractmethod
def to_pyarrow(self, schema: sch.Schema) -> pa.Table: # pragma: no cover
"""Convert this input to a PyArrow Table."""


@public
class InMemoryTable(PhysicalTable):
name = rlz.instance_of(str)
schema = rlz.instance_of(sch.Schema)

@property
@abstractmethod
def data(self) -> util.ToFrame:
"""Return the data of an in-memory table."""
data = rlz.instance_of(TableProxy)


# TODO(kszucs): desperately need to clean this up, the majority of this
Expand Down
22 changes: 0 additions & 22 deletions ibis/util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""Ibis utility functions."""
from __future__ import annotations

import abc
import collections
import functools
import importlib.metadata
Expand Down Expand Up @@ -32,11 +31,7 @@
if TYPE_CHECKING:
from pathlib import Path

import pandas as pd
import pyarrow as pa

import ibis.expr.operations as ops
import ibis.expr.schema as sch

Graph = Mapping[ops.Node, Sequence[ops.Node]]

Expand Down Expand Up @@ -484,23 +479,6 @@ def experimental(func):
return func


class ToFrame(abc.ABC):
"""Interface for in-memory objects that can be converted to an in-memory structure.
Supports pandas DataFrames and PyArrow Tables.
"""

__slots__ = ()

@abc.abstractmethod
def to_frame(self) -> pd.DataFrame: # pragma: no cover
"""Convert this input to a pandas DataFrame."""

@abc.abstractmethod
def to_pyarrow(self, schema: sch.Schema) -> pa.Table: # pragma: no cover
"""Convert this input to a PyArrow Table."""


def backend_entry_points() -> list[importlib.metadata.EntryPoint]:
"""Get the list of installed `ibis.backend` entrypoints."""

Expand Down

0 comments on commit 3bccec0

Please sign in to comment.