feat(api): natively support polars dataframes in ibis.memtable
jcrist committed Feb 27, 2024
1 parent 53454c1 commit 464bebc
Showing 7 changed files with 100 additions and 24 deletions.
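At a high level, this commit lets `ibis.memtable` accept `polars.DataFrame` and `polars.LazyFrame` objects natively instead of forcing a conversion through pandas. A minimal usage sketch (column names and values here are illustrative, not taken from the diff):

```python
import ibis
import polars as pl

df = pl.DataFrame({"a": ["x", "y", "z"], "b": [1, 2, 3]})

t = ibis.memtable(df)              # stored natively, no pandas round-trip
t_lazy = ibis.memtable(df.lazy())  # LazyFrames are accepted too (collected eagerly)
```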
19 changes: 3 additions & 16 deletions ibis/backends/polars/compiler.py
@@ -17,7 +17,7 @@
 import ibis.expr.operations as ops
 from ibis.backends.pandas.rewrites import PandasAsofJoin, PandasJoin, PandasRename
 from ibis.expr.operations.udf import InputType
-from ibis.formats.polars import PolarsSchema, PolarsType
+from ibis.formats.polars import PolarsType
 from ibis.util import gen_name


@@ -66,21 +66,8 @@ def dummy_table(op, **kw):


 @translate.register(ops.InMemoryTable)
-def pandas_in_memory_table(op, **_):
-    lf = pl.from_pandas(op.data.to_frame()).lazy()
-    schema = PolarsSchema.to_ibis(lf.schema)
-
-    columns = []
-    for name, current_dtype in schema.items():
-        desired_dtype = op.schema[name]
-        if current_dtype != desired_dtype:
-            typ = PolarsType.from_ibis(desired_dtype)
-            columns.append(pl.col(name).cast(typ))
-
-    if columns:
-        return lf.with_columns(columns)
-    else:
-        return lf
+def in_memory_table(op, **_):
+    return op.data.to_polars(op.schema).lazy()


 @translate.register(ops.Alias)
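The Polars backend previously rebuilt in-memory tables by going through pandas (`pl.from_pandas(op.data.to_frame())`) and then casting any columns whose dtypes had drifted. The new rule simply asks the table proxy for a polars frame and makes it lazy. A hedged sketch of what that means for users of the Polars backend (the connection and expression are illustrative):

```python
import ibis
import polars as pl

con = ibis.polars.connect()
t = ibis.memtable(pl.DataFrame({"x": [1, 2, 3]}))

# The memtable now compiles to `op.data.to_polars(op.schema).lazy()`,
# so a polars-backed memtable never detours through pandas.
con.execute(t.x.sum())  # 6
```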
20 changes: 19 additions & 1 deletion ibis/backends/tests/test_generic.py
@@ -946,7 +946,7 @@ def test_memtable_bool_column(con):
     assert Counter(con.execute(t.a)) == Counter(data)


-def test_memtable_construct(backend, con, monkeypatch):
+def test_memtable_construct_from_pyarrow(backend, con, monkeypatch):
     pa = pytest.importorskip("pyarrow")
     monkeypatch.setattr(ibis.options, "default_backend", con)

@@ -964,6 +964,24 @@ def test_memtable_construct(backend, con, monkeypatch):
     )


+@pytest.mark.parametrize("lazy", [False, True])
+def test_memtable_construct_from_polars(backend, con, lazy):
+    pl = pytest.importorskip("polars")
+    df = pl.DataFrame(
+        {
+            "a": list("abc"),
+            "b": [1, 2, 3],
+            "c": [1.0, 2.0, 3.0],
+            "d": [None, "b", None],
+        }
+    )
+    obj = df.lazy() if lazy else df
+    t = ibis.memtable(obj)
+    res = con.to_pandas(t.order_by("a")).fillna(pd.NA)
+    sol = df.to_pandas().fillna(pd.NA)
+    backend.assert_frame_equal(res, sol)
+
+
 @pytest.mark.parametrize(
     "df, columns, expected",
     [
39 changes: 33 additions & 6 deletions ibis/expr/api.py
@@ -45,6 +45,7 @@
     from pathlib import Path

     import pandas as pd
+    import polars as pl
     import pyarrow as pa

     from ibis.expr.schema import SchemaLike
@@ -359,14 +360,15 @@ def memtable(
     Parameters
     ----------
     data
-        Any data accepted by the `pandas.DataFrame` constructor or a `pyarrow.Table`.
+        A table-like object (`pandas.DataFrame`, `pyarrow.Table`, or
+        `polars.DataFrame`), or any data accepted by the `pandas.DataFrame`
+        constructor (e.g. a list of dicts).

-        Examples of acceptable objects are a `pandas.DataFrame`, a `pyarrow.Table`,
-        a list of dicts of non-ibis Python objects, etc.
-        `ibis` objects, like `MapValue`, will result in an error.
+        Note that ibis objects (e.g. `MapValue`) may not be passed in as part
+        of `data` and will result in an error.

-        Do not depend on the underlying storage type (e.g., pyarrow.Table), it's subject
-        to change across non-major releases.
+        Do not depend on the underlying storage type (e.g., pyarrow.Table),
+        it's subject to change across non-major releases.
     columns
         Optional [](`typing.Iterable`) of [](`str`) column names. If provided,
         must match the number of columns in `data`.
@@ -507,6 +509,31 @@ def _memtable_from_pyarrow_table(
     ).to_expr()


+@_memtable.register("polars.LazyFrame")
+def _memtable_from_polars_lazyframe(data: pl.LazyFrame, **kwargs):
+    return _memtable_from_polars_dataframe(data.collect(), **kwargs)
+
+
+@_memtable.register("polars.DataFrame")
+def _memtable_from_polars_dataframe(
+    data: pl.DataFrame,
+    *,
+    name: str | None = None,
+    schema: SchemaLike | None = None,
+    columns: Iterable[str] | None = None,
+):
+    from ibis.formats.polars import PolarsDataFrameProxy
+
+    if columns is not None:
+        assert schema is None, "if `columns` is not `None` then `schema` must be `None`"
+        schema = sch.Schema(dict(zip(columns, sch.infer(data).values())))
+    return ops.InMemoryTable(
+        name=name if name is not None else util.gen_name("polars_memtable"),
+        schema=sch.infer(data) if schema is None else schema,
+        data=PolarsDataFrameProxy(data),
+    ).to_expr()
+
+
 def _deferred_method_call(expr, method_name):
     method = operator.methodcaller(method_name)
     if isinstance(expr, str):
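Two details of the new registrations above are worth calling out: a `LazyFrame` is collected at construction time, and the existing `name=` and `schema=` keywords work the same as for other input types. A small sketch (the table name is made up for illustration):

```python
import ibis
import polars as pl

lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# The LazyFrame is collect()-ed eagerly, then handled like a pl.DataFrame.
t = ibis.memtable(lf, name="polars_example")

t.schema()  # inferred from the polars dtypes unless an explicit schema= is given
```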
5 changes: 5 additions & 0 deletions ibis/formats/__init__.py
@@ -7,6 +7,7 @@

 if TYPE_CHECKING:
     import pandas as pd
+    import polars as pl
     import pyarrow as pa

     from ibis.expr.datatypes import DataType
@@ -249,6 +250,10 @@ def to_frame(self) -> pd.DataFrame:  # pragma: no cover
     def to_pyarrow(self, schema: Schema) -> pa.Table:  # pragma: no cover
         """Convert this input to a PyArrow Table."""

+    @abstractmethod
+    def to_polars(self, schema: Schema) -> pl.DataFrame:  # pragma: no cover
+        """Convert this input to a Polars DataFrame."""
+
     def to_pyarrow_bytes(self, schema: Schema) -> bytes:
         import pyarrow as pa
         import pyarrow_hotfix  # noqa: F401
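`TableProxy` is the abstract wrapper that `ops.InMemoryTable` stores its data in, and each backend asks the proxy for the format it wants. With `to_polars` added alongside `to_frame` and `to_pyarrow`, any memtable can now be handed to the Polars backend. A rough sketch of the proxy interface in use (reaching into `t.op().data` is internal API and shown only for illustration):

```python
import ibis
import pandas as pd

t = ibis.memtable(pd.DataFrame({"a": [1, 2, 3]}))
proxy = t.op().data           # a TableProxy subclass holding the original data

proxy.to_frame()              # pandas.DataFrame
proxy.to_pyarrow(t.schema())  # pyarrow.Table
proxy.to_polars(t.schema())   # polars.DataFrame (the new abstract method)
```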
12 changes: 12 additions & 0 deletions ibis/formats/pandas.py
@@ -5,6 +5,7 @@
 import warnings
 from functools import partial
 from importlib.util import find_spec as _find_spec
+from typing import TYPE_CHECKING

 import numpy as np
 import pandas as pd
@@ -19,6 +20,9 @@
 from ibis.formats.numpy import NumpyType
 from ibis.formats.pyarrow import PyArrowData, PyArrowSchema, PyArrowType

+if TYPE_CHECKING:
+    import polars as pl
+
 _has_arrow_dtype = hasattr(pd, "ArrowDtype")

 if not _has_arrow_dtype:
@@ -404,3 +408,11 @@ def to_frame(self) -> pd.DataFrame:
     def to_pyarrow(self, schema: sch.Schema) -> pa.Table:
         pyarrow_schema = PyArrowSchema.from_ibis(schema)
         return pa.Table.from_pandas(self.obj, schema=pyarrow_schema)
+
+    def to_polars(self, schema: sch.Schema) -> pl.DataFrame:
+        import polars as pl
+
+        from ibis.formats.polars import PolarsSchema
+
+        pl_schema = PolarsSchema.from_ibis(schema)
+        return pl.from_pandas(self.obj, schema_overrides=pl_schema)
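For pandas-backed memtables the conversion leans on `polars.from_pandas`, with the ibis schema translated to polars dtypes and passed as `schema_overrides`, so the result matches the memtable's declared types rather than whatever polars would infer. A standalone sketch of that behavior (the dtypes are chosen for illustration):

```python
import pandas as pd
import polars as pl

pdf = pd.DataFrame({"a": [1, 2, 3]})

pl.from_pandas(pdf).schema                                   # "a" inferred as Int64
pl.from_pandas(pdf, schema_overrides={"a": pl.Int8}).schema  # "a" forced to Int8
```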
19 changes: 18 additions & 1 deletion ibis/formats/polars.py
@@ -6,11 +6,14 @@

 import ibis.expr.datatypes as dt
 from ibis.expr.schema import Schema
-from ibis.formats import DataMapper, SchemaMapper, TypeMapper
+from ibis.formats import DataMapper, SchemaMapper, TableProxy, TypeMapper

 if TYPE_CHECKING:
     from collections.abc import Sequence

+    import pandas as pd
+    import pyarrow as pa
+

 _to_polars_types = {
     dt.Boolean: pl.Boolean,
@@ -157,3 +160,17 @@ def convert_table(cls, df: pl.DataFrame, schema: Schema) -> pl.DataFrame:
         if df.schema == pl_schema:
             return df
         return df.cast(pl_schema)
+
+
+class PolarsDataFrameProxy(TableProxy[pl.DataFrame]):
+    def to_frame(self) -> pd.DataFrame:
+        return self.obj.to_pandas()
+
+    def to_pyarrow(self, schema: Schema) -> pa.Table:
+        from ibis.formats.pyarrow import PyArrowData
+
+        table = self.obj.to_arrow()
+        return PyArrowData.convert_table(table, schema)
+
+    def to_polars(self, schema: Schema) -> pl.DataFrame:
+        return self.obj
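`PolarsDataFrameProxy` is the storage class used by the new `_memtable_from_polars_dataframe` registration: `to_frame` and `to_pyarrow` convert on demand, while `to_polars` hands back the stored frame unchanged (the `schema` argument is not used on that path). A hedged sketch of the round trips (again, `t.op().data` is internal API, shown for illustration):

```python
import ibis
import polars as pl

df = pl.DataFrame({"a": [1, 2, 3]})
t = ibis.memtable(df)
proxy = t.op().data           # PolarsDataFrameProxy

proxy.to_polars(t.schema())   # the stored frame, returned as-is
proxy.to_frame()              # pandas.DataFrame via DataFrame.to_pandas()
proxy.to_pyarrow(t.schema())  # pyarrow.Table via DataFrame.to_arrow(), cast to the ibis schema
```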
10 changes: 10 additions & 0 deletions ibis/formats/pyarrow.py
@@ -12,6 +12,8 @@
 if TYPE_CHECKING:
     from collections.abc import Sequence

+    import polars as pl
+

 _from_pyarrow_types = {
     pa.int8(): dt.Int8,
@@ -268,3 +270,11 @@ def to_frame(self):

     def to_pyarrow(self, schema: Schema) -> pa.Table:
         return self.obj
+
+    def to_polars(self, schema: Schema) -> pl.DataFrame:
+        import polars as pl
+
+        from ibis.formats.polars import PolarsData
+
+        df = pl.from_arrow(self.obj)
+        return PolarsData.convert_table(df, schema)
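On the pyarrow side, `to_polars` converts with `polars.from_arrow` (typically zero-copy) and then relies on `PolarsData.convert_table` to cast any columns whose dtypes differ from the ibis schema. A rough standalone equivalent of that casting step (the column and target dtype are illustrative):

```python
import pyarrow as pa
import polars as pl

tab = pa.table({"a": [1, 2, None]})

df = pl.from_arrow(tab)   # pl.DataFrame with "a" as Int64
df.cast({"a": pl.Int32})  # what convert_table does when the ibis schema disagrees
```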
