diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index 3dfac85712bc..7887d3fbf6af 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -17,7 +17,7 @@ import ibis.expr.operations as ops from ibis.backends.pandas.rewrites import PandasAsofJoin, PandasJoin, PandasRename from ibis.expr.operations.udf import InputType -from ibis.formats.polars import PolarsSchema, PolarsType +from ibis.formats.polars import PolarsType from ibis.util import gen_name @@ -66,21 +66,8 @@ def dummy_table(op, **kw): @translate.register(ops.InMemoryTable) -def pandas_in_memory_table(op, **_): - lf = pl.from_pandas(op.data.to_frame()).lazy() - schema = PolarsSchema.to_ibis(lf.schema) - - columns = [] - for name, current_dtype in schema.items(): - desired_dtype = op.schema[name] - if current_dtype != desired_dtype: - typ = PolarsType.from_ibis(desired_dtype) - columns.append(pl.col(name).cast(typ)) - - if columns: - return lf.with_columns(columns) - else: - return lf +def in_memory_table(op, **_): + return op.data.to_polars(op.schema).lazy() @translate.register(ops.Alias) diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index b7139813a31c..f53994cc4a69 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -946,7 +946,7 @@ def test_memtable_bool_column(con): assert Counter(con.execute(t.a)) == Counter(data) -def test_memtable_construct(backend, con, monkeypatch): +def test_memtable_construct_from_pyarrow(backend, con, monkeypatch): pa = pytest.importorskip("pyarrow") monkeypatch.setattr(ibis.options, "default_backend", con) @@ -964,6 +964,24 @@ def test_memtable_construct(backend, con, monkeypatch): ) +@pytest.mark.parametrize("lazy", [False, True]) +def test_memtable_construct_from_polars(backend, con, lazy): + pl = pytest.importorskip("polars") + df = pl.DataFrame( + { + "a": list("abc"), + "b": [1, 2, 3], + "c": [1.0, 2.0, 3.0], + "d": [None, "b", None], + } + ) + obj = df.lazy() if lazy else df + t = ibis.memtable(obj) + res = con.to_pandas(t.order_by("a")).fillna(pd.NA) + sol = df.to_pandas().fillna(pd.NA) + backend.assert_frame_equal(res, sol) + + @pytest.mark.parametrize( "df, columns, expected", [ diff --git a/ibis/expr/api.py b/ibis/expr/api.py index a9cf4445360b..a83879286753 100644 --- a/ibis/expr/api.py +++ b/ibis/expr/api.py @@ -45,6 +45,7 @@ from pathlib import Path import pandas as pd + import polars as pl import pyarrow as pa from ibis.expr.schema import SchemaLike @@ -359,14 +360,15 @@ def memtable( Parameters ---------- data - Any data accepted by the `pandas.DataFrame` constructor or a `pyarrow.Table`. + A table-like object (`pandas.DataFrame`, `pyarrow.Table`, or + `polars.DataFrame`), or any data accepted by the `pandas.DataFrame` + constructor (e.g. a list of dicts). - Examples of acceptable objects are a `pandas.DataFrame`, a `pyarrow.Table`, - a list of dicts of non-ibis Python objects, etc. - `ibis` objects, like `MapValue`, will result in an error. + Note that ibis objects (e.g. `MapValue`) may not be passed in as part + of `data` and will result in an error. - Do not depend on the underlying storage type (e.g., pyarrow.Table), it's subject - to change across non-major releases. + Do not depend on the underlying storage type (e.g., pyarrow.Table), + it's subject to change across non-major releases. columns Optional [](`typing.Iterable`) of [](`str`) column names. If provided, must match the number of columns in `data`. @@ -507,6 +509,31 @@ def _memtable_from_pyarrow_table( ).to_expr() +@_memtable.register("polars.LazyFrame") +def _memtable_from_polars_lazyframe(data: pl.LazyFrame, **kwargs): + return _memtable_from_polars_dataframe(data.collect(), **kwargs) + + +@_memtable.register("polars.DataFrame") +def _memtable_from_polars_dataframe( + data: pl.DataFrame, + *, + name: str | None = None, + schema: SchemaLike | None = None, + columns: Iterable[str] | None = None, +): + from ibis.formats.polars import PolarsDataFrameProxy + + if columns is not None: + assert schema is None, "if `columns` is not `None` then `schema` must be `None`" + schema = sch.Schema(dict(zip(columns, sch.infer(data).values()))) + return ops.InMemoryTable( + name=name if name is not None else util.gen_name("polars_memtable"), + schema=sch.infer(data) if schema is None else schema, + data=PolarsDataFrameProxy(data), + ).to_expr() + + def _deferred_method_call(expr, method_name): method = operator.methodcaller(method_name) if isinstance(expr, str): diff --git a/ibis/formats/__init__.py b/ibis/formats/__init__.py index b089ffb6e547..d8c3b24669c7 100644 --- a/ibis/formats/__init__.py +++ b/ibis/formats/__init__.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: import pandas as pd + import polars as pl import pyarrow as pa from ibis.expr.datatypes import DataType @@ -249,6 +250,10 @@ def to_frame(self) -> pd.DataFrame: # pragma: no cover def to_pyarrow(self, schema: Schema) -> pa.Table: # pragma: no cover """Convert this input to a PyArrow Table.""" + @abstractmethod + def to_polars(self, schema: Schema) -> pl.DataFrame: # pragma: no cover + """Convert this input to a Polars DataFrame.""" + def to_pyarrow_bytes(self, schema: Schema) -> bytes: import pyarrow as pa import pyarrow_hotfix # noqa: F401 diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py index 9c4269efe23a..55f76598092b 100644 --- a/ibis/formats/pandas.py +++ b/ibis/formats/pandas.py @@ -5,6 +5,7 @@ import warnings from functools import partial from importlib.util import find_spec as _find_spec +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -19,6 +20,9 @@ from ibis.formats.numpy import NumpyType from ibis.formats.pyarrow import PyArrowData, PyArrowSchema, PyArrowType +if TYPE_CHECKING: + import polars as pl + _has_arrow_dtype = hasattr(pd, "ArrowDtype") if not _has_arrow_dtype: @@ -404,3 +408,11 @@ def to_frame(self) -> pd.DataFrame: def to_pyarrow(self, schema: sch.Schema) -> pa.Table: pyarrow_schema = PyArrowSchema.from_ibis(schema) return pa.Table.from_pandas(self.obj, schema=pyarrow_schema) + + def to_polars(self, schema: sch.Schema) -> pl.DataFrame: + import polars as pl + + from ibis.formats.polars import PolarsSchema + + pl_schema = PolarsSchema.from_ibis(schema) + return pl.from_pandas(self.obj, schema_overrides=pl_schema) diff --git a/ibis/formats/polars.py b/ibis/formats/polars.py index fb6fd3fff6ee..8af1c595d217 100644 --- a/ibis/formats/polars.py +++ b/ibis/formats/polars.py @@ -6,11 +6,14 @@ import ibis.expr.datatypes as dt from ibis.expr.schema import Schema -from ibis.formats import DataMapper, SchemaMapper, TypeMapper +from ibis.formats import DataMapper, SchemaMapper, TableProxy, TypeMapper if TYPE_CHECKING: from collections.abc import Sequence + import pandas as pd + import pyarrow as pa + _to_polars_types = { dt.Boolean: pl.Boolean, @@ -157,3 +160,17 @@ def convert_table(cls, df: pl.DataFrame, schema: Schema) -> pl.DataFrame: if df.schema == pl_schema: return df return df.cast(pl_schema) + + +class PolarsDataFrameProxy(TableProxy[pl.DataFrame]): + def to_frame(self) -> pd.DataFrame: + return self.obj.to_pandas() + + def to_pyarrow(self, schema: Schema) -> pa.Table: + from ibis.formats.pyarrow import PyArrowData + + table = self.obj.to_arrow() + return PyArrowData.convert_table(table, schema) + + def to_polars(self, schema: Schema) -> pl.DataFrame: + return self.obj diff --git a/ibis/formats/pyarrow.py b/ibis/formats/pyarrow.py index 3617c093cb4d..4d1ffebc4b24 100644 --- a/ibis/formats/pyarrow.py +++ b/ibis/formats/pyarrow.py @@ -12,6 +12,8 @@ if TYPE_CHECKING: from collections.abc import Sequence + import polars as pl + _from_pyarrow_types = { pa.int8(): dt.Int8, @@ -268,3 +270,11 @@ def to_frame(self): def to_pyarrow(self, schema: Schema) -> pa.Table: return self.obj + + def to_polars(self, schema: Schema) -> pl.DataFrame: + import polars as pl + + from ibis.formats.polars import PolarsData + + df = pl.from_arrow(self.obj) + return PolarsData.convert_table(df, schema)