Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

refactor(formats): remove unnecessary schema argument from schema inference #8814

Merged
merged 2 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions ibis/backends/dask/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@

# import the pandas execution module to register dispatched implementations of
# execute_node that the dask backend will later override
import ibis.expr.operations as ops
import ibis.expr.schema as sch
import ibis.expr.types as ir
from ibis import util
from ibis.backends import NoUrl
Expand Down Expand Up @@ -167,11 +165,14 @@ def read_parquet(
self.dictionary[table_name] = df
return self.table(table_name)

def table(self, name: str, schema: sch.Schema | None = None):
df = self.dictionary[name]
schema = schema or self.schemas.get(name, None)
schema = PandasData.infer_table(df.head(1), schema=schema)
return ops.DatabaseTable(name, schema, self).to_expr()
def get_schema(self, table_name, *, database=None):
try:
schema = self.schemas[table_name]
except KeyError:
df = self.dictionary[table_name]
self.schemas[table_name] = schema = PandasData.infer_table(df.head(1))

return schema

def _convert_object(self, obj) -> dd.DataFrame:
if isinstance(obj, dd.DataFrame):
Expand Down
12 changes: 5 additions & 7 deletions ibis/backends/pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,18 +165,16 @@ def list_tables(self, like=None, database=None):
return self._filter_with_like(list(self.dictionary.keys()), like)

def table(self, name: str, schema: sch.Schema | None = None):
df = self.dictionary[name]
schema = schema or self.schemas.get(name, None)
schema = PandasData.infer_table(df, schema=schema)
return ops.DatabaseTable(name, schema, self).to_expr()
inferred_schema = self.get_schema(name)
overridden_schema = {**inferred_schema, **(schema or {})}
return ops.DatabaseTable(name, overridden_schema, self).to_expr()

def get_schema(self, table_name, *, database=None):
schemas = self.schemas
try:
schema = schemas[table_name]
schema = self.schemas[table_name]
except KeyError:
df = self.dictionary[table_name]
schemas[table_name] = schema = PandasData.infer_table(df)
self.schemas[table_name] = schema = PandasData.infer_table(df)

return schema

Expand Down
9 changes: 0 additions & 9 deletions ibis/expr/datatypes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,11 +247,6 @@ def from_polars(cls, polars_type, nullable=True) -> Self:

return PolarsType.to_ibis(polars_type, nullable=nullable)

@classmethod
def from_dask(cls, dask_type, nullable=True) -> Self:
"""Return the equivalent ibis datatype."""
return cls.from_pandas(dask_type, nullable=nullable)

def to_numpy(self):
"""Return the equivalent numpy datatype."""
from ibis.formats.numpy import NumpyType
Expand All @@ -276,10 +271,6 @@ def to_polars(self):

return PolarsType.from_ibis(self)

def to_dask(self):
"""Return the equivalent dask datatype."""
return self.to_pandas()

def is_array(self) -> bool:
"""Return True if an instance of an Array type."""
return isinstance(self, Array)
Expand Down
26 changes: 7 additions & 19 deletions ibis/expr/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,6 @@ def from_polars(cls, polars_schema):

return PolarsSchema.to_ibis(polars_schema)

@classmethod
def from_dask(cls, dask_schema):
"""Return the equivalent ibis schema."""
return cls.from_pandas(dask_schema)

def to_numpy(self):
"""Return the equivalent numpy dtypes."""
from ibis.formats.numpy import NumpySchema
Expand All @@ -191,10 +186,6 @@ def to_polars(self):

return PolarsSchema.from_ibis(self)

def to_dask(self):
"""Return the equivalent dask dtypes."""
return self.to_pandas()

def as_struct(self) -> dt.Struct:
return dt.Struct(self)

Expand Down Expand Up @@ -238,7 +229,7 @@ def schema(value: Any) -> Schema:


@lazy_singledispatch
def infer(value: Any, schema=None) -> Schema:
def infer(value: Any) -> Schema:
"""Infer the corresponding ibis schema for a python object."""
raise InputTypeError(value)

Expand Down Expand Up @@ -278,28 +269,25 @@ def from_pyarrow_schema(schema):


@infer.register("pandas.DataFrame")
def infer_pandas_dataframe(df, schema=None):
def infer_pandas_dataframe(df):
from ibis.formats.pandas import PandasData

return PandasData.infer_table(df, schema)
return PandasData.infer_table(df)


# TODO(kszucs): do we really need the schema kwarg?
@infer.register("pyarrow.Table")
def infer_pyarrow_table(table, schema=None):
def infer_pyarrow_table(table):
from ibis.formats.pyarrow import PyArrowSchema

schema = schema if schema is not None else table.schema
return PyArrowSchema.to_ibis(schema)
return PyArrowSchema.to_ibis(table.schema)


@infer.register("polars.DataFrame")
@infer.register("polars.LazyFrame")
def infer_polars_dataframe(df, schema=None):
def infer_polars_dataframe(df):
from ibis.formats.polars import PolarsSchema

schema = schema if schema is not None else df.schema
return PolarsSchema.to_ibis(schema)
return PolarsSchema.to_ibis(df.schema)


# lock the dispatchers to avoid adding new implementations
Expand Down
11 changes: 0 additions & 11 deletions ibis/expr/tests/test_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,6 @@

has_pandas = True

has_dask = False
with contextlib.suppress(ImportError):
import dask.dataframe as dd # noqa: F401

has_dask = True


def test_whole_schema():
schema = {
Expand Down Expand Up @@ -437,11 +431,6 @@ def test_schema_from_to_numpy_dtypes():
@pytest.mark.parametrize(
("from_method", "to_method"),
[
pytest.param(
"from_dask",
"to_dask",
marks=pytest.mark.skipif(not has_dask, reason="dask not installed"),
),
pytest.param(
"from_pandas",
"to_pandas",
Expand Down
4 changes: 1 addition & 3 deletions ibis/formats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,15 +168,13 @@ def convert_column(cls, obj: C, dtype: DataType) -> C:
raise NotImplementedError

@classmethod
def convert_table(cls, obj: T, schema: Schema) -> T:
def convert_table(cls, obj: T) -> T:
gforsyth marked this conversation as resolved.
Show resolved Hide resolved
"""Convert a format-specific table to the given ibis schema.

Parameters
----------
obj
The format-specific table-like object to convert.
schema
The Ibis schema to convert to.

Returns
-------
Expand Down
17 changes: 6 additions & 11 deletions ibis/formats/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,25 +94,20 @@ def infer_column(cls, s):
return PyArrowData.infer_column(s)

@classmethod
def infer_table(cls, df, schema=None):
schema = schema if schema is not None else {}

def infer_table(cls, df):
pairs = []
for column_name in df.dtypes.keys():
if not isinstance(column_name, str):
raise TypeError(
"Column names must be strings to use the pandas backend"
)

if column_name in schema:
ibis_dtype = schema[column_name]
pandas_column = df[column_name]
pandas_dtype = pandas_column.dtype
if pandas_dtype == np.object_:
ibis_dtype = cls.infer_column(pandas_column)
else:
pandas_column = df[column_name]
pandas_dtype = pandas_column.dtype
if pandas_dtype == np.object_:
ibis_dtype = cls.infer_column(pandas_column)
else:
ibis_dtype = PandasType.to_ibis(pandas_dtype)
ibis_dtype = PandasType.to_ibis(pandas_dtype)

pairs.append((column_name, ibis_dtype))

Expand Down
Loading