Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT-#4359: Add __dataframe__ method to the protocol dataframe #4360

Merged
merged 2 commits into from
Apr 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/release_notes/release_notes-0.15.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Key Features and Updates
* XGBoost enhancements
*
* Developer API enhancements
*
* FEAT-#4359: Add __dataframe__ method to the protocol dataframe (#4360)
* Update testing suite
* TEST-#4363: Use Ray from pypi in CI (#4364)
* Documentation improvements
Expand All @@ -28,3 +28,4 @@ Key Features and Updates

Contributors
------------
@YarShev
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,33 @@ class ProtocolDataframe(ABC):
to the dataframe interchange protocol specification.
"""

@abstractmethod
def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> "ProtocolDataframe":
    """
    Get a new dataframe exchange object.

    See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.

    Parameters
    ----------
    nan_as_null : bool, default: False
        A keyword intended for the consumer to tell the producer
        to overwrite null values in the data with ``NaN`` (or ``NaT``).
        This currently has no effect; once support for nullable extension
        dtypes is added, this value should be propagated to columns.
    allow_copy : bool, default: True
        A keyword that defines whether or not the library is allowed
        to make a copy of the data. For example, copying data would be necessary
        if a library supports strided buffers, given that this protocol
        specifies contiguous buffers. Currently, if the flag is set to ``False``
        and a copy is needed, a ``RuntimeError`` will be raised.

    Returns
    -------
    ProtocolDataframe
        A new dataframe exchange object.
    """
    # Abstract: concrete protocol dataframes supply the actual implementation.
    pass
YarShev marked this conversation as resolved.
Show resolved Hide resolved

@property
@abstractmethod
def metadata(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ def __init__(
self._nan_as_null = nan_as_null
self._allow_copy = allow_copy

def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
    """
    Get a new dataframe exchange object.

    The returned object wraps the same underlying frame (``self._df``) and is
    configured with the flags passed to this call rather than with the ones
    stored on this instance.

    Parameters
    ----------
    nan_as_null : bool, default: False
        Forwarded unchanged to the ``PandasProtocolDataframe`` constructor.
    allow_copy : bool, default: True
        Forwarded unchanged to the ``PandasProtocolDataframe`` constructor.

    Returns
    -------
    PandasProtocolDataframe
        A newly constructed exchange object.
    """
    exchange_object = PandasProtocolDataframe(
        self._df,
        nan_as_null=nan_as_null,
        allow_copy=allow_copy,
    )
    return exchange_object

@property
def metadata(self) -> Dict[str, Any]:
    """
    Get the metadata mapping of this exchange object.

    Returns
    -------
    dict
        A single-entry dictionary whose ``"modin.index"`` key holds the
        index of the wrapped frame (``self._df.index``).
    """
    frame_index = self._df.index
    return {"modin.index": frame_index}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,11 @@ def __init__(
self._nan_as_null = nan_as_null
self._allow_copy = allow_copy

def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
    """
    Get a new dataframe exchange object.

    The returned object wraps the same underlying frame (``self._df``) and is
    configured with the flags passed to this call rather than with the ones
    stored on this instance.

    Parameters
    ----------
    nan_as_null : bool, default: False
        Forwarded unchanged to the ``OmnisciProtocolDataframe`` constructor.
    allow_copy : bool, default: True
        Forwarded unchanged to the ``OmnisciProtocolDataframe`` constructor.

    Returns
    -------
    OmnisciProtocolDataframe
        A newly constructed exchange object.
    """
    exchange_object = OmnisciProtocolDataframe(
        self._df,
        nan_as_null=nan_as_null,
        allow_copy=allow_copy,
    )
    return exchange_object

@property
@raise_copy_alert_if_materialize
def metadata(self) -> Dict[str, Any]:
Expand Down
16 changes: 10 additions & 6 deletions modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,25 +202,29 @@ def test_simple_import(data_has_nulls):
"""Test that ``modin.pandas.utils.from_dataframe`` works properly."""
data = get_data_of_all_types(data_has_nulls)

md_df_producer = pd.DataFrame(data)
modin_df_producer = pd.DataFrame(data)
internal_modin_df_producer = modin_df_producer.__dataframe__()
# Our configuration in pytest.ini requires that we explicitly catch all
# instances of defaulting to pandas, this one raises a warning on `.from_dataframe`
with warns_that_defaulting_to_pandas():
md_df_consumer = from_dataframe(md_df_producer)
modin_df_consumer = from_dataframe(modin_df_producer)
internal_modin_df_consumer = from_dataframe(internal_modin_df_producer)

# TODO: the following assertions verify that `from_dataframe` doesn't return
# the same object untouched due to optimization branching. It actually should
# do so, but that logic is not implemented yet, so the assertions pass for now.
# Once another implementation of the protocol is available, the producer's
# type must be replaced with one that differs from the consumer's, because
# these assertions may start failing.
assert md_df_producer is not md_df_consumer
assert modin_df_producer is not modin_df_consumer
assert internal_modin_df_producer is not internal_modin_df_consumer
assert (
md_df_producer._query_compiler._modin_frame
is not md_df_consumer._query_compiler._modin_frame
modin_df_producer._query_compiler._modin_frame
is not modin_df_consumer._query_compiler._modin_frame
)

df_equals(md_df_producer, md_df_consumer)
df_equals(modin_df_producer, modin_df_consumer)
df_equals(modin_df_producer, internal_modin_df_consumer)


@pytest.mark.parametrize("data_has_nulls", [True, False])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@

def test_simple_import():
modin_df_producer = pd.DataFrame(test_data["int_data"])
internal_modin_df_producer = modin_df_producer.__dataframe__()
# Our configuration in pytest.ini requires that we explicitly catch all
# instances of defaulting to pandas, this one raises a warning on `.from_dataframe`
with warns_that_defaulting_to_pandas():
modin_df_consumer = from_dataframe(modin_df_producer)
internal_modin_df_consumer = from_dataframe(internal_modin_df_producer)

# TODO: the following assertions verify that `from_dataframe` doesn't return
# the same object untouched due to optimization branching, it actually should
Expand All @@ -33,9 +35,11 @@ def test_simple_import():
# to consumer when we have some other implementation of the protocol as the
# assertions may start failing shortly.
assert modin_df_producer is not modin_df_consumer
assert internal_modin_df_producer is not internal_modin_df_consumer
assert (
modin_df_producer._query_compiler._modin_frame
is not modin_df_consumer._query_compiler._modin_frame
)

df_equals(modin_df_producer, modin_df_consumer)
df_equals(modin_df_producer, internal_modin_df_consumer)