From aa1cfc578a7458a358c3a49844d8a4b493f6c016 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Tue, 22 Feb 2022 16:39:54 +0300 Subject: [PATCH 01/33] FEAT-#4244: Implement dataframe exchange protocol for OmniSci Signed-off-by: Dmitry Chigarev --- .github/workflows/ci.yml | 2 + .github/workflows/push.yml | 2 + .../base/exchange/dataframe_protocol/utils.py | 1 + .../omnisci_on_native/dataframe/dataframe.py | 41 ++ .../omnisci_on_native/exchange/__init__.py | 12 + .../exchange/dataframe_protocol/__init__.py | 16 + .../exchange/dataframe_protocol/buffer.py | 117 +++++ .../exchange/dataframe_protocol/column.py | 411 +++++++++++++++++ .../exchange/dataframe_protocol/dataframe.py | 419 ++++++++++++++++++ .../exchange/dataframe_protocol/utils.py | 32 ++ .../storage_formats/omnisci/query_compiler.py | 4 +- .../dataframe_protocol/omnisci/__init__.py | 12 + .../omnisci/test_protocol.py | 18 + .../dataframe_protocol/test_general.py | 173 ++++++++ 14 files changed, 1257 insertions(+), 3 deletions(-) create mode 100644 modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/__init__.py create mode 100644 modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py create mode 100644 modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py create mode 100644 modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py create mode 100644 modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py create mode 100644 modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py create mode 100644 modin/test/exchange/dataframe_protocol/omnisci/__init__.py create mode 100644 modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py create mode 100644 modin/test/exchange/dataframe_protocol/test_general.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 20ae9ea578d..ebe8b62dedf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -339,6 +339,8 @@ jobs: - run: MODIN_BENCHMARK_MODE=True pytest modin/pandas/test/internals/test_benchmark_mode.py - run: pytest modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py - run: pytest modin/pandas/test/test_io.py::TestCsv --verbose + - run: pytest modin/test/exchange/dataframe_protocol/test_general.py + - run: pytest modin/test/exchange/dataframe_protocol/omnisci - uses: codecov/codecov-action@v2 test-asv-benchmarks: diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml index 1a466a56bf6..58e2aa64065 100644 --- a/.github/workflows/push.yml +++ b/.github/workflows/push.yml @@ -110,6 +110,8 @@ jobs: - run: pytest modin/test/storage_formats/omnisci/test_internals.py - run: pytest modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py - run: pytest modin/pandas/test/test_io.py::TestCsv + - run: pytest modin/test/exchange/dataframe_protocol/test_general.py + - run: pytest modin/test/exchange/dataframe_protocol/omnisci - uses: codecov/codecov-action@v2 test-all: diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py b/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py index 7dbe7e04fb9..96369662758 100644 --- a/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py +++ 
b/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py @@ -124,6 +124,7 @@ class ArrowCTypes: # - microseconds -> 'u' # - nanoseconds -> 'n' TIMESTAMP = "ts{resolution}:{tz}" + TIME = "tt{resolution}" def pandas_dtype_to_arrow_c(dtype) -> str: diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py index e4c44fb9486..20804be9ab7 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py @@ -2037,6 +2037,47 @@ def _get_columns(self): """ return super(OmnisciOnNativeDataframe, self)._get_columns() + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> dict: + """ + Get a DataFrame exchange protocol object representing data of the Modin DataFrame. + + Parameters + ---------- + nan_as_null : bool, default: False + A keyword intended for the consumer to tell the producer + to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + allow_copy : bool, default: True + A keyword that defines whether or not the library is allowed + to make a copy of the data. For example, copying data would be necessary + if a library supports strided buffers, given that this protocol + specifies contiguous buffers. Currently, if the flag is set to ``False`` + and a copy is needed, a ``RuntimeError`` will be raised. + + Returns + ------- + dict + A dictionary object following the dataframe protocol specification. + """ + if self._has_unsupported_data: + pd_df = self.to_pandas() + if hasattr(pd_df, "__dataframe__"): + return pd_df.__dataframe__() + raise NotImplementedError( + "OmniSci execution does not support exchange protocol if the frame contains data types " + + "that are unsupported by OmniSci." + ) + + from ..exchange.dataframe_protocol import OmnisciProtocolDataframe + + return { + "dataframe": OmnisciProtocolDataframe( + self, nan_as_null=nan_as_null, allow_copy=allow_copy + ), + "version": 0, + } + columns = property(_get_columns) index = property(_get_index) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/__init__.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
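For orientation, here is how a consumer would reach the protocol object through the wrapper dict returned above. This is a minimal sketch, not part of the patch; it assumes OmniSci execution is enabled and uses the internal `_query_compiler._modin_frame` attribute chain that the query compiler wiring later in this patch also relies on:

import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
# The wrapper dict pairs the protocol object with the protocol version
wrapper = df._query_compiler._modin_frame.__dataframe__()
protocol_df = wrapper["dataframe"]
col = protocol_df.get_column_by_name("a")
print(col.size, col.null_count)  # 3 0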
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py new file mode 100644 index 00000000000..42f5b7d53b3 --- /dev/null +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py @@ -0,0 +1,16 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +from .dataframe import OmnisciProtocolDataframe + +__all__ = ["OmnisciProtocolDataframe"] diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py new file mode 100644 index 00000000000..cde70dfd793 --- /dev/null +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py @@ -0,0 +1,117 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pyarrow as pa + +from typing import Tuple +from modin.core.dataframe.base.exchange.dataframe_protocol.utils import DlpackDeviceType +from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( + ProtocolBuffer, +) + + +class OmnisciProtocolBuffer(ProtocolBuffer): + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both (a) data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + + Parameters + ---------- + buff : pa.Buffer + Data to be held by ``Buffer``.
+ allow_copy : bool, default: True + A keyword that defines whether or not the library is allowed + to make a copy of the data. For example, copying data would be necessary + if a library supports strided buffers, given that this protocol + specifies contiguous buffers. Currently, if the flag is set to ``False`` + and a copy is needed, a ``RuntimeError`` will be raised. + """ + + def __init__(self, buff: pa.Buffer, allow_copy: bool = True) -> None: + """ + Handle only regular columns (= numpy arrays) for now. + """ + self._buff = buff + + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + return self._buff.size + + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + return self._buff.address + + def __dlpack__(self): + """ + DLPack not implemented in NumPy yet, so leave it out here. + + Produce DLPack capsule (see array API standard). + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + raise NotImplementedError("__dlpack__") + + def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. Enum members are:: + - CPU = 1 + - CUDA = 2 + - CPU_PINNED = 3 + - OPENCL = 4 + - VULKAN = 7 + - METAL = 8 + - VPI = 9 + - ROCM = 10 + Note: must be implemented even if ``__dlpack__`` is not. + """ + + return (DlpackDeviceType.CPU, None) + + def __repr__(self) -> str: + """ + Return a string representation for a particular ``Buffer``. + + Returns + ------- + str + """ + return ( + "Buffer(" + + str( + { + "bufsize": self.bufsize, + "ptr": self.ptr, + "device": self.__dlpack_device__()[0].name, + } + ) + + ")" + ) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py new file mode 100644 index 00000000000..b189c8f2dee --- /dev/null +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -0,0 +1,411 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
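+# Illustrative examples (not authoritative; they assume the base protocol's
+# `DTypeKind` enum and Apache Arrow C format strings) of the
+# `(kind, bit-width, format, endianness)` tuples produced by
+# `OmnisciProtocolColumn.dtype` in this file:
+#   int64 column       -> (DTypeKind.INT, 64, "l", "=")
+#   float64 column     -> (DTypeKind.FLOAT, 64, "g", "=")
+#   bool column        -> (DTypeKind.BOOL, 8, "b", "=")
+#   string column      -> (DTypeKind.STRING, 8, "u", "=")
+#   categorical column -> (DTypeKind.CATEGORICAL, 32, <format of the index type>, "=")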
+ +import pyarrow as pa +import pandas +import numpy as np + +from typing import Any, Optional, Tuple, Dict, Iterable +from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( + DTypeKind, + ColumnNullType, + pandas_dtype_to_arrow_c, +) +from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( + ProtocolColumn, +) +from .buffer import OmnisciProtocolBuffer +from .utils import arrow_dtype_to_arrow_c + + +class OmnisciProtocolColumn(ProtocolColumn): + """ + A column object, with only the methods and properties required by the interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Parameters + ---------- + column : DataFrame + A ``DataFrame`` object. + allow_copy : bool, default: True + A keyword that defines whether or not the library is allowed + to make a copy of the data. For example, copying data would be necessary + if a library supports strided buffers, given that this protocol + specifies contiguous buffers. Currently, if the flag is set to ``False`` + and a copy is needed, a ``RuntimeError`` will be raised. + offset : int, default: 0 + The offset of the first element + + Notes + ----- + This Column object can only be produced by ``__dataframe__``, + so doesn't need its own version or ``__column__`` protocol. + """ + + def __init__(self, column: "DataFrame") -> None: + """ + Note: doesn't deal with extension arrays yet, just assume a regular + Series/ndarray for now. + """ + self._col = column + + @property + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Returns + ------- + int + Size of the column, in elements. 
+ """ + return self._col.num_rows() + + @property + def offset(self) -> int: + """ + Get the offset of first element. + + May be > 0 if using chunks; for example for a column + with N chunks of equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + + Returns + ------- + int + The offset of first element. + """ + return self._col._offset + + @property + def dtype(self) -> Tuple[DTypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, endianness)``, where + + * Kind : DTypeKind + * Bit-width : the number of bits as an integer + * Format string : data type description format string in Apache Arrow C + Data Interface format. + * Endianness : current only native endianness (``=``) is supported + + Kind : + + - INT = 0 # infer + - UINT = 1 # infer + - FLOAT = 2 # infer + - BOOL = 20 # infer + - STRING = 21 # infer? + - DATETIME = 22 # have to materialize to deduce resolution (always should be ns???) + - CATEGORICAL = 23 # not implemented error + + Notes + ----- + - Kind specifiers are aligned with DLPack where possible + (hence the jump to 20, leave enough room for future extension). + - Masks must be specified as boolean with either bit width 1 (for bit masks) + or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the future + we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and for categoricals. + - For categoricals, the format string describes the type of the categorical + in the data buffer. In case of a separate encoding of the categorical + (e.g. an integer to string mapping), this can be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, decimal, + and nested (list, struct, map, union) dtypes. + """ + dtype = self._pandas_dtype + + if pandas.api.types.is_datetime64_dtype(dtype): + return self._dtype_from_pyarrow(self._arrow_dtype) + elif pandas.api.types.is_categorical_dtype(dtype): + return ( + DTypeKind.CATEGORICAL, + 32, + pandas_dtype_to_arrow_c(np.dtype("int32")), + "=", + ) + elif pandas.api.types.is_string_dtype(dtype): + return (DTypeKind.STRING, 8, pandas_dtype_to_arrow_c(dtype), "=") + else: + return self._dtype_from_primitive_pandas(dtype) + + def _dtype_from_pyarrow(self, dtype): + kind = None + if ( + pa.types.is_timestamp(dtype) + or pa.types.is_date(dtype) + or pa.types.is_time(dtype) + ): + kind = DTypeKind.DATETIME + elif pa.types.is_dictionary(dtype): + kind = DTypeKind.CATEGORICAL + elif pa.types.is_string(dtype): + kind = DTypeKind.STRING + + if kind is not None: + return (kind, dtype.bit_width, arrow_dtype_to_arrow_c(dtype), "=") + else: + return self._dtype_from_primitive_pandas(np.dtype(dtype.to_pandas_dtype())) + + def _dtype_from_primitive_pandas(self, dtype) -> Tuple[DTypeKind, int, str, str]: + """ + See `self.dtype` for details. 
+ """ + _np_kinds = { + "i": DTypeKind.INT, + "u": DTypeKind.UINT, + "f": DTypeKind.FLOAT, + "b": DTypeKind.BOOL, + } + kind = _np_kinds.get(dtype.kind, None) + if kind is None: + raise NotImplementedError( + f"Data type {dtype} not supported by exchange protocol" + ) + return ( + kind, + dtype.itemsize * 8, + pandas_dtype_to_arrow_c(dtype), + dtype.byteorder, + ) + + @property + def describe_categorical(self) -> Dict[str, Any]: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate dictionary-style encoding for categorical values. + + TBD: are there any other in-memory representations that are needed? + + Returns + ------- + dict + Content of returned dict: + - "is_ordered" : bool, whether the ordering of dictionary indices is + semantically meaningful. + - "is_dictionary" : bool, whether a dictionary-style mapping of + categorical values to other objects exists + - "mapping" : dict, Python-level only (e.g. ``{int: str}``). + None if not a dictionary-style categorical. + + Raises + ------ + ``RuntimeError`` if the dtype is not categorical. + """ + dtype = self._pandas_dtype + + if dtype != "category": + raise RuntimeError( + f"Column 'dtype' has to be categorical to be able to dectribe categiries, met: {dtype}" + ) + + ordered = dtype.ordered + mapping = {index: value for index, value in enumerate(dtype.categories)} + + return { + "is_ordered": ordered, + "is_dictionary": True, + "mapping": mapping, + } + + @property + def describe_null(self) -> Tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype uses. + + Return as a tuple ``(kind, value)``. + + * Kind: + - 0 : non-nullable + - 1 : NaN/NaT + - 2 : sentinel value + - 3 : bit mask + - 4 : byte mask + * Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. None + otherwise. + + Returns + ------- + tuple + ``(kind, value)``. + """ + null_buffer = self._pyarrow_table.column(0).chunks[0].buffers()[0] + if null_buffer is None: + return (ColumnNullType.NON_NULLABLE, None) + else: + return (ColumnNullType.USE_BITMASK, 0) + + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + ncount = self._pyarrow_table.column(0).null_count + return ncount if ncount >= 0 else None + + @property + def metadata(self) -> Dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + return {} + + @property + def _pandas_dtype(self): + return self._col._df.dtypes.iloc[0] + + @property + def _arrow_dtype(self): + return self._pyarrow_table.column(0).type + + @property + def _pyarrow_table(self): + return self._col._pyarrow_table + + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + + Returns + ------- + int + The number of chunks the column consists of. + """ + return self._col.num_chunks() + + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: + """ + Return an iterator yielding the chunks. + + By default ``n_chunks=None``, yields the chunks that the data is stored as by the producer. + If given, ``n_chunks`` must be a multiple of ``self.num_chunks()``, + meaning the producer must subdivide each chunk before yielding it. + + Parameters + ---------- + n_chunks : int, optional + Number of chunks to yield. 
+ + Yields + ------ + DataFrame + A ``DataFrame`` object(s). + """ + for chunk in self._col.get_chunks(n_chunks): + yield OmnisciProtocolColumn(chunk) + + def get_buffers(self) -> Dict[str, Any]: + """ + Return a dictionary containing the underlying buffers. + + Returns + ------- + dict + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. None if the null representation is not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary data + (e.g., variable-length strings) and whose second element is the offsets + buffer's associated dtype. None if the data buffer does not have + an associated offsets buffer. + """ + if self.num_chunks() != 1: + raise NotImplementedError() + + external_dtype = self.dtype + internal_dtype = self._dtype_from_pyarrow(self._arrow_dtype) + + if external_dtype != internal_dtype: + at = self._propagate_dtype(external_dtype) + else: + at = self._pyarrow_table + pyarrow_array = at.column(0).chunks[0] + + result = dict() + result["data"] = self._get_data_buffer(pyarrow_array) + result["validity"] = self._get_validity_buffer(pyarrow_array) + result["offsets"] = self._get_offsets_buffer(pyarrow_array) + + return result + + def _get_data_buffer(self, arr): + arrow_type = self._dtype_from_pyarrow(arr.type) + + if arrow_type[0] == DTypeKind.CATEGORICAL: + arr = arr.indices + + data_buffer = OmnisciProtocolBuffer(arr.buffers()[-1]) + return data_buffer, arrow_type + + def _get_validity_buffer(self, arr): + validity_buffer = arr.buffers()[0] + if validity_buffer is None: + return validity_buffer + + return OmnisciProtocolBuffer(validity_buffer), ( + DTypeKind.BOOL, + 1, + pandas_dtype_to_arrow_c(np.dtype("bool")), + "=", + ) + + def _get_offsets_buffer(self, arr): + buffs = arr.buffers() + if len(buffs) < 3: + return None + + return OmnisciProtocolBuffer(buffs[1]), self._dtype_from_primitive_pandas( + np.dtype("int32") + ) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py new file mode 100644 index 00000000000..e25c3c1917d --- /dev/null +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py @@ -0,0 +1,419 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +""" +Dataframe exchange protocol implementation. 
+ +See more in https://data-apis.org/dataframe-protocol/latest/index.html. + +Public API +---------- +from_dataframe : construct a DataFrame from an input data frame which + implements the exchange protocol. +Notes +----- +- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to + do in pure Python. It's more general but definitely less friendly than having + ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack + ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), + this is worth looking at again. +""" + +import collections.abc +import numpy as np + +from typing import Optional, Iterable, Sequence +from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe +from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ProtocolDataframe +from modin.utils import is_range_like + +from modin.experimental.core.execution.native.implementations.omnisci_on_native.df_algebra import ( + MaskNode, + FrameNode, + TransformNode, + UnionNode, +) +from .column import OmnisciProtocolColumn + + +class OmnisciProtocolDataframe(ProtocolDataframe): + """ + A data frame class, with only the methods required by the interchange protocol defined. + + Instances of this (private) class are returned from ``modin.pandas.DataFrame.__dataframe__`` + as objects with the methods and attributes defined on this class. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. Columns may be accessed by name or by position. + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + + Parameters + ---------- + df : ModinDataframe + A ``ModinDataframe`` object. + nan_as_null : bool, default: False + A keyword intended for the consumer to tell the producer + to overwrite null values in the data with ``NaN`` (or ``NaT``). + This currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + allow_copy : bool, default: True + A keyword that defines whether or not the library is allowed + to make a copy of the data. For example, copying data would be necessary + if a library supports strided buffers, given that this protocol + specifies contiguous buffers. Currently, if the flag is set to ``False`` + and a copy is needed, a ``RuntimeError`` will be raised. + offset : int, default: 0 + The offset of the first element. + """ + + def __init__( + self, + df: ModinDataframe, + nan_as_null: bool = False, + allow_copy: bool = True, + offset: int = 0, + ) -> None: + if nan_as_null: + raise NotImplementedError( + "Processing of `nan_as_null=True` is not yet supported." + ) + + self._df = df + self._nan_as_null = nan_as_null + self._allow_copy = allow_copy + self._offset = offset + + @property + def metadata(self): + # TODO: as the frame's index is stored as a separate column inside pyarrow table + # we may want to return the column's name here instead of materialized index. + # This will require the internal index column to be visible in the protocol's column + # accessor methods. + self._maybe_raise_if_materialize() + return {"index": self._df.index} + + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + + Returns + ------- + int + The number of columns in the DataFrame.
+ """ + return len(self._df.columns) + + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + + Returns + ------- + int + The number of rows in the DataFrame. + """ + if not self._allow_copy and not self._is_zero_copy_possible: + raise RuntimeError("Copy required with 'allow_copy=False'") + return len(self._df.index) + + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + + Returns + ------- + int + The number of chunks the DataFrame consists of. + """ + return len(self._chunk_slices) - 1 + + __chunk_slices = None + + @property + def _chunk_slices(self): + """ + Compute chunk start-stop indices in the underlying pyarrow table. + + Returns + ------- + np.ndarray + An array holding start-stop indices of the chunks, for ex. ``[0, 5, 10, 20]`` + describes 3 chunks bound by the following indices: + chunk1: [0, 5), + chunk2: [5, 10), + chunk3: [10, 20). + + Notes + ----- + Arrow table allows for the columns to be chunked independently, so in order to satisfy + the protocol's requirement of equally chunked columns, we have to align column chunks + with the minimal one. For example: + Originally chunked table: Aligned table: + |col0|col1| |col0|col1| + | | | | | | + |0 |a | |0 |a | + |----|b | |----|----| + |1 |----| |1 |b | + |2 |c | |----|----| + |3 |d | |2 |c | + |----|----| |3 |d | + |4 |e | |----|----| + |4 |e | + """ + if self.__chunk_slices is None: + at = self._pyarrow_table + col_slices = set({0}) + for col in at.columns: + col_slices = col_slices.union( + np.cumsum([len(chunk) for chunk in col.chunks]) + ) + self.__chunk_slices = np.sort( + np.fromiter(col_slices, dtype=int, count=len(col_slices)) + ) + + return self.__chunk_slices + + def _maybe_raise_if_materialize(self): + """Raise a ``RuntimeError`` if the way of retrieving the data violates the ``allow_copy`` flag.""" + if not self._allow_copy and not self._is_zero_copy_possible: + raise RuntimeError("Copy required with 'allow_copy=False'") + + __is_zero_copy_possible = None + + @property + def _is_zero_copy_possible(self): + """ + Check whether it's possible to retrieve data from the DataFrame zero-copy. + + The 'zero-copy' term also means that no extra computations or data transers + are needed to access the data. + + Returns + ------- + bool + """ + if self.__is_zero_copy_possible is None: + if self._df._has_arrow_table(): + self.__is_zero_copy_possible = True + elif not self._df._can_execute_arrow(): + self.__is_zero_copy_possible = False + else: + self.__is_zero_copy_possible = self._is_zero_copy_op(self._df.op) + return self.__is_zero_copy_possible + + @classmethod + def _is_zero_copy_op(cls, op): + """ + Check whether the passed node of the delayed computation tree could be executed zero-copy via pyarrow execution. 
+ + Parameters + ---------- + op : DFAlgNode + + Returns + ------- + bool + """ + is_zero_copy_op = False + if isinstance(op, (FrameNode, TransformNode, UnionNode)): + is_zero_copy_op = True + elif isinstance(op, MaskNode) and ( + isinstance(op.row_positions, slice) or is_range_like(op.row_positions) + ): + is_zero_copy_op = True + return is_zero_copy_op and all( + cls._is_zero_copy_op(_op) for _op in getattr(op, "inputs", []) + ) + + @property + def _pyarrow_table(self): + """Get ``pyarrow.Table`` representing the dataframe.""" + self._maybe_raise_if_materialize() + + if not self._df._has_arrow_table(): + self._df._execute() + + at = self._df._partitions[0][0].arrow_table + assert at is not None + return at + + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + + Yields + ------ + str + The name of the column(s). + """ + for col in self._df.columns: + yield col + + def get_column(self, i: int) -> OmnisciProtocolColumn: + """ + Return the column at the indicated position. + + Returns + ------- + Column + The column at the indicated position. + """ + return OmnisciProtocolColumn( + OmnisciProtocolDataframe( + self._df.mask(col_positions=[i]), + allow_copy=self._allow_copy, + offset=self._offset, + ), + ) + + def get_column_by_name(self, name: str) -> OmnisciProtocolColumn: + """ + Return the column whose name is the indicated name. + + Returns + ------- + Column + The column whose name is the indicated name. + """ + return OmnisciProtocolColumn( + OmnisciProtocolDataframe( + self._df.mask(col_labels=[name]), + allow_copy=self._allow_copy, + offset=self._offset, + ), + ) + + def get_columns(self) -> Iterable[OmnisciProtocolColumn]: + """ + Return an iterator yielding the columns. + + Yields + ------ + Column + The ``Column`` object(s). + """ + for name in self._df.columns: + yield OmnisciProtocolColumn( + OmnisciProtocolDataframe( + self._df.mask(col_labels=[name]), + allow_copy=self._allow_copy, + offset=self._offset, + ), + ) + + def select_columns(self, indices: Sequence[int]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by index. + + Returns + ------- + DataFrame + A new DataFrame with the selected subset of columns by index. + """ + if not isinstance(indices, collections.abc.Sequence): + raise ValueError("`indices` is not a sequence") + + return OmnisciProtocolDataframe( + self._df.mask(col_positions=list(indices)), + allow_copy=self._allow_copy, + offset=self._offset, + ) + + def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": + """ + Create a new DataFrame by selecting a subset of columns by name. + + Returns + ------- + DataFrame + A new DataFrame with the selected subset of columns by name. + """ + if not isinstance(names, collections.abc.Sequence): + raise ValueError("`names` is not a sequence") + + return OmnisciProtocolDataframe( + self._df.mask(col_labels=list(names)), + allow_copy=self._allow_copy, + offset=self._offset, + ) + + def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: + """ + Return an iterator yielding the chunks. + + By default ``n_chunks=None``, yields the chunks that the data is stored as by the producer. + If given, ``n_chunks`` must be a multiple of ``self.num_chunks()``, + meaning the producer must subdivide each chunk before yielding it. + + Parameters + ---------- + n_chunks : int, optional + Number of chunks to yield. + + Yields + ------ + DataFrame + A ``DataFrame`` object(s).
+ """ + if n_chunks is None: + return self._yield_chunks(self._chunk_slices) + + if n_chunks % self.num_chunks() != 0: + raise RuntimeError( + "The passed `n_chunks` has to be a multiple of `num_chunks`." + ) + + extra_chunks = n_chunks - self.num_chunks() + subdivided_slices = self._chunk_slices.copy() + + for _ in range(extra_chunks): + # 1. Find the biggest chunk + # 2. Split it in the middle + biggest_chunk_idx = np.argmax(np.diff(subdivided_slices)) + new_chunk_offset = ( + subdivided_slices[biggest_chunk_idx + 1] + - subdivided_slices[biggest_chunk_idx] + ) // 2 + if new_chunk_offset == 0: + raise RuntimeError( + "The passed `n_chunks` value is bigger than the amout of rows in the frame." + ) + subdivided_slices = np.insert( + subdivided_slices, + biggest_chunk_idx + 1, + subdivided_slices[biggest_chunk_idx] + new_chunk_offset, + ) + + return self._yield_chunks(subdivided_slices) + + def _yield_chunks(self, chunk_slices): + """ + Yield dataframe chunks according to the passed chunking. + + Parameters + ---------- + chunk_slices : list + + Yield + ----- + DataFrame + """ + for i in range(len(chunk_slices) - 1): + yield OmnisciProtocolDataframe( + df=self._df.mask( + row_positions=range(chunk_slices[i], chunk_slices[i + 1]) + ), + allow_copy=self._allow_copy, + nan_as_null=self._nan_as_null, + offset=chunk_slices[i], + ) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py new file mode 100644 index 00000000000..00fa6f77a09 --- /dev/null +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py @@ -0,0 +1,32 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
+ +import pyarrow as pa +import numpy as np + +from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( + ArrowCTypes, + pandas_dtype_to_arrow_c, +) + + +def arrow_dtype_to_arrow_c(dtype): + """Represent pyarrow dtype as a format string in Apache Arrow C notation.""" + if pa.types.is_timestamp(dtype): + return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit[:1], tz=dtype.tz or "") + elif pa.types.is_date(dtype): + return getattr(ArrowCTypes, f"DATE{dtype.bit_width}", "DATE64") + elif pa.types.is_time(dtype): + return ArrowCTypes.TIME.format(resolution=dtype.unit[:1]) + elif pa.types.is_dictionary(dtype): + return arrow_dtype_to_arrow_c(dtype.index_type) + else: + return pandas_dtype_to_arrow_c(np.dtype(dtype.to_pandas_dtype())) diff --git a/modin/experimental/core/storage_formats/omnisci/query_compiler.py b/modin/experimental/core/storage_formats/omnisci/query_compiler.py index 56643a4e55f..90678601d9b 100644 --- a/modin/experimental/core/storage_formats/omnisci/query_compiler.py +++ b/modin/experimental/core/storage_formats/omnisci/query_compiler.py @@ -204,9 +204,7 @@ def from_arrow(cls, at, data_cls): # Dataframe exchange protocol def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True) -> dict: - raise NotImplementedError( - "The selected execution does not implement the DataFrame exchange protocol yet." - ) + return self._modin_frame.__dataframe__( + nan_as_null=nan_as_null, allow_copy=allow_copy + )["dataframe"] @classmethod def from_dataframe(cls, df, data_cls): diff --git a/modin/test/exchange/dataframe_protocol/omnisci/__init__.py b/modin/test/exchange/dataframe_protocol/omnisci/__init__.py new file mode 100644 index 00000000000..cae6413e559 --- /dev/null +++ b/modin/test/exchange/dataframe_protocol/omnisci/__init__.py @@ -0,0 +1,12 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py new file mode 100644 index 00000000000..7c527d5164b --- /dev/null +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -0,0 +1,18 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License.
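+# A rough sketch of the check the zero-copy test below could eventually perform
+# (hypothetical; `from_arrow` and the `_query_compiler._modin_frame` chain are
+# internal helpers and may differ from what the final test uses):
+#
+#   import pyarrow
+#   from modin.pandas.utils import from_arrow
+#
+#   at = pyarrow.Table.from_pydict({"a": [1, 2, 3]})
+#   md_df = from_arrow(at)
+#   protocol_df = md_df._query_compiler._modin_frame.__dataframe__()["dataframe"]
+#   buf = protocol_df.get_column(0).get_buffers()["data"][0]
+#   # Zero-copy: the exported buffer should point into the original arrow data
+#   assert buf.ptr == at.column(0).chunks[0].buffers()[-1].address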
+ +"""Dataframe exchange protocol tests that are specific for OmniSci implementation.""" + + +def test_zero_copy(): + pass diff --git a/modin/test/exchange/dataframe_protocol/test_general.py b/modin/test/exchange/dataframe_protocol/test_general.py new file mode 100644 index 00000000000..ace72c7b0b3 --- /dev/null +++ b/modin/test/exchange/dataframe_protocol/test_general.py @@ -0,0 +1,173 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +"""Dataframe exchange protocol tests that are common for every implementation.""" + +import pytest +import math +import ctypes + +import modin.pandas as pd + + +@pytest.fixture +def df_from_dict(): + def maker(dct, is_categorical=False): + df = pd.DataFrame(dct, dtype=("category" if is_categorical else None)) + return df + + return maker + + +@pytest.mark.parametrize( + "test_data", + [ + {"a": ["foo", "bar"], "b": ["baz", "qux"]}, + {"a": [1.5, 2.5, 3.5], "b": [9.2, 10.5, 11.8]}, + {"A": [1, 2, 3, 4], "B": [1, 2, 3, 4]}, + ], + ids=["str_data", "float_data", "int_data"], +) +def test_only_one_dtype(test_data, df_from_dict): + columns = list(test_data.keys()) + df = df_from_dict(test_data) + dfX = df.__dataframe__() + + column_size = len(test_data[columns[0]]) + for column in columns: + assert dfX.get_column_by_name(column).null_count == 0 + assert dfX.get_column_by_name(column).size == column_size + assert dfX.get_column_by_name(column).offset == 0 + + +def test_float_int(df_from_dict): + df = df_from_dict( + { + "a": [1, 2, 3], + "b": [3, 4, 5], + "c": [1.5, 2.5, 3.5], + "d": [9, 10, 11], + "e": [True, False, True], + "f": ["a", "", "c"], + } + ) + dfX = df.__dataframe__() + columns = {"a": 0, "b": 0, "c": 2, "d": 0, "e": 20, "f": 21} + + for column, kind in columns.items(): + colX = dfX.get_column_by_name(column) + assert colX.null_count == 0 + assert colX.size == 3 + assert colX.offset == 0 + + assert colX.dtype[0] == kind + + assert dfX.get_column_by_name("c").dtype[1] == 64 + + +def test_na_float(df_from_dict): + df = df_from_dict({"a": [1.0, math.nan, 2.0]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name("a") + assert colX.null_count == 1 + + +def test_noncategorical(df_from_dict): + df = df_from_dict({"a": [1, 2, 3]}) + dfX = df.__dataframe__() + colX = dfX.get_column_by_name("a") + with pytest.raises(RuntimeError): + colX.describe_categorical + + +def test_categorical(df_from_dict): + df = df_from_dict( + {"weekday": ["Mon", "Tue", "Mon", "Wed", "Mon", "Thu", "Fri", "Sat", "Sun"]}, + is_categorical=True, + ) + + colX = df.__dataframe__().get_column_by_name("weekday") + is_ordered, is_dictionary, _ = colX.describe_categorical.values() + assert isinstance(is_ordered, bool) + assert isinstance(is_dictionary, bool) + + +def test_dataframe(df_from_dict): + df = df_from_dict( + {"x": [True, True, False], "y": [1, 
2, 0], "z": [9.2, 10.5, 11.8]} + ) + dfX = df.__dataframe__() + + assert dfX.num_columns() == 3 + assert dfX.num_rows() == 3 + assert dfX.num_chunks() == 1 + assert list(dfX.column_names()) == ["x", "y", "z"] + assert list(dfX.select_columns((0, 2)).column_names()) == list( + dfX.select_columns_by_name(("x", "z")).column_names() + ) + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_df_get_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.num_rows() for chunk in chunks) == size + + +@pytest.mark.parametrize(["size", "n_chunks"], [(10, 3), (12, 3), (12, 5)]) +def test_column_get_chunks(size, n_chunks, df_from_dict): + df = df_from_dict({"x": list(range(size))}) + dfX = df.__dataframe__() + chunks = list(dfX.get_column(0).get_chunks(n_chunks)) + assert len(chunks) == n_chunks + assert sum(chunk.size for chunk in chunks) == size + + +def test_get_columns(df_from_dict): + df = df_from_dict({"a": [0, 1], "b": [2.5, 3.5]}) + dfX = df.__dataframe__() + for colX in dfX.get_columns(): + assert colX.size == 2 + assert colX.num_chunks() == 1 + assert dfX.get_column(0).dtype[0] == 0 + assert dfX.get_column(1).dtype[0] == 2 + + +def test_buffer(df_from_dict): + arr = [0, 1, -1] + df = df_from_dict({"a": arr}) + dfX = df.__dataframe__() + colX = dfX.get_column(0) + bufX = colX.get_buffers() + + dataBuf, dataDtype = bufX["data"] + assert dataBuf.bufsize > 0 + assert dataBuf.ptr != 0 + device, _ = dataBuf.__dlpack_device__() + + assert dataDtype[0] == 0 + + if device == 1: # CPU-only as we're going to directly read memory here + bitwidth = dataDtype[1] + ctype = { + 8: ctypes.c_int8, + 16: ctypes.c_int16, + 32: ctypes.c_int32, + 64: ctypes.c_int64, + }[bitwidth] + + for idx, truth in enumerate(arr): + val = ctype.from_address(dataBuf.ptr + idx * (bitwidth // 8)).value + assert val == truth, f"Buffer at index {idx} mismatch" From bb9b898026a8bf259e82b0562952fe17a7dc73aa Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 28 Feb 2022 20:36:31 +0300 Subject: [PATCH 02/33] Implement 'from_dataframe' and other various fixes Signed-off-by: Dmitry Chigarev --- .../exchange/dataframe_protocol/__utils.py | 328 ++++++++++++++++++ .../exchange/dataframe_protocol/buffer.py | 7 +- .../exchange/dataframe_protocol/column.py | 122 +++++-- .../exchange/dataframe_protocol/dataframe.py | 11 +- .../exchange/dataframe_protocol/utils.py | 10 +- .../omnisci/test_protocol.py | 63 +++- 6 files changed, 506 insertions(+), 35 deletions(-) create mode 100644 modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py new file mode 100644 index 00000000000..93be29cb710 --- /dev/null +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py @@ -0,0 +1,328 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. 
The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +""" +This module contains draft implementations of the functions converting a +``__dataframe__`` object to ``pandas.DataFrame``. The location and implementation of +these functions are subject to change; however, the contract of `from_dataframe` is +supposed to stay the same. +""" + +import pandas +import ctypes +import numpy as np + +from typing import Optional +from modin.core.dataframe.base.exchange.dataframe_protocol.utils import DTypeKind +from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( + ProtocolDataframe, + ProtocolColumn, +) + + +def from_dataframe( + df: ProtocolDataframe, allow_copy: bool = True, nchunks: Optional[int] = None +): + """ + Build ``pandas.DataFrame`` from an object supporting the DataFrame exchange protocol (``__dataframe__``). + + Parameters + ---------- + df : ProtocolDataframe + Object supporting the exchange protocol (``__dataframe__``). + allow_copy : bool, default: True + Whether to allow for `df` providing a copy of underlying data. + nchunks : int, optional + Number of chunks to split `df`. + + Returns + ------- + pandas.DataFrame + """ + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + df = df.__dataframe__()["dataframe"] + + def _get_pandas_df(df): + # We need a dict of columns here, with each column being a numpy array (at + # least for now, deal with non-numpy dtypes later).
+ columns = dict() + _k = DTypeKind + _buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + # Simple numerical or bool dtype, turn into numpy array + columns[name], _buf = convert_column_to_ndarray(col) + elif dtype == _k.CATEGORICAL: + columns[name], _buf = convert_categorical_column(col) + elif dtype == _k.STRING: + columns[name], _buf = convert_string_column(col) + elif dtype == _k.DATETIME: + columns[name], _buf = convert_datetime_col(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + _buffers.append(_buf) + + pandas_df = pandas.DataFrame(columns) + pandas_df._buffers = _buffers + return pandas_df + + pandas_dfs = [] + for chunk in df.get_chunks(nchunks): + pandas_df = _get_pandas_df(chunk) + pandas_dfs.append(pandas_df) + # Can't preserve index for now + pandas_df = pandas.concat(pandas_dfs, axis=0, ignore_index=True) + return pandas_df + + +def convert_datetime_col(col): + if col.describe_null[0] not in (0, 3): + raise NotImplementedError( + "Null values represented as masks or " "sentinel values not handled yet" + ) + + _, _, fmt, _ = col.dtype + dbuf, dtype = col.get_buffers()["data"] + data = buffer_to_ndarray(dbuf, (DTypeKind.UINT, dtype[1], "u", "="), col.offset) + if fmt.startswith("ts"): + # timestamp ts{unit}:tz + meta = fmt[2:].split(":") + if len(meta) == 1: + unit = meta[0] + tz = "" + else: + unit, tz = meta + if tz != "": + raise NotImplementedError("Timezones are not supported yet") + if unit != "s": + unit += "s" + data = data.astype(f"datetime64[{unit}]") + elif fmt.startswith("td"): + # date td{Days/Ms} + unit = fmt[2:] + if unit == "D": + # to seconds (converting to uint64 to avoid overflow) + data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") + elif unit == "m": + data = data.astype("datetime64[ms]") + else: + raise NotImplementedError(f"Date unit is not supported: {unit}") + else: + raise NotImplementedError(f"Datetime is not supported: {fmt}") + + if col.describe_null[0] == 3: + null_mask = get_null_positions_from_bit_mask( + col.get_buffers()["validity"][0], col.offset, col.size + ) + data[null_mask] = None + elif col.describe_null[0] in (0, 1, 2): + pass + else: + raise NotImplementedError( + "Such null kind is not supported for datetime conversion" + ) + + return data, dbuf + + +def convert_column_to_ndarray(col: ProtocolColumn) -> np.ndarray: + """ + Convert an int, uint, float or bool column to a numpy array. 
+ """ + + if col.describe_null[0] not in (0, 1, 3): + raise NotImplementedError( + "Null values represented as masks or " "sentinel values not handled yet" + ) + + _buffer, _dtype = col.get_buffers()["data"] + data, _bfr = buffer_to_ndarray(_buffer, _dtype, col.offset), _buffer + + if col.describe_null[0] == 3: + null_pos = get_null_positions_from_bit_mask( + col.get_buffers()["validity"][0], col.offset, col.size + ) + if np.any(null_pos): + # convert to null-able type + data = data.astype(float) + data[null_pos] = np.nan + + return data, _bfr + + +def buffer_to_ndarray(_buffer, _dtype, offset, allow_none_buffer=False) -> np.ndarray: + # Handle the dtype + if allow_none_buffer and _buffer is None: + return None + kind = _dtype[0] + bitwidth = _dtype[1] + _k = DTypeKind + if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): + raise RuntimeError("Not a boolean, integer or floating-point dtype") + + _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} + _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} + _floats = {32: np.float32, 64: np.float64} + _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} + column_dtype = _np_dtypes[kind][bitwidth] + + # No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast( + _buffer.ptr + offset * (bitwidth // 8), ctypes.POINTER(ctypes_type) + ) + + # NOTE: `x` does not own its memory, so the caller of this function must + # either make a copy or hold on to a reference of the column or + # buffer! (not done yet, this is pretty awful ...) + x = np.ctypeslib.as_array(data_pointer, shape=(_buffer.bufsize // (bitwidth // 8),)) + return x + + +def convert_categorical_column(col: ProtocolColumn) -> pandas.Series: + """ + Convert a categorical column to a Series instance. 
+ """ + ordered, is_dict, mapping = col.describe_categorical.values() + if not is_dict: + raise NotImplementedError("Non-dictionary categoricals not supported yet") + + categories = np.asarray(list(mapping.values())) + codes_buffer, codes_dtype = col.get_buffers()["data"] + codes = buffer_to_ndarray(codes_buffer, codes_dtype, col.offset) + values = categories[codes] + + cat = pandas.Categorical(values, categories=categories, ordered=ordered) + series = pandas.Series(cat) + null_kind = col.describe_null[0] + if null_kind == 2: # sentinel value + sentinel = col.describe_null[1] + series[codes == sentinel] = np.nan + elif null_kind == 3: + null_values = get_null_positions_from_bit_mask( + col.get_buffers()["validity"][0], col.offset, col.size + ) + series[null_values] = np.nan + elif null_kind == 0: + pass + else: + raise NotImplementedError( + "Only categorical columns with sentinel " "value supported at the moment" + ) + + return series, codes_buffer + + +def get_null_positions_from_bit_mask(buffer, offset, mask_length): + ctypes_type = np.ctypeslib.as_ctypes_type(np.uint8) + data_pointer = ctypes.cast((buffer.ptr + offset // 8), ctypes.POINTER(ctypes_type)) + + first_byte_offset = offset % 8 + x = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) + + null_mask = np.zeros(mask_length, dtype=bool) + # Proccessing the first byte separately as it has its own offset + val = x[0] + mask_idx = 0 + for j in range(min(8 - first_byte_offset, mask_length)): + if not val & (1 << (j + first_byte_offset)): + null_mask[mask_idx] = True + mask_idx += 1 + + for i in range(1, mask_length // 8): + val = x[i] + for j in range(8): + if not val & (1 << j): + null_mask[mask_idx] = True + mask_idx += 1 + + if len(x) > 1: + # Processing reminder of last byte + val = x[-1] + for j in range(len(null_mask) - mask_idx): + if not val & (1 << j): + null_mask[mask_idx] = True + mask_idx += 1 + + return null_mask + + +def convert_string_column(col: ProtocolColumn) -> np.ndarray: + """ + Convert a string column to a NumPy array. 
+ """ + # Retrieve the data buffers + # breakpoint() + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + dbuffer, bdtype = buffers["data"] + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + obuffer, odtype = buffers["offsets"] + + # Retrieve the mask buffer indicating the presence of missing values + mbuffer, mdtype = buffers["validity"] or (None, None) + # Retrieve the missing value encoding + null_kind, null_value = col.describe_null + + # Convert the buffers to NumPy arrays + dt = ( + DTypeKind.UINT, + 8, + None, + None, + ) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) + dbuf = buffer_to_ndarray(dbuffer, dt, 0) + # breakpoint() + obuf = buffer_to_ndarray(obuffer, odtype, col.offset) + # breakpoint() + if null_kind == 4: + mbuf = buffer_to_ndarray(mbuffer, mdtype, col.offset, allow_none_buffer=True) + elif null_kind == 3: + mbuf = get_null_positions_from_bit_mask(mbuffer, col.offset, col.size) + + # Assemble the strings from the code units + str_list = [] + for i in range(obuf.size - 1): + # Check for missing values + if null_kind == 3 and mbuf[i]: # bit mask + str_list.append(np.nan) + continue + elif null_kind == 4 and mbuf[i] == null_value: # byte mask + str_list.append(np.nan) + continue + + # Extract a range of code units + units = dbuf[obuf[i] : obuf[i + 1]] + + # Convert the list of code units to bytes + b = bytes(units) + + # Create the string + s = b.decode(encoding="utf-8") + + # Add to our list of strings + str_list.append(s) + # breakpoint() + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object"), buffers diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py index cde70dfd793..e33ebcf63e1 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py @@ -13,7 +13,7 @@ import pyarrow as pa -from typing import Tuple +from typing import Tuple, Optional from modin.core.dataframe.base.exchange.dataframe_protocol.utils import DlpackDeviceType from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( ProtocolBuffer, @@ -46,18 +46,19 @@ class OmnisciProtocolBuffer(ProtocolBuffer): and a copy is needed, a ``RuntimeError`` will be raised. """ - def __init__(self, buff: pa.Buffer, allow_copy: bool = True) -> None: + def __init__(self, buff: pa.Buffer, size: Optional[int] = None) -> None: """ Handle only regular columns (= numpy arrays) for now. """ self._buff = buff + self._size = self._buff.size if size is None else size @property def bufsize(self) -> int: """ Buffer size in bytes. 
""" - return self._buff.size + return self._size @property def ptr(self) -> int: diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py index b189c8f2dee..7247d2be910 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -16,6 +16,7 @@ import numpy as np from typing import Any, Optional, Tuple, Dict, Iterable +from math import ceil from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( DTypeKind, ColumnNullType, @@ -117,7 +118,11 @@ def offset(self) -> int: int The offset of first element. """ - return self._col._offset + # The offset may change if it would require to cast buffers as the casted ones + # no longer depend on their parent tables. So materializing casted buffers + # before accessing the offset + self._materialize_actual_buffers() + return self._pyarrow_table.column(0).chunks[0].offset @property def dtype(self) -> Tuple[DTypeKind, int, str, str]: @@ -182,13 +187,16 @@ def _dtype_from_pyarrow(self, dtype): or pa.types.is_time(dtype) ): kind = DTypeKind.DATETIME + bit_width = dtype.bit_width elif pa.types.is_dictionary(dtype): kind = DTypeKind.CATEGORICAL + bit_width = dtype.bit_width elif pa.types.is_string(dtype): kind = DTypeKind.STRING + bit_width = 8 if kind is not None: - return (kind, dtype.bit_width, arrow_dtype_to_arrow_c(dtype), "=") + return (kind, bit_width, arrow_dtype_to_arrow_c(dtype), "=") else: return self._dtype_from_primitive_pandas(np.dtype(dtype.to_pandas_dtype())) @@ -364,13 +372,8 @@ def get_buffers(self) -> Dict[str, Any]: if self.num_chunks() != 1: raise NotImplementedError() - external_dtype = self.dtype - internal_dtype = self._dtype_from_pyarrow(self._arrow_dtype) - - if external_dtype != internal_dtype: - at = self._propagate_dtype(external_dtype) - else: - at = self._pyarrow_table + self._materialize_actual_buffers() + at = self._pyarrow_table pyarrow_array = at.column(0).chunks[0] result = dict() @@ -380,25 +383,46 @@ def get_buffers(self) -> Dict[str, Any]: return result + def _materialize_actual_buffers(self): + external_dtype = self.dtype + internal_dtype = self._dtype_from_pyarrow(self._arrow_dtype) + + if external_dtype[0] != internal_dtype[0]: + self._propagate_dtype(external_dtype) + + def _get_buffer_size(self, bit_width, is_offset_buffer=False): + elements_in_buffer = self.size + 1 if is_offset_buffer else self.size + return ceil((bit_width * elements_in_buffer) / 8) + def _get_data_buffer(self, arr): + if self.dtype[0] == DTypeKind.CATEGORICAL: + arr = arr.indices + arrow_type = self._dtype_from_pyarrow(arr.type) - if arrow_type[0] == DTypeKind.CATEGORICAL: - arr = arr.indices + buff_size = ( + self._get_buffer_size(bit_width=arrow_type[1]) + if self.dtype[0] != DTypeKind.STRING + else None + ) - data_buffer = OmnisciProtocolBuffer(arr.buffers()[-1]) - return data_buffer, arrow_type + return ( + OmnisciProtocolBuffer(arr.buffers()[-1], buff_size), + arrow_type, + ) def _get_validity_buffer(self, arr): validity_buffer = arr.buffers()[0] if validity_buffer is None: return validity_buffer - return OmnisciProtocolBuffer(validity_buffer), ( - DTypeKind.BOOL, - 1, - pandas_dtype_to_arrow_c(np.dtype("bool")), - "=", + data_size = 
self._get_buffer_size(bit_width=1)
+        if self.offset % 8 + self.size > data_size * 8:
+            data_size += 1
+
+        return (
+            OmnisciProtocolBuffer(validity_buffer, data_size),
+            self._dtype_from_primitive_pandas(np.dtype("uint8")),
         )
 
     def _get_offsets_buffer(self, arr):
@@ -406,6 +430,64 @@ def _get_offsets_buffer(self, arr):
         if len(buffs) < 3:
             return None
 
-        return OmnisciProtocolBuffer(buffs[1]), self._dtype_from_primitive_pandas(
-            np.dtype("int32")
+        offset_buff = buffs[1]
+
+        dtype = self._dtype_from_primitive_pandas(np.dtype("int32"))
+        return (
+            OmnisciProtocolBuffer(
+                offset_buff,
+                self._get_buffer_size(bit_width=dtype[1], is_offset_buffer=True),
+            ),
+            dtype,
         )
+
+    def _propagate_dtype(self, dtype):
+        if not self._col._allow_copy:
+            raise RuntimeError("Copy required with 'allow_copy=False' flag")
+
+        arrow_types_map = {
+            DTypeKind.BOOL: {8: pa.bool_()},
+            DTypeKind.INT: {
+                8: pa.int8(),
+                16: pa.int16(),
+                32: pa.int32(),
+                64: pa.int64(),
+            },
+            DTypeKind.UINT: {
+                8: pa.uint8(),
+                16: pa.uint16(),
+                32: pa.uint32(),
+                64: pa.uint64(),
+            },
+            DTypeKind.FLOAT: {16: pa.float16(), 32: pa.float32(), 64: pa.float64()},
+            DTypeKind.STRING: {8: pa.string()},
+        }
+        kind, bit_width, format_str, _ = dtype
+        arrow_type = None
+
+        if kind in arrow_types_map:
+            arrow_type = arrow_types_map[kind].get(bit_width, None)
+        elif kind == DTypeKind.DATETIME:
+            arrow_type = pa.timestamp("ns")
+        elif kind == DTypeKind.CATEGORICAL:
+            arrow_type = pa.dictionary(
+                index_type=arrow_types_map[DTypeKind.INT][bit_width],
+                value_type=pa.string(),
+            )
+
+        if arrow_type is None:
+            raise NotImplementedError(f"Propagation for type {dtype} is not supported.")
+
+        at = self._pyarrow_table
+        schema_to_cast = at.schema
+        field = at.schema[0]
+
+        schema_to_cast = schema_to_cast.set(
+            0, pa.field(field.name, arrow_type, field.nullable)
+        )
+
+        # TODO: currently, each column chunk casts its buffers independently which results
+        # in an `NCHUNKS - 1` amount of redundant casts. We can make the pyarrow table
+        # shared across all the chunks, so that a cast triggered in a single chunk
+        # propagates to all of them.
+        self._col._replace_at(at.cast(schema_to_cast))
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
index e25c3c1917d..1be8d3188b0 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
@@ -81,7 +81,6 @@ def __init__(
         df: ModinDataframe,
         nan_as_null: bool = False,
         allow_copy: bool = True,
-        offset: int = 0,
     ) -> None:
         if nan_as_null:
             raise NotImplementedError(
@@ -91,7 +90,6 @@ def __init__(
         self._df = df
         self._nan_as_null = nan_as_null
         self._allow_copy = allow_copy
-        self._offset = offset
 
     @property
     def metadata(self):
@@ -247,6 +245,9 @@ def _pyarrow_table(self):
         assert at is not None
         return at
 
+    def _replace_at(self, at):
+        self._df = self._df.from_arrow(at)
+
     def column_names(self) -> Iterable[str]:
         """
         Return an iterator yielding the column names.
@@ -272,7 +273,6 @@ def get_column(self, i: int) -> OmnisciProtocolColumn: OmnisciProtocolDataframe( self._df.mask(col_positions=[i]), allow_copy=self._allow_copy, - offset=self._offset, ), ) @@ -289,7 +289,6 @@ def get_column_by_name(self, name: str) -> OmnisciProtocolColumn: OmnisciProtocolDataframe( self._df.mask(col_labels=[name]), allow_copy=self._allow_copy, - offset=self._offset, ), ) @@ -307,7 +306,6 @@ def get_columns(self) -> Iterable[OmnisciProtocolColumn]: OmnisciProtocolDataframe( self._df.mask(col_labels=[name]), allow_copy=self._allow_copy, - offset=self._offset, ), ) @@ -326,7 +324,6 @@ def select_columns(self, indices: Sequence[int]) -> "DataFrame": return OmnisciProtocolDataframe( self._df.mask(col_positions=list(indices)), allow_copy=self._allow_copy, - offset=self._offset, ) def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": @@ -344,7 +341,6 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": return OmnisciProtocolDataframe( self._df.mask(col_labels=list(names)), allow_copy=self._allow_copy, - offset=self._offset, ) def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: @@ -415,5 +411,4 @@ def _yield_chunks(self, chunk_slices): ), allow_copy=self._allow_copy, nan_as_null=self._nan_as_null, - offset=chunk_slices[i], ) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py index 00fa6f77a09..3ff6c70e18d 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py @@ -12,6 +12,7 @@ # governing permissions and limitations under the License. import pyarrow as pa +import numpy as np from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( ArrowCTypes, @@ -21,11 +22,16 @@ def arrow_dtype_to_arrow_c(dtype): if pa.types.is_timestamp(dtype): - return ArrowCTypes.TIMESTAMP.format(resolution=dtype.unit, tz=dtype.tz or "") + return ArrowCTypes.TIMESTAMP.format( + resolution=dtype.unit[:1], tz=dtype.tz or "" + ) elif pa.types.is_date(dtype): return getattr(ArrowCTypes, f"DATE{dtype.bit_width}", "DATE64") elif pa.types.is_time(dtype): - return ArrowCTypes.TIME.format(resolution=dtype.unit) + # TODO: for some reason `time32` type doesn't have a `unit` attribute, + # always return "s" for now. 
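+    # For reference, the resulting Arrow C format strings look like
+    # (illustrative values): timestamp[ms, tz=UTC] -> "tsm:UTC",
+    # time32[s] -> "tts"; the resolution is abbreviated to its first letter.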
+ # return ArrowCTypes.TIME.format(resolution=dtype.unit[:1]) + return ArrowCTypes.TIME.format(resolution="s") elif pa.types.is_dictionary(dtype): return arrow_dtype_to_arrow_c(dtype.index_type) else: diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index 7c527d5164b..7d9099947a0 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -13,6 +13,65 @@ """Dataframe exchange protocol tests that are specific for OmniSci implementation.""" +from modin.experimental.core.execution.native.implementations.omnisci_on_native.exchange.dataframe_protocol.__utils import ( + from_dataframe, +) -def test_zero_copy(): - pass +import modin.pandas as pd +import pandas +import numpy as np + +from modin.pandas.test.utils import df_equals + +data = { + "a": np.array([1, -2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "b": np.array( + [2**64 - 1, 2**64 - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.uint64 + ), + "c": np.array(np.arange(12), dtype="datetime64[ns]"), + "d": np.array(["a", "b", "c"] * 4), + "e": pandas.Categorical(["a", "b", "c"] * 4), +} + + +def test_export(): + md_df = pd.DataFrame(data) + exported_df = from_dataframe(md_df._query_compiler._modin_frame) + df_equals(md_df, exported_df) + + exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=3) + df_equals(md_df, exported_df) + + exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=5) + df_equals(md_df, exported_df) + + exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=12) + df_equals(md_df, exported_df) + + +data_null = { + "a": np.array([1, -2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.int64), + "b": np.array( + [2**64 - 1, 2**64 - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.uint64 + ), + "c": np.array( + [1, 2, None, 4, 5, 6, None, 8, 9, None, None, 12], dtype="datetime64[ns]" + ), + "d": np.array(["a", "b", None] * 4), + "e": pandas.Categorical(["a", None, "c"] * 4), +} + + +def test_export_nulls(): + md_df = pd.DataFrame(data_null) + exported_df = from_dataframe(md_df._query_compiler._modin_frame) + df_equals(md_df, exported_df) + + exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=3) + df_equals(md_df, exported_df) + + exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=5) + df_equals(md_df, exported_df) + + exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=12) + df_equals(md_df, exported_df) From 609c86f20e4b06c4594a6688d5e29726db4cd50c Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Wed, 2 Mar 2022 14:43:35 +0300 Subject: [PATCH 03/33] Fix formatting Signed-off-by: Dmitry Chigarev --- .../exchange/dataframe_protocol/__utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py index 93be29cb710..9cbba8efe9c 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py @@ -96,7 +96,7 @@ def _get_pandas_df(df): def convert_datetime_col(col): if col.describe_null[0] not in (0, 3): 
raise NotImplementedError( - "Null values represented as masks or " "sentinel values not handled yet" + "Null values represented as masks or sentinel values not handled yet" ) _, _, fmt, _ = col.dtype @@ -150,7 +150,7 @@ def convert_column_to_ndarray(col: ProtocolColumn) -> np.ndarray: if col.describe_null[0] not in (0, 1, 3): raise NotImplementedError( - "Null values represented as masks or " "sentinel values not handled yet" + "Null values represented as masks or sentinel values not handled yet" ) _buffer, _dtype = col.get_buffers()["data"] @@ -226,7 +226,7 @@ def convert_categorical_column(col: ProtocolColumn) -> pandas.Series: pass else: raise NotImplementedError( - "Only categorical columns with sentinel " "value supported at the moment" + "Only categorical columns with sentinel value supported at the moment" ) return series, codes_buffer From be045ceb88ba008497fece663467bc6f044e5579 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 7 Mar 2022 13:34:24 +0300 Subject: [PATCH 04/33] Adding more dtypes for testing Signed-off-by: Dmitry Chigarev --- .../omnisci/test_protocol.py | 125 +++++++++++++----- 1 file changed, 92 insertions(+), 33 deletions(-) diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index 7d9099947a0..e5e63dcbcf2 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -16,25 +16,92 @@ from modin.experimental.core.execution.native.implementations.omnisci_on_native.exchange.dataframe_protocol.__utils import ( from_dataframe, ) +from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import ( + ForceOmnisciImport, +) +import pytest import modin.pandas as pd import pandas import numpy as np from modin.pandas.test.utils import df_equals -data = { - "a": np.array([1, -2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "b": np.array( - [2**64 - 1, 2**64 - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.uint64 - ), - "c": np.array(np.arange(12), dtype="datetime64[ns]"), - "d": np.array(["a", "b", "c"] * 4), - "e": pandas.Categorical(["a", "b", "c"] * 4), -} - -def test_export(): +def get_all_types(has_nulls=False): + int_data = {} + uint_data = {} + float_data = {} + datetime_data = {} + string_data = {} + category_data = {} + + # int + for width in (8, 16, 32, 64): + dtype = getattr(np, f"int{width}") + max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min + int_data[f"int{width}_col"] = np.array( + [max_val, max_val - 1, min_val, min_val + 1] * 10, dtype=dtype + ) + + # uint + for width in (8, 16, 32, 64): + dtype = getattr(np, f"uint{width}") + max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min + uint_data[f"uint{width}_col"] = np.array( + [max_val, max_val - 1, min_val, min_val + 1] * 10, dtype=dtype + ) + + # float + for width in (32, 64): + dtype = getattr(np, f"float{width}") + max_val, min_val = np.finfo(dtype).max, np.finfo(dtype).min + float_data[f"float{width}_col"] = np.array( + [max_val, max_val - 1, min_val, min_val + 1] * 10, dtype=dtype + ) + if has_nulls: + float_data[f"float{width}_null_col"] = np.array( + [max_val, None, min_val, min_val + 1] * 10, dtype=dtype + ) + + # datetime + for unit in ("s", "ms", "ns"): + datetime_data[f"datetime64[{unit}]_col"] = np.array( + [0, 1, 2, 3] * 10, dtype=np.dtype(f"datetime64[{unit}]") + ) + if has_nulls: + datetime_data[f"datetime64[{unit}]_null_col"] = np.array( + [0, None, 2, 
3] * 10, dtype=np.dtype(f"datetime64[{unit}]") + ) + + # string + string_data["string_col"] = np.array(["Sample", "te", "", "xt"] * 10) + if has_nulls: + string_data["string_null_col"] = np.array(["Sample", None, "", "xt"] * 10) + + # category + category_data["category_int_col"] = pandas.Categorical([1, 2, 3, 4] * 10) + category_data["category_string_col"] = pandas.Categorical( + ["Sample", "te", "", "xt"] * 10 + ) + if has_nulls: + category_data["category_string_null_col"] = pandas.Categorical( + ["Sample", None, "", "xt"] * 10 + ) + + return { + **int_data, + **uint_data, + **float_data, + **datetime_data, + **string_data, + **category_data, + } + + +@pytest.mark.parametrize("data_has_nulls", [True, False]) +def test_simple_export(data_has_nulls): + data = get_all_types(has_nulls=data_has_nulls) md_df = pd.DataFrame(data) exported_df = from_dataframe(md_df._query_compiler._modin_frame) df_equals(md_df, exported_df) @@ -49,29 +116,21 @@ def test_export(): df_equals(md_df, exported_df) -data_null = { - "a": np.array([1, -2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.int64), - "b": np.array( - [2**64 - 1, 2**64 - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=np.uint64 - ), - "c": np.array( - [1, 2, None, 4, 5, 6, None, 8, 9, None, None, 12], dtype="datetime64[ns]" - ), - "d": np.array(["a", "b", None] * 4), - "e": pandas.Categorical(["a", None, "c"] * 4), -} +# @pytest.mark.parametrize("data_has_nulls", [True, False]) +# def test_export_from_omnisci(data_has_nulls): +# data = get_all_types(has_nulls=data_has_nulls) +# md_df = pd.DataFrame(data) +# with ForceOmnisciImport(md_df) as instance: +# # md_df_exported +# exported_df = from_dataframe(md_df._query_compiler._modin_frame) +# df_equals(md_df, exported_df) -def test_export_nulls(): - md_df = pd.DataFrame(data_null) - exported_df = from_dataframe(md_df._query_compiler._modin_frame) - df_equals(md_df, exported_df) +# exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=3) +# df_equals(md_df, exported_df) - exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=3) - df_equals(md_df, exported_df) +# exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=5) +# df_equals(md_df, exported_df) - exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=5) - df_equals(md_df, exported_df) - - exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=12) - df_equals(md_df, exported_df) +# exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=12) +# df_equals(md_df, exported_df) From ad68ef6bd266e966f47b938a6824ceedef7ff686 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 7 Mar 2022 16:04:30 +0300 Subject: [PATCH 05/33] Add more tests Signed-off-by: Dmitry Chigarev --- .../omnisci_on_native/dataframe/dataframe.py | 1 - .../exchange/dataframe_protocol/__utils.py | 32 +-- .../exchange/dataframe_protocol/column.py | 64 ++++-- .../omnisci/test_protocol.py | 209 ++++++++++-------- .../dataframe_protocol/omnisci/utils.py | 171 ++++++++++++++ 5 files changed, 355 insertions(+), 122 deletions(-) create mode 100644 modin/test/exchange/dataframe_protocol/omnisci/utils.py diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py index 0481a79bed8..d2bd2ad7a37 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py +++ 
b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
@@ -665,7 +665,6 @@ def fillna(self, value=None, method=None, axis=None, limit=None, downcast=None):
             index_cols=self._index_cols,
             force_execution_mode=self._force_execution_mode,
         )
-
         return new_frame
 
     def dropna(self, subset, how="any"):
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py
index 9cbba8efe9c..cad08b686b5 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py
@@ -129,7 +129,7 @@ def convert_datetime_col(col):
         raise NotImplementedError(f"Datetime is not supported: {fmt}")
 
     if col.describe_null[0] == 3:
-        null_mask = get_null_positions_from_bit_mask(
+        null_mask = ~bitmask_to_bool_array(
             col.get_buffers()["validity"][0], col.offset, col.size
         )
         data[null_mask] = None
@@ -152,12 +152,12 @@ def convert_column_to_ndarray(col: ProtocolColumn) -> np.ndarray:
         raise NotImplementedError(
            "Null values represented as masks or sentinel values not handled yet"
         )
 
     _buffer, _dtype = col.get_buffers()["data"]
-    data, _bfr = buffer_to_ndarray(_buffer, _dtype, col.offset), _buffer
+    data, _bfr = buffer_to_ndarray(_buffer, _dtype, col.offset, col.size), _buffer
 
     if col.describe_null[0] == 3:
-        null_pos = get_null_positions_from_bit_mask(
+        null_pos = ~bitmask_to_bool_array(
             col.get_buffers()["validity"][0], col.offset, col.size
         )
         if np.any(null_pos):
@@ -168,7 +168,9 @@ def convert_column_to_ndarray(col: ProtocolColumn) -> np.ndarray:
 
     return data, _bfr
 
-def buffer_to_ndarray(_buffer, _dtype, offset, allow_none_buffer=False) -> np.ndarray:
+def buffer_to_ndarray(
+    _buffer, _dtype, offset, length=None, allow_none_buffer=False
+) -> np.ndarray:
     # Handle the dtype
     if allow_none_buffer and _buffer is None:
         return None
@@ -178,6 +180,9 @@ def buffer_to_ndarray(_buffer, _dtype, offset, allow_none_buffer=False) -> np.nd
     if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL):
         raise RuntimeError("Not a boolean, integer or floating-point dtype")
 
+    if bitwidth == 1:
+        return bitmask_to_bool_array(_buffer, offset, length)
+
     _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}
     _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}
     _floats = {32: np.float32, 64: np.float64}
@@ -209,7 +214,8 @@ def convert_categorical_column(col: 
ProtocolColumn) -> pandas.Series:
     categories = np.asarray(list(mapping.values()))
     codes_buffer, codes_dtype = col.get_buffers()["data"]
     codes = buffer_to_ndarray(codes_buffer, codes_dtype, col.offset)
-    values = categories[codes]
+    # Doing modulo in order not to get IndexError for negative sentinel values in the `codes`
+    values = categories[codes % len(categories)]
 
     cat = pandas.Categorical(values, categories=categories, ordered=ordered)
     series = pandas.Series(cat)
@@ -218,7 +224,7 @@ def convert_categorical_column(col: ProtocolColumn) -> pandas.Series:
         sentinel = col.describe_null[1]
         series[codes == sentinel] = np.nan
     elif null_kind == 3:
-        null_values = get_null_positions_from_bit_mask(
+        null_values = ~bitmask_to_bool_array(
             col.get_buffers()["validity"][0], col.offset, col.size
         )
         series[null_values] = np.nan
@@ -232,10 +238,10 @@ def convert_categorical_column(col: ProtocolColumn) -> pandas.Series:
     return series, codes_buffer
 
-def get_null_positions_from_bit_mask(buffer, offset, mask_length):
+def bitmask_to_bool_array(buffer, offset, mask_length):
     ctypes_type = np.ctypeslib.as_ctypes_type(np.uint8)
     data_pointer = ctypes.cast((buffer.ptr + offset // 8), ctypes.POINTER(ctypes_type))
 
     first_byte_offset = offset % 8
     x = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
 
@@ -244,14 +250,14 @@ def get_null_positions_from_bit_mask(buffer, offset, mask_length):
     val = x[0]
     mask_idx = 0
     for j in range(min(8 - first_byte_offset, mask_length)):
-        if not val & (1 << (j + first_byte_offset)):
+        if val & (1 << (j + first_byte_offset)):
             null_mask[mask_idx] = True
             mask_idx += 1
 
     for i in range(1, mask_length // 8):
         val = x[i]
         for j in range(8):
-            if not val & (1 << j):
+            if val & (1 << j):
                 null_mask[mask_idx] = True
                 mask_idx += 1
 
@@ -259,7 +265,7 @@ def get_null_positions_from_bit_mask(buffer, offset, mask_length):
         # Processing the remainder of the last byte
         val = x[-1]
         for j in range(len(null_mask) - mask_idx):
-            if not val & (1 << j):
+            if val & (1 << j):
                 null_mask[mask_idx] = True
                 mask_idx += 1
 
@@ -299,7 +305,7 @@ def convert_string_column(col: ProtocolColumn) -> np.ndarray:
     if null_kind == 4:
         mbuf = buffer_to_ndarray(mbuffer, mdtype, col.offset, allow_none_buffer=True)
     elif null_kind == 3:
-        mbuf = get_null_positions_from_bit_mask(mbuffer, col.offset, col.size)
+        mbuf = ~bitmask_to_bool_array(mbuffer, col.offset, col.size)
 
     # Assemble the strings from the code units
     str_list = []
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
index 7247d2be910..3f9ef01db51 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
@@ -165,15 +165,13 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]:
         """
         dtype = self._pandas_dtype
 
-        if pandas.api.types.is_datetime64_dtype(dtype):
+        if pandas.api.types.is_bool_dtype(dtype):
+            return (DTypeKind.BOOL, 1, pandas_dtype_to_arrow_c(np.dtype("bool")), "=")
+        elif pandas.api.types.is_datetime64_dtype(
+            dtype
+        ) or pandas.api.types.is_categorical_dtype(dtype):
+            # For these types we have to use internal arrow dtype to get proper metadata
             return self._dtype_from_pyarrow(self._arrow_dtype)
-        elif pandas.api.types.is_categorical_dtype(dtype):
-            return (
-                DTypeKind.CATEGORICAL,
-                32,
-                pandas_dtype_to_arrow_c(np.dtype("int32")),
-                "=",
-            )
         elif pandas.api.types.is_string_dtype(dtype):
             return (DTypeKind.STRING, 8, pandas_dtype_to_arrow_c(dtype), "=")
         else:
@@ -194,6 +192,9 @@ def _dtype_from_pyarrow(self, dtype):
         elif pa.types.is_string(dtype):
             kind = DTypeKind.STRING
             bit_width = 8
+        elif pa.types.is_boolean(dtype):
+            kind = DTypeKind.BOOL
+            bit_width = dtype.bit_width
 
         if kind is not None:
             return (kind, bit_width, arrow_dtype_to_arrow_c(dtype), "=")
@@ -254,7 +255,18 @@ def describe_categorical(self) -> Dict[str, Any]:
         )
 
         ordered = dtype.ordered
-        mapping = {index: value for index, value in enumerate(dtype.categories)}
+
+        # Category codes may change during materialization flow, so trigger
+        # materialization before returning the codes
+        self._materialize_actual_buffers()
+        col = self._pyarrow_table.column(0)
+        if len(col.chunks) > 1:
+            if not 
self._col._allow_copy: + raise RuntimeError("Copy required but 'allow_copy=False'") + col = col.combine_chunks() + col = col.chunks[0] + + mapping = {index: value for index, value in enumerate(col.dictionary.tolist())} return { "is_ordered": ordered, @@ -391,8 +403,35 @@ def _materialize_actual_buffers(self): self._propagate_dtype(external_dtype) def _get_buffer_size(self, bit_width, is_offset_buffer=False): + """ + Compute chunk size in bytes. + + Parameters + ---------- + bit_width : int + Bit width of the underlying data type. + is_offset_buffer : bool, default: False + Whether the buffer describes element offsets. + + Returns + ------- + int + Number of bytes to read from the start of the buffer to read the whole chunk. + """ + # Offset buffer always has `size + 1` elements in it as it describes slice bounds elements_in_buffer = self.size + 1 if is_offset_buffer else self.size - return ceil((bit_width * elements_in_buffer) / 8) + result = ceil((bit_width * elements_in_buffer) / 8) + # For a bitmask, if the chunk started in the middle of the byte then we need to + # read one extra byte from the buffer to retrieve the tail in the last byte. Example: + # Bitmask of 3 bytes, the chunk offset is 3 elements and its size is 16 + # |* * * * * * * *|* * * * * * * *|* * * * * * * *| + # ^- the chunk starts here ^- the chunk ends here + # Although ``ceil(bit_width * elements_in_buffer / 8)`` gives us '2 bytes', + # the chunk is located in 3 bytes, that's why we assume the chunk's buffer size + # to be 'result += 1' in this case: + if bit_width == 1 and self.offset % 8 + self.size > result * 8: + result += 1 + return result def _get_data_buffer(self, arr): if self.dtype[0] == DTypeKind.CATEGORICAL: @@ -417,12 +456,11 @@ def _get_validity_buffer(self, arr): return validity_buffer data_size = self._get_buffer_size(bit_width=1) - if self.offset % 8 + self.size > data_size * 8: - data_size += 1 return ( OmnisciProtocolBuffer(validity_buffer, data_size), - self._dtype_from_primitive_pandas(np.dtype("uint8")), + # self._dtype_from_primitive_pandas(np.dtype("uint8")), + (DTypeKind.BOOL, 1, "b", "="), ) def _get_offsets_buffer(self, arr): diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index e5e63dcbcf2..dd604fdfa01 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -13,124 +13,143 @@ """Dataframe exchange protocol tests that are specific for OmniSci implementation.""" -from modin.experimental.core.execution.native.implementations.omnisci_on_native.exchange.dataframe_protocol.__utils import ( - from_dataframe, -) -from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import ( - ForceOmnisciImport, -) - import pytest import modin.pandas as pd +import pyarrow as pa import pandas -import numpy as np from modin.pandas.test.utils import df_equals +from modin.pandas.utils import from_arrow +from .utils import get_all_types, split_df_into_chunks, export_frame -def get_all_types(has_nulls=False): - int_data = {} - uint_data = {} - float_data = {} - datetime_data = {} - string_data = {} - category_data = {} - - # int - for width in (8, 16, 32, 64): - dtype = getattr(np, f"int{width}") - max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min - int_data[f"int{width}_col"] = np.array( - [max_val, max_val - 1, min_val, min_val + 1] * 10, dtype=dtype - ) - # uint - for width in (8, 16, 
32, 64): - dtype = getattr(np, f"uint{width}") - max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min - uint_data[f"uint{width}_col"] = np.array( - [max_val, max_val - 1, min_val, min_val + 1] * 10, dtype=dtype - ) +@pytest.mark.parametrize("data_has_nulls", [True, False]) +@pytest.mark.parametrize("from_omnisci", [True, False]) +def test_simple_export(data_has_nulls, from_omnisci): + if from_omnisci: + # OmniSci can't import 'uint64' as well as booleans + # issue for bool: https://github.com/modin-project/modin/issues/4299 + exclude_dtypes = ["bool", "uint64"] + else: + exclude_dtypes = None + + data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=exclude_dtypes) + md_df = pd.DataFrame(data) - # float - for width in (32, 64): - dtype = getattr(np, f"float{width}") - max_val, min_val = np.finfo(dtype).max, np.finfo(dtype).min - float_data[f"float{width}_col"] = np.array( - [max_val, max_val - 1, min_val, min_val + 1] * 10, dtype=dtype - ) - if has_nulls: - float_data[f"float{width}_null_col"] = np.array( - [max_val, None, min_val, min_val + 1] * 10, dtype=dtype - ) - - # datetime - for unit in ("s", "ms", "ns"): - datetime_data[f"datetime64[{unit}]_col"] = np.array( - [0, 1, 2, 3] * 10, dtype=np.dtype(f"datetime64[{unit}]") - ) - if has_nulls: - datetime_data[f"datetime64[{unit}]_null_col"] = np.array( - [0, None, 2, 3] * 10, dtype=np.dtype(f"datetime64[{unit}]") - ) - - # string - string_data["string_col"] = np.array(["Sample", "te", "", "xt"] * 10) - if has_nulls: - string_data["string_null_col"] = np.array(["Sample", None, "", "xt"] * 10) - - # category - category_data["category_int_col"] = pandas.Categorical([1, 2, 3, 4] * 10) - category_data["category_string_col"] = pandas.Categorical( - ["Sample", "te", "", "xt"] * 10 - ) - if has_nulls: - category_data["category_string_null_col"] = pandas.Categorical( - ["Sample", None, "", "xt"] * 10 - ) + exported_df = export_frame(md_df, from_omnisci) + df_equals(md_df, exported_df) + + exported_df = export_frame(md_df, from_omnisci, nchunks=3) + df_equals(md_df, exported_df) + + exported_df = export_frame(md_df, from_omnisci, nchunks=5) + df_equals(md_df, exported_df) - return { - **int_data, - **uint_data, - **float_data, - **datetime_data, - **string_data, - **category_data, - } + exported_df = export_frame(md_df, from_omnisci, nchunks=12) + df_equals(md_df, exported_df) +@pytest.mark.parametrize("nchunks", [2, 4, 7]) @pytest.mark.parametrize("data_has_nulls", [True, False]) -def test_simple_export(data_has_nulls): - data = get_all_types(has_nulls=data_has_nulls) - md_df = pd.DataFrame(data) - exported_df = from_dataframe(md_df._query_compiler._modin_frame) +def test_export_aligned_at_chunks(nchunks, data_has_nulls): + """Test export from DataFrame exchange protocol when internal arrow table is equaly chunked.""" + # Modin DataFrame constructor can't process pyarrow's category, so exclude it + data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) + pd_df = pandas.DataFrame(data) + pd_chunks = split_df_into_chunks(pd_df, nchunks) + + chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks]) + md_df = from_arrow(chunked_at) + assert ( + len(md_df._query_compiler._modin_frame._partitions[0][0].get().column(0).chunks) + == nchunks + ) + + exported_df = export_frame(md_df) + df_equals(md_df, exported_df) + + exported_df = export_frame(md_df, nchunks=nchunks) + df_equals(md_df, exported_df) + + exported_df = export_frame(md_df, nchunks=nchunks * 2) df_equals(md_df, exported_df) - 
exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=3) + exported_df = export_frame(md_df, nchunks=nchunks * 3) df_equals(md_df, exported_df) - exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=5) + +@pytest.mark.parametrize("data_has_nulls", [True, False]) +def test_export_unaligned_at_chunks(data_has_nulls): + """ + Test export from DataFrame exchange protocol when internal arrow table's chunks are unaligned. + + Arrow table allows for its columns to be chunked independently. Unaligned chunking means that + each column has its individual chunking and so some preprocessing is required in order + to emulate equaly chunked columns in the protocol. + """ + # Modin DataFrame constructor can't process pyarrow's category, so exclude it + data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) + pd_df = pandas.DataFrame(data) + # divide columns in 3 groups: unchunked, 2-chunked, 7-chunked + chunk_groups = [1, 2, 7] + chunk_col_ilocs = [ + slice( + i * len(pd_df.columns) // len(chunk_groups), + (i + 1) * len(pd_df.columns) // len(chunk_groups), + ) + for i in range(len(chunk_groups)) + ] + + pd_chunk_groups = [ + split_df_into_chunks(pd_df.iloc[:, cols], nchunks) + for nchunks, cols in zip(chunk_groups, chunk_col_ilocs) + ] + at_chunk_groups = [ + pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in chunk_group]) + for chunk_group in pd_chunk_groups + ] + + chunked_at = at_chunk_groups[0] + # TODO: appending columns one by one looks inefficient, is there a better way? + for _at in at_chunk_groups[1:]: + for field in _at.schema: + chunked_at = chunked_at.append_column(field, _at[field.name]) + md_df = from_arrow(chunked_at) + + # verify that test generated the correct chunking + internal_at = md_df._query_compiler._modin_frame._partitions[0][0].get() + for nchunks_group, cols in zip(chunk_groups, chunk_col_ilocs): + for col in internal_at.select(range(cols.start, cols.stop)).columns: + assert len(col.chunks) == nchunks_group + + nchunks = md_df.__dataframe__().num_chunks() + + exported_df = export_frame(md_df) df_equals(md_df, exported_df) - exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=12) + exported_df = export_frame(md_df, nchunks=nchunks) df_equals(md_df, exported_df) + exported_df = export_frame(md_df, nchunks=nchunks * 2) + df_equals(md_df, exported_df) -# @pytest.mark.parametrize("data_has_nulls", [True, False]) -# def test_export_from_omnisci(data_has_nulls): -# data = get_all_types(has_nulls=data_has_nulls) -# md_df = pd.DataFrame(data) + exported_df = export_frame(md_df, nchunks=nchunks * 3) + df_equals(md_df, exported_df) -# with ForceOmnisciImport(md_df) as instance: -# # md_df_exported -# exported_df = from_dataframe(md_df._query_compiler._modin_frame) -# df_equals(md_df, exported_df) -# exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=3) -# df_equals(md_df, exported_df) +def test_export_when_delayed_computations(): + # OmniSci can't import 'uint64' as well as booleans, so exclude them + # issue for bool: https://github.com/modin-project/modin/issues/4299 + data = get_all_types(has_nulls=True, exclude_dtypes=["uint64", "bool"]) + md_df = pd.DataFrame(data) + pd_df = pandas.DataFrame(data) -# exported_df = from_dataframe(md_df._query_compiler._modin_frame, nchunks=5) -# df_equals(md_df, exported_df) + md_res = md_df.fillna({"float32_null": 32.0, "float64_null": 64.0}) + pd_res = pd_df.fillna({"float32_null": 32.0, "float64_null": 64.0}) + assert ( + not 
md_res._query_compiler._modin_frame._has_arrow_table()
+    ), "There are no delayed computations for the frame"
+
+    exported_df = export_frame(md_res)
+    df_equals(exported_df, pd_res)
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
new file mode 100644
index 00000000000..11845f6154b
--- /dev/null
+++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
@@ -0,0 +1,171 @@
+# Licensed to Modin Development Team under one or more contributor license agreements.
+# See the NOTICE file distributed with this work for additional information regarding
+# copyright ownership. The Modin Development Team licenses this file to you under the
+# Apache License, Version 2.0 (the "License"); you may not use this file except in
+# compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under
+# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific language
+# governing permissions and limitations under the License.
+
+"""Utility functions for testing the OmniSciOnNative implementation of the DataFrame exchange protocol."""
+
+import pandas
+import numpy as np
+
+from modin.experimental.core.execution.native.implementations.omnisci_on_native.exchange.dataframe_protocol.__utils import (
+    from_dataframe,
+)
+from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import (
+    ForceOmnisciImport,
+)
+
+
+def split_df_into_chunks(df, nchunks):
+    """
+    Split the passed DataFrame into `nchunks` along the row axis.
+
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame to split into chunks.
+    nchunks : int
+        Number of chunks to split `df` into.
+
+    Returns
+    -------
+    list of DataFrames
+    """
+    chunks = []
+    for i in range(nchunks):
+        start = i * len(df) // nchunks
+        end = (i + 1) * len(df) // nchunks
+        chunks.append(df.iloc[start:end])
+
+    return chunks
+
+
+def export_frame(md_df, from_omnisci=False, **kwargs):
+    """
+    Construct ``pandas.DataFrame`` from ``modin.pandas.DataFrame`` using DataFrame exchange protocol.
+
+    Parameters
+    ----------
+    md_df : modin.pandas.DataFrame
+        DataFrame to convert to pandas.
+    from_omnisci : bool, default: False
+        Whether to forcibly use data exported from OmniSci. If `True`, import DataFrame's
+        data into OmniSci and then export it back, so that the origin of the underlying
+        `md_df` data is OmniSci.
+    **kwargs : dict
+        Additional parameters to pass to the ``from_dataframe`` function.
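+        For example, ``nchunks=3`` (forwarded to ``from_dataframe``) makes the
+        export emulate a source split into three chunks, as in the hypothetical
+        call ``export_frame(md_df, nchunks=3)``.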
+ + Returns + ------- + pandas.DataFrame + """ + if not from_omnisci: + return from_dataframe(md_df._query_compiler._modin_frame, **kwargs) + + with ForceOmnisciImport(md_df) as instance: + md_df_exported = instance.export_frames()[0] + exported_df = from_dataframe( + md_df_exported._query_compiler._modin_frame, **kwargs + ) + + return exported_df + + +def get_all_types(has_nulls=False, exclude_dtypes=None): + bool_data = {} + int_data = {} + uint_data = {} + float_data = {} + datetime_data = {} + string_data = {} + category_data = {} + + # bool + bool_data["bool"] = np.array([True, False, True, True] * 10, dtype=bool) + + # int + for width in (8, 16, 32, 64): + dtype = getattr(np, f"int{width}") + max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min + int_data[f"int{width}"] = np.array( + [max_val, max_val - 1, min_val + 1, min_val + 2] * 10, dtype=dtype + ) + + # uint + for width in (8, 16, 32, 64): + dtype = getattr(np, f"uint{width}") + max_val, min_val = np.iinfo(dtype).max, np.iinfo(dtype).min + uint_data[f"uint{width}"] = np.array( + [max_val, max_val - 1, min_val + 1, min_val + 2] * 10, dtype=dtype + ) + + # float + for width in (32, 64): + dtype = getattr(np, f"float{width}") + max_val, min_val = np.finfo(dtype).max, np.finfo(dtype).min + float_data[f"float{width}"] = np.array( + [max_val, max_val - 1, min_val + 1, min_val + 2] * 10, dtype=dtype + ) + if has_nulls: + float_data[f"float{width}_null"] = np.array( + [max_val, None, min_val + 1, min_val + 2] * 10, dtype=dtype + ) + + # datetime + for unit in ("s", "ms", "ns"): + datetime_data[f"datetime64[{unit}]"] = np.array( + [0, 1, 2, 3] * 10, dtype=np.dtype(f"datetime64[{unit}]") + ) + if has_nulls: + datetime_data[f"datetime64[{unit}]_null"] = np.array( + [0, None, 2, 3] * 10, dtype=np.dtype(f"datetime64[{unit}]") + ) + + # string + string_data["string"] = np.array( + ["English: test string", " ", "Chinese: 测试字符串", "Russian: тестовая строка"] * 10 + ) + if has_nulls: + string_data["string_null"] = np.array( + ["English: test string", None, "Chinese: 测试字符串", "Russian: тестовая строка"] + * 10 + ) + + # category + category_data["category_string"] = pandas.Categorical( + ["Sample", "te", " ", "xt"] * 10 + ) + # OmniSci does not support non-string categories + # category_data["category_int"] = pandas.Categorical([1, 2, 3, 4] * 10) + if has_nulls: + category_data["category_string_null"] = pandas.Categorical( + ["Sample", None, " ", "xt"] * 10 + ) + + data = { + **bool_data, + **int_data, + **uint_data, + **float_data, + **datetime_data, + **string_data, + **category_data, + } + + if exclude_dtypes is not None: + filtered_keys = ( + key + for key in data.keys() + if not any(key.startswith(dtype) for dtype in exclude_dtypes) + ) + data = {key: data[key] for key in filtered_keys} + + return data From 11f9230c1ca68412a7d6ae0315c1a6408014eee2 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 7 Mar 2022 23:12:47 +0300 Subject: [PATCH 06/33] Write proper doc-strings Signed-off-by: Dmitry Chigarev --- .github/workflows/ci.yml | 1 + .../exchange/dataframe_protocol/__utils.py | 0 .../omnisci_on_native/dataframe/dataframe.py | 1 + .../exchange/dataframe_protocol/buffer.py | 63 +-- .../exchange/dataframe_protocol/column.py | 412 ++++++++---------- .../exchange/dataframe_protocol/dataframe.py | 219 +++------- .../exchange/dataframe_protocol/utils.py | 17 +- .../dataframe_protocol/omnisci/utils.py | 2 +- 8 files changed, 279 insertions(+), 436 deletions(-) rename 
modin/{experimental/core/execution/native/implementations/omnisci_on_native => core/dataframe/base}/exchange/dataframe_protocol/__utils.py (100%) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9fb3d42bb94..ff0292f5c72 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -115,6 +115,7 @@ jobs: modin/experimental/core/execution/native/implementations/omnisci_on_native/expr.py \ modin/experimental/core/execution/native/implementations/omnisci_on_native/omnisci_worker.py \ - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/omnisci + - run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol lint-flake8: name: lint (flake8) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py b/modin/core/dataframe/base/exchange/dataframe_protocol/__utils.py similarity index 100% rename from modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__utils.py rename to modin/core/dataframe/base/exchange/dataframe_protocol/__utils.py diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py index d2bd2ad7a37..0481a79bed8 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py @@ -665,6 +665,7 @@ def fillna(self, value=None, method=None, axis=None, limit=None, downcast=None): index_cols=self._index_cols, force_execution_mode=self._force_execution_mode, ) + return new_frame def dropna(self, subset, how="any"): diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py index e33ebcf63e1..2dcaafbf38d 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py @@ -11,6 +11,8 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +"""The module houses OmnisciOnNative implementation of the Buffer class of DataFrame exchange protocol.""" + import pyarrow as pa from typing import Tuple, Optional @@ -19,87 +21,44 @@ ProtocolBuffer, ) +from modin.utils import _inherit_docstrings + +@_inherit_docstrings(ProtocolBuffer) class OmnisciProtocolBuffer(ProtocolBuffer): """ - Data in the buffer is guaranteed to be contiguous in memory. - - Note that there is no dtype attribute present, a buffer can be thought of - as simply a block of memory. However, if the column that the buffer is - attached to has a dtype that's supported by DLPack and ``__dlpack__`` is - implemented, then that dtype information will be contained in the return - value from ``__dlpack__``. - - This distinction is useful to support both (a) data exchange via DLPack on a - buffer and (b) dtypes like variable-length strings which do not have a - fixed number of bytes per element. 
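+
+    A rough usage sketch (``pa.py_buffer`` is plain PyArrow; the byte values
+    are made up for illustration)::
+
+        buff = pa.py_buffer(b"\x01\x02\x03\x04")
+        OmnisciProtocolBuffer(buff).bufsize           # 4 - the physical size
+        OmnisciProtocolBuffer(buff, size=2).bufsize   # 2 - a virtual chunk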
+ Wrapper of the ``pyarrow.Buffer`` object representing a continuous segment of memory. Parameters ---------- - x : np.ndarray + buff : pyarrow.Buffer Data to be held by ``Buffer``. - allow_copy : bool, default: True - A keyword that defines whether or not the library is allowed - to make a copy of the data. For example, copying data would be necessary - if a library supports strided buffers, given that this protocol - specifies contiguous buffers. Currently, if the flag is set to ``False`` - and a copy is needed, a ``RuntimeError`` will be raised. + size : int, optional + Size of the buffer in bytes, if not specified use ``buff.size``. + The parameter may be usefull for specifying the size of a virtual chunk. """ def __init__(self, buff: pa.Buffer, size: Optional[int] = None) -> None: - """ - Handle only regular columns (= numpy arrays) for now. - """ self._buff = buff self._size = self._buff.size if size is None else size @property def bufsize(self) -> int: - """ - Buffer size in bytes. - """ return self._size @property def ptr(self) -> int: - """ - Pointer to start of the buffer as an integer. - """ return self._buff.address def __dlpack__(self): - """ - DLPack not implemented in NumPy yet, so leave it out here. - - Produce DLPack capsule (see array API standard). - Raises: - - TypeError : if the buffer contains unsupported dtypes. - - NotImplementedError : if DLPack support is not implemented - Useful to have to connect to array libraries. Support optional because - it's not completely trivial to implement for a Python-only library. - """ raise NotImplementedError("__dlpack__") def __dlpack_device__(self) -> Tuple[DlpackDeviceType, int]: - """ - Device type and device ID for where the data in the buffer resides. - Uses device type codes matching DLPack. Enum members are:: - - CPU = 1 - - CUDA = 2 - - CPU_PINNED = 3 - - OPENCL = 4 - - VULKAN = 7 - - METAL = 8 - - VPI = 9 - - ROCM = 10 - Note: must be implemented even if ``__dlpack__`` is not. - """ - return (DlpackDeviceType.CPU, None) def __repr__(self) -> str: """ - Return a string representation for a particular ``Buffer``. + Produce string representation of the buffer. Returns ------- diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py index 3f9ef01db51..d350378a04c 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -11,6 +11,8 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +"""The module houses OmnisciOnNative implementation of the Column class of DataFrame exchange protocol.""" + import pyarrow as pa import pandas import numpy as np @@ -25,144 +27,51 @@ from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( ProtocolColumn, ) +from modin.utils import _inherit_docstrings from .buffer import OmnisciProtocolBuffer from .utils import arrow_dtype_to_arrow_c +@_inherit_docstrings(ProtocolColumn) class OmnisciProtocolColumn(ProtocolColumn): """ - A column object, with only the methods and properties required by the interchange protocol defined. - - A column can contain one or more chunks. 
Each chunk can contain up to three
-    buffers - a data buffer, a mask buffer (depending on null representation),
-    and an offsets buffer (if variable-size binary; e.g., variable-length strings).
-
-    TBD: Arrow has a separate "null" dtype, and has no separate mask concept.
-    Instead, it seems to use "children" for both columns with a bit mask,
-    and for nested dtypes. Unclear whether this is elegant or confusing.
-    This design requires checking the null representation explicitly.
-    The Arrow design requires checking:
-    1. the ARROW_FLAG_NULLABLE (for sentinel values)
-    2. if a column has two children, combined with one of those children
-    having a null dtype.
-    Making the mask concept explicit seems useful. One null dtype would
-    not be enough to cover both bit and byte masks, so that would mean
-    even more checking if we did it the Arrow way.
-    TBD: there's also the "chunk" concept here, which is implicit in Arrow as
-    multiple buffers per array (= column here). Semantically it may make
-    sense to have both: chunks were meant for example for lazy evaluation
-    of data which doesn't fit in memory, while multiple buffers per column
-    could also come from doing a selection operation on a single
-    contiguous buffer.
-    Given these concepts, one would expect chunks to be all of the same
-    size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows),
-    while multiple buffers could have data-dependent lengths. Not an issue
-    in pandas if one column is backed by a single NumPy array, but in
-    Arrow it seems possible.
-    Are multiple chunks *and* multiple buffers per column necessary for
-    the purposes of this interchange protocol, or must producers either
-    reuse the chunk concept for this or copy the data?
+    Wrapper of ``OmnisciProtocolDataframe`` holding a single column.
+
+    The Column object wraps a ``ProtocolDataframe`` to ease referencing the original
+    Modin DataFrame without materializing the PyArrow table where possible.
+    ``ProtocolDataframe`` also already implements methods like chunking and ``allow_copy``
+    checks, so we can simply forward those calls to the ``ProtocolDataframe`` without
+    reimplementing them.
 
     Parameters
     ----------
-    column : DataFrame
-        A ``DataFrame`` object.
-    allow_copy : bool, default: True
-        A keyword that defines whether or not the library is allowed
-        to make a copy of the data. For example, copying data would be necessary
-        if a library supports strided buffers, given that this protocol
-        specifies contiguous buffers. Currently, if the flag is set to ``False``
-        and a copy is needed, a ``RuntimeError`` will be raised.
-    offset : int, default: 0
-        The offset of the first element
+    column : OmnisciProtocolDataframe
+        DataFrame protocol object holding a PyArrow table with a single column.
 
     Notes
     -----
-    This Column object can only be produced by ``__dataframe__``,
-    so doesn't need its own version or ``__column__`` protocol.
+    The object could be modified inplace due to casting PyArrow buffers to a new dtype
+    (``_propagate_dtype``, ``_cast_at``): the methods replace the wrapped
+    ``OmnisciProtocolDataframe`` object with the new one holding the casted PyArrow table.
     """
 
-    def __init__(self, column: "DataFrame") -> None:
-        """
-        Note: doesn't deal with extension arrays yet, just assume a regular
-        Series/ndarray for now.
-        """
+    def __init__(self, column: "OmnisciProtocolDataframe") -> None:
        self._col = column
 
     @property
     def size(self) -> int:
-        """
-        Size of the column, in elements.
- - Corresponds to DataFrame.num_rows() if column is a single chunk; - equal to size of this current chunk otherwise. - - Returns - ------- - int - Size of the column, in elements. - """ return self._col.num_rows() @property def offset(self) -> int: - """ - Get the offset of first element. - - May be > 0 if using chunks; for example for a column - with N chunks of equal size M (only the last chunk may be shorter), - ``offset = n * M``, ``n = 0 .. N-1``. - - Returns - ------- - int - The offset of first element. - """ # The offset may change if it would require to cast buffers as the casted ones - # no longer depend on their parent tables. So materializing casted buffers - # before accessing the offset + # no longer depend on their parent tables. So materialize the buffers + # before returning the offset self._materialize_actual_buffers() return self._pyarrow_table.column(0).chunks[0].offset @property def dtype(self) -> Tuple[DTypeKind, int, str, str]: - """ - Dtype description as a tuple ``(kind, bit-width, format string, endianness)``, where - - * Kind : DTypeKind - * Bit-width : the number of bits as an integer - * Format string : data type description format string in Apache Arrow C - Data Interface format. - * Endianness : current only native endianness (``=``) is supported - - Kind : - - - INT = 0 # infer - - UINT = 1 # infer - - FLOAT = 2 # infer - - BOOL = 20 # infer - - STRING = 21 # infer? - - DATETIME = 22 # have to materialize to deduce resolution (always should be ns???) - - CATEGORICAL = 23 # not implemented error - - Notes - ----- - - Kind specifiers are aligned with DLPack where possible - (hence the jump to 20, leave enough room for future extension). - - Masks must be specified as boolean with either bit width 1 (for bit masks) - or 8 (for byte masks). - - Dtype width in bits was preferred over bytes - - Endianness isn't too useful, but included now in case in the future - we need to support non-native endianness - - Went with Apache Arrow format strings over NumPy format strings - because they're more complete from a dataframe perspective - - Format strings are mostly useful for datetime specification, and for categoricals. - - For categoricals, the format string describes the type of the categorical - in the data buffer. In case of a separate encoding of the categorical - (e.g. an integer to string mapping), this can be derived from ``self.describe_categorical``. - - Data types not included: complex, Arrow-style null, binary, decimal, - and nested (list, struct, map, union) dtypes. - """ dtype = self._pandas_dtype if pandas.api.types.is_bool_dtype(dtype): @@ -178,6 +87,18 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]: return self._dtype_from_primitive_pandas(dtype) def _dtype_from_pyarrow(self, dtype): + """ + Build protocol dtype from PyArrow type. + + Parameters + ---------- + dtype : pyarrow.DataType + Data type to convert from. + + Returns + ------- + tuple(DTypeKind, bitwidth: int, format_str: str, endianness: str) + """ kind = None if ( pa.types.is_timestamp(dtype) @@ -203,15 +124,24 @@ def _dtype_from_pyarrow(self, dtype): def _dtype_from_primitive_pandas(self, dtype) -> Tuple[DTypeKind, int, str, str]: """ - See `self.dtype` for details. + Build protocol dtype from primitive pandas dtype. + + Parameters + ---------- + dtype : {np.int, np.uint, np.float, np.bool} + Data type to convert from.
+ + Returns + ------- + tuple(DTypeKind, bitwidth: int, format_str: str, endianness: str) """ - _np_kinds = { + np_kinds = { "i": DTypeKind.INT, "u": DTypeKind.UINT, "f": DTypeKind.FLOAT, "b": DTypeKind.BOOL, } - kind = _np_kinds.get(dtype.kind, None) + kind = np_kinds.get(dtype.kind, None) if kind is None: raise NotImplementedError( f"Data type {dtype} not supported by exchange protocol" ) @@ -225,28 +155,6 @@ def _dtype_from_primitive_pandas(self, dtype) -> Tuple[DTypeKind, int, str, str] @property def describe_categorical(self) -> Dict[str, Any]: - """ - If the dtype is categorical, there are two options: - - There are only values in the data buffer. - - There is a separate dictionary-style encoding for categorical values. - - TBD: are there any other in-memory representations that are needed? - - Returns - ------- - dict - Content of returned dict: - - "is_ordered" : bool, whether the ordering of dictionary indices is - semantically meaningful. - - "is_dictionary" : bool, whether a dictionary-style mapping of - categorical values to other objects exists - - "mapping" : dict, Python-level only (e.g. ``{int: str}``). - None if not a dictionary-style categorical. - - Raises - ------ - ``RuntimeError`` if the dtype is not categorical. - """ dtype = self._pandas_dtype if dtype != "category": @@ -259,13 +167,17 @@ def describe_categorical(self) -> Dict[str, Any]: # Category codes may change during materialization flow, so trigger # materialization before returning the codes self._materialize_actual_buffers() + + # Although we can retrieve codes from pandas dtype, they're unsynced with + # the actual PyArrow data most of the time. So get the mapping directly + # from the materialized PyArrow table. col = self._pyarrow_table.column(0) if len(col.chunks) > 1: if not self._col._allow_copy: raise RuntimeError("Copy required but 'allow_copy=False'") col = col.combine_chunks() - col = col.chunks[0] + col = col.chunks[0] mapping = {index: value for index, value in enumerate(col.dictionary.tolist())} return { @@ -276,26 +188,6 @@ @property def describe_null(self) -> Tuple[ColumnNullType, Any]: - """ - Return the missing value (or "null") representation the column dtype uses. - - Return as a tuple ``(kind, value)``. - - * Kind: - - 0 : non-nullable - - 1 : NaN/NaT - - 2 : sentinel value - - 3 : bit mask - - 4 : byte mask - * Value : if kind is "sentinel value", the actual value. If kind is a bit - mask or a byte mask, the value (0 or 1) indicating a missing value. None - otherwise. - - Returns - ------- - tuple - ``(kind, value)``. - """ null_buffer = self._pyarrow_table.column(0).chunks[0].buffers()[0] if null_buffer is None: return (ColumnNullType.NON_NULLABLE, None) @@ -304,84 +196,56 @@ @property def null_count(self) -> int: - """ - Number of null elements, if known. - Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. - """ ncount = self._pyarrow_table.column(0).null_count return ncount if ncount >= 0 else None @property def metadata(self) -> Dict[str, Any]: - """ - The metadata for the column. See `DataFrame.metadata` for more details. - """ - return {} + return self._col.metadata @property - def _pandas_dtype(self): + def _pandas_dtype(self) -> np.dtype: + """ + Get column's dtype representation in Modin DataFrame.
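To make the mapping above concrete, a small sketch (assuming only the kind values documented earlier: INT = 0, UINT = 1, FLOAT = 2, BOOL = 20) of turning a primitive NumPy dtype into the first two fields of the protocol dtype tuple:

import numpy as np
from modin.core.dataframe.base.exchange.dataframe_protocol.utils import DTypeKind

np_kinds = {
    "i": DTypeKind.INT,
    "u": DTypeKind.UINT,
    "f": DTypeKind.FLOAT,
    "b": DTypeKind.BOOL,
}
dtype = np.dtype("int64")
# kind == DTypeKind.INT, bit width == itemsize * 8 == 64
print((np_kinds[dtype.kind], dtype.itemsize * 8))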
+ + Returns + ------- + numpy.dtype + """ return self._col._df.dtypes.iloc[0] @property - def _arrow_dtype(self): + def _arrow_dtype(self) -> pa.DataType: + """ + Get column's dtype representation in underlying PyArrow table. + + Returns + ------- + pyarrow.DataType + """ return self._pyarrow_table.column(0).type @property - def _pyarrow_table(self): - return self._col._pyarrow_table - - def num_chunks(self) -> int: + def _pyarrow_table(self) -> pa.Table: """ - Return the number of chunks the column consists of. + Get PyArrow table representing the column. Returns ------- - int - The number of chunks the column consists of. + pyarrow.Table """ + return self._col._pyarrow_table + + def num_chunks(self) -> int: return self._col.num_chunks() def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]: - """ - Return an iterator yielding the chunks. - - By default ``n_chunks=None``, yields the chunks that the data is stored as by the producer. - If given, ``n_chunks`` must be a multiple of ``self.num_chunks()``, - meaning the producer must subdivide each chunk before yielding it. - - Parameters - ---------- - n_chunks : int, optional - Number of chunks to yield. - - Yields - ------ - DataFrame - A ``DataFrame`` object(s). - """ for chunk in self._col.get_chunks(n_chunks): yield OmnisciProtocolColumn(chunk) def get_buffers(self) -> Dict[str, Any]: - """ - Return a dictionary containing the underlying buffers. - - Returns - ------- - dict - - "data": a two-element tuple whose first element is a buffer - containing the data and whose second element is the data buffer's associated dtype. - - "validity": a two-element tuple whose first element is a buffer - containing mask values indicating missing data and - whose second element is the mask value buffer's - associated dtype. None if the null representation is not a bit or byte mask. - - "offsets": a two-element tuple whose first element is a buffer - containing the offset values for variable-size binary data - (e.g., variable-length strings) and whose second element is the offsets - buffer's associated dtype. None if the data buffer does not have - an associated offsets buffer. - """ if self.num_chunks() != 1: + # TODO: do chunks combining raise NotImplementedError() self._materialize_actual_buffers() @@ -396,33 +260,42 @@ def get_buffers(self) -> Dict[str, Any]: return result def _materialize_actual_buffers(self): + """ + Materialize PyArrow table's buffers that can be zero-copy returned to a consumer, if they aren't already materialized. + + Besides materializing PyArrow table itself (if there were some delayed computations) + the function also propagates external dtypes to the PyArrow table. For example, + if ``self.dtype`` is a string kind, but internal PyArrow dtype is a dictionary + (if the table were just exported from OmniSci), then the dictionary will be casted + to string dtype. + """ external_dtype = self.dtype internal_dtype = self._dtype_from_pyarrow(self._arrow_dtype) if external_dtype[0] != internal_dtype[0]: self._propagate_dtype(external_dtype) - def _get_buffer_size(self, bit_width, is_offset_buffer=False): + def _get_buffer_size(self, bit_width: int, is_offset_buffer: bool = False) -> int: """ - Compute chunk size in bytes. + Compute buffer's size in bytes for the current chunk. Parameters ---------- bit_width : int Bit width of the underlying data type. is_offset_buffer : bool, default: False - Whether the buffer describes element offsets. + Whether the buffer describes offsets. 
Returns ------- int - Number of bytes to read from the start of the buffer to read the whole chunk. + Number of bytes to read from the start of the buffer + offset to retrieve the whole chunk. """ - # Offset buffer always has `size + 1` elements in it as it describes slice bounds + # Offset buffer always has ``size + 1`` elements in it as it describes slice bounds elements_in_buffer = self.size + 1 if is_offset_buffer else self.size result = ceil((bit_width * elements_in_buffer) / 8) # For a bitmask, if the chunk started in the middle of the byte then we need to - # read one extra byte from the buffer to retrieve the tail in the last byte. Example: + # read one extra byte from the buffer to retrieve the chunk's tail in the last byte. Example: # Bitmask of 3 bytes, the chunk offset is 3 elements and its size is 16 # |* * * * * * * *|* * * * * * * *|* * * * * * * *| # ^- the chunk starts here ^- the chunk ends here result += 1 return result - def _get_data_buffer(self, arr): + def _get_data_buffer( + self, arr: pa.Array + ) -> Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]: + """ + Get column's data buffer. + + Parameters + ---------- + arr : pa.Array + PyArrow array holding column's data. + + Returns + ------- + tuple + Tuple of OmnisciProtocolBuffer and protocol dtype representation of the buffer's underlying data. + """ if self.dtype[0] == DTypeKind.CATEGORICAL: + # For dictionary data the buffer has to return the category codes arr = arr.indices arrow_type = self._dtype_from_pyarrow(arr.type) - buff_size = ( self._get_buffer_size(bit_width=arrow_type[1]) if self.dtype[0] != DTypeKind.STRING + # We don't chunk string buffers as it would require modifying offset values, + # so just return the whole data buffer for every chunk. else None ) @@ -450,26 +340,58 @@ def _get_data_buffer(self, arr): arrow_type, ) - def _get_validity_buffer(self, arr): + def _get_validity_buffer( + self, arr: pa.Array + ) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]]: + """ + Get column's validity buffer. + + Parameters + ---------- + arr : pa.Array + PyArrow array holding column's data. + + Returns + ------- + tuple or None + Tuple of OmnisciProtocolBuffer and protocol dtype representation of the buffer's underlying data. + None if the column is non-nullable (``self.describe_null[0] == ColumnNullType.NON_NULLABLE``). + """ validity_buffer = arr.buffers()[0] if validity_buffer is None: return validity_buffer + # If it exists, the validity buffer is always a bit-mask. data_size = self._get_buffer_size(bit_width=1) - return ( OmnisciProtocolBuffer(validity_buffer, data_size), # self._dtype_from_primitive_pandas(np.dtype("uint8")), (DTypeKind.BOOL, 1, "b", "="), ) - def _get_offsets_buffer(self, arr): + def _get_offsets_buffer( + self, arr: pa.Array + ) -> Optional[Tuple[OmnisciProtocolBuffer, Tuple[DTypeKind, int, str, str]]]: + """ + Get column's offsets buffer. + + Parameters + ---------- + arr : pa.Array + PyArrow array holding column's data. + + Returns + ------- + tuple or None + Tuple of OmnisciProtocolBuffer and protocol dtype representation of the buffer's underlying data. + None if the column's dtype is fixed-size.
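A standalone sketch of the size computation described above (my own helper, mirroring the commented logic; the extra-byte rule applies only to bit-masks):

from math import ceil

def chunk_buffer_nbytes(bit_width, size, elements_offset, is_offset_buffer=False):
    # An offsets buffer stores `size + 1` slice bounds.
    n_elements = size + 1 if is_offset_buffer else size
    nbytes = ceil(bit_width * n_elements / 8)
    # A bit-mask chunk starting in the middle of a byte spills into one extra byte.
    if bit_width == 1 and elements_offset % 8 != 0:
        nbytes += 1
    return nbytes

# The example from the comment above: a 16-element bit-mask chunk starting
# at element 3 occupies 3 bytes, not ceil(16 / 8) == 2.
assert chunk_buffer_nbytes(bit_width=1, size=16, elements_offset=3) == 3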
+ """ buffs = arr.buffers() if len(buffs) < 3: return None offset_buff = buffs[1] - + # According to Arrow's data layout, the offset buffer type is "int32" dtype = self._dtype_from_primitive_pandas(np.dtype("int32")) return ( OmnisciProtocolBuffer( @@ -479,7 +401,18 @@ def _get_offsets_buffer(self, arr): dtype, ) - def _propagate_dtype(self, dtype): + def _propagate_dtype(self, dtype: Tuple[DTypeKind, int, str, str]): + """ + Propagate `dtype` to the underlying PyArrow table. + + Modifies the column object inplace by replacing underlying PyArrow table with + the casted one. + + Parameters + ---------- + dtype : tuple + Data type conforming protocol dtypes format to cast underlying PyArrow table. + """ if not self._col._allow_copy: raise RuntimeError("Copy required with 'allow_copy=False' flag") @@ -508,10 +441,14 @@ def _propagate_dtype(self, dtype): elif kind == DTypeKind.DATETIME: arrow_type = pa.timestamp("ns") elif kind == DTypeKind.CATEGORICAL: - arrow_type = pa.dictionary( - index_type=arrow_types_map[DTypeKind.INT][bit_width], - value_type=pa.string(), - ) + index_type = arrow_types_map[DTypeKind.INT].get(bit_width, None) + if index_type is not None: + arrow_type = pa.dictionary( + index_type=index_type, + # There is no way to deduce an actual value type, so casting to a string + # as it's the most common one + value_type=pa.string(), + ) if arrow_type is None: raise NotImplementedError(f"Propagation for type {dtype} is not supported.") @@ -525,7 +462,28 @@ def _propagate_dtype(self, dtype): ) # TODO: currently, each column chunk casts its buffers independently which results - # in an `NCHUNKS - 1` amount of redundant casts. We can make the pyarrow table + # in an `NCHUNKS - 1` amount of redundant casts. We can make the PyArrow table # being shared across all the chunks, so the cast being triggered in a single chunk # propagate to all of them. - self._col._replace_at(at.cast(schema_to_cast)) + self._cast_at(schema_to_cast) + + def _cast_at(self, new_schema: pa.Schema): + """ + Cast underlying PyArrow table with the passed schema. + + Parameters + ---------- + new_schema : pyarrow.Schema + New schema to cast the table. + + Notes + ----- + This method modifies the column inplace by replacing the wrapped ``OmnisciProtocolDataframe`` + with the new one holding the casted PyArrow table. + """ + casted_at = self._pyarrow_table.cast(new_schema) + self._col = type(self._col)( + self._col._df.from_arrow(casted_at), + self._col._nan_as_null, + self._col._allow_copy, + ) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py index 1be8d3188b0..f1a396fe28a 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py @@ -11,30 +11,18 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -""" -Dataframe exchange protocol implementation. - -See more in https://data-apis.org/dataframe-protocol/latest/index.html. - -Public API ----------- -from_dataframe : construct a DataFrame from an input data frame which - implements the exchange protocol. 
-Notes ------ -- Interpreting a raw pointer (as in ``Buffer.ptr``) is annoying and unsafe to - do in pure Python. It's more general but definitely less friendly than having - ``to_arrow`` and ``to_numpy`` methods. So for the buffers which lack - ``__dlpack__`` (e.g., because the column dtype isn't supported by DLPack), - this is worth looking at again. -""" +"""The module houses OmnisciOnNative implementation of the Dataframe class of DataFrame exchange protocol.""" import collections import numpy as np +import pyarrow as pa -from typing import Optional, Iterable, Sequence -from modin.core.dataframe.base.dataframe.dataframe import ModinDataframe +from typing import Optional, Iterable, Sequence, Dict, Any +from modin.experimental.core.execution.native.implementations.omnisci_on_native.dataframe.dataframe import ( + OmnisciOnNativeDataframe, +) from modin.core.dataframe.base.exchange.dataframe_protocol import ProtocolDataframe +from modin.utils import _inherit_docstrings from modin.experimental.core.execution.native.implementations.omnisci_on_native.df_algebra import ( MaskNode, @@ -45,40 +33,25 @@ from .column import OmnisciProtocolColumn +@_inherit_docstrings(ProtocolDataframe) class OmnisciProtocolDataframe(ProtocolDataframe): """ - A data frame class, with only the methods required by the interchange protocol defined. - - Instances of this (private) class are returned from ``modin.pandas.DataFrame.__dataframe__`` - as objects with the methods and attributes defined on this class. - - A "data frame" represents an ordered collection of named columns. - A column's "name" must be a unique string. Columns may be accessed by name or by position. - This could be a public data frame class, or an object with the methods and - attributes defined on this DataFrame class could be returned from the - ``__dataframe__`` method of a public data frame class in a library adhering - to the dataframe interchange protocol specification. + Implement DataFrame exchange protocol class for OmnisciOnNative execution. Parameters ---------- - df : ModinDataframe - A ``ModinDataframe`` object. - nan_as_null : bool, default:False - A keyword intended for the consumer to tell the producer - to overwrite null values in the data with ``NaN`` (or ``NaT``). - This currently has no effect; once support for nullable extension - dtypes is added, this value should be propagated to columns. + df : OmnisciOnNativeDataframe + DataFrame object that holds the data. + nan_as_null : bool, default: False + Whether to overwrite null values in the data with ``NaN``. allow_copy : bool, default: True - A keyword that defines whether or not the library is allowed - to make a copy of the data. For example, copying data would be necessary - if a library supports strided buffers, given that this protocol - specifies contiguous buffers. Currently, if the flag is set to ``False`` - and a copy is needed, a ``RuntimeError`` will be raised. + Whether to allow copying of the underlying data during the export flow. + If a copy or any other kind of data transfer/materialization is required, a ``RuntimeError`` is raised. """ def __init__( self, - df: ModinDataframe, + df: OmnisciOnNativeDataframe, nan_as_null: bool = False, allow_copy: bool = True, ) -> None: @@ -92,7 +65,7 @@ def __init__( self._allow_copy = allow_copy @property - def metadata(self): + def metadata(self) -> Dict[str, Any]: # TODO: as the frame's index is stored as a separate column inside pyarrow table # we may want to return the column's name here instead of materialized index.
# This will require the internal index column to be visible in the protocol's column # accessor methods. return {"index": self._df.index} def num_columns(self) -> int: - """ - Return the number of columns in the DataFrame. - - Returns - ------- - int - The number of columns in the DataFrame. - """ return len(self._df.columns) def num_rows(self) -> int: - """ - Return the number of rows in the DataFrame, if available. - - Returns - ------- - int - The number of rows in the DataFrame. - """ if not self._allow_copy and not self._is_zero_copy_possible: raise RuntimeError("Copy required with 'allow_copy=False'") return len(self._df.index) def num_chunks(self) -> int: - """ - Return the number of chunks the DataFrame consists of. - - Returns - ------- - int - The number of chunks the DataFrame consists of. - """ return len(self._chunk_slices) - 1 __chunk_slices = None @property - def _chunk_slices(self): + def _chunk_slices(self) -> np.ndarray: """ - Compute chunk start-stop indices in the underlying pyarrow table. + Compute chunk start-stop indices in the underlying PyArrow table. Returns ------- @@ -189,7 +138,7 @@ def _maybe_raise_if_materialize(self): __is_zero_copy_possible = None @property - def _is_zero_copy_possible(self): + def _is_zero_copy_possible(self) -> bool: """ Check whether it's possible to retrieve data from the DataFrame zero-copy. @@ -202,17 +151,22 @@ def _is_zero_copy_possible(self): """ if self.__is_zero_copy_possible is None: if self._df._has_arrow_table(): + # If the PyArrow table is already materialized then we can + # retrieve the data zero-copy self.__is_zero_copy_possible = True elif not self._df._can_execute_arrow(): + # Not being able to execute the plan via Arrow means + # that we have to involve OmniSci, so no zero-copy. self.__is_zero_copy_possible = False else: + # Check whether the plan for PyArrow can be executed zero-copy self.__is_zero_copy_possible = self._is_zero_copy_op(self._df.op) return self.__is_zero_copy_possible @classmethod - def _is_zero_copy_op(cls, op): + def _is_zero_copy_op(cls, op) -> bool: """ - Check whether the passed node of the delayed computation tree could be executed zero-copy via pyarrow execution. + Check whether the passed node of the delayed computation tree could be executed zero-copy via PyArrow execution. Parameters ---------- @@ -224,18 +178,30 @@ def _is_zero_copy_op(cls, op): """ is_zero_copy_op = False if isinstance(op, (FrameNode, TransformNode, UnionNode)): + # - FrameNode: already materialized PyArrow table + # - TransformNode: select certain columns of the table, implemented zero-copy (``df._arrow_select``) + # - UnionNode: concatenate PyArrow tables, implemented zero-copy (``df._arrow_concat``) is_zero_copy_op = True elif isinstance(op, MaskNode) and ( isinstance(op.row_positions, slice) or is_range_like(op.row_positions) ): + # Can select rows zero-copy if the indexer is slice-like (``df._arrow_row_slice``) is_zero_copy_op = True return is_zero_copy_op and all( + # Walk the computation tree cls._is_zero_copy_op(_op) + for _op in getattr(op, "inputs", []) ) @property - def _pyarrow_table(self): - """Get ``pyarrow.Table`` representing the dataframe.""" + def _pyarrow_table(self) -> pa.Table: + """ + Get PyArrow table representing the dataframe.
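The recursive walk above can be pictured with a toy model (the node names below are stand-ins, not the real df_algebra classes): a plan is zero-copy exportable only if every node in its tree belongs to the zero-copy whitelist.

class Node:
    def __init__(self, kind, inputs=()):
        self.kind, self.inputs = kind, inputs

ZERO_COPY_KINDS = {"frame", "transform", "union"}

def is_zero_copy(node):
    # Every node in the tree must itself be a zero-copy operation
    return node.kind in ZERO_COPY_KINDS and all(
        is_zero_copy(inp) for inp in node.inputs
    )

plan = Node("union", [Node("frame"), Node("transform", [Node("frame")])])
assert is_zero_copy(plan)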
+ + Returns + ------- + pyarrow.Table + """ self._maybe_raise_if_materialize() if not self._df._has_arrow_table(): @@ -245,30 +211,11 @@ def _pyarrow_table(self): assert at is not None return at - def _replace_at(self, at): - self._df = self._df.from_arrow(at) - def column_names(self) -> Iterable[str]: - """ - Return an iterator yielding the column names. - - Yields - ------ - str - The name of the column(s). - """ for col in self._df.columns: yield col def get_column(self, i: int) -> OmnisciProtocolColumn: - """ - Return the column at the indicated position. - - Returns - ------- - Column - The column at the indicated position. - """ return OmnisciProtocolColumn( OmnisciProtocolDataframe( self._df.mask(col_positions=[i]), @@ -277,14 +224,6 @@ def get_column(self, i: int) -> OmnisciProtocolColumn: ) def get_column_by_name(self, name: str) -> OmnisciProtocolColumn: - """ - Return the column whose name is the indicated name. - - Returns - ------- - Column - The column whose name is the indicated name. - """ return OmnisciProtocolColumn( OmnisciProtocolDataframe( self._df.mask(col_labels=[name]), @@ -293,14 +232,6 @@ def get_column_by_name(self, name: str) -> OmnisciProtocolColumn: ) def get_columns(self) -> Iterable[OmnisciProtocolColumn]: - """ - Return an iterator yielding the columns. - - Yields - ------ - Column - The ``Column`` object(s). - """ for name in self._df.columns: yield OmnisciProtocolColumn( OmnisciProtocolDataframe( @@ -309,15 +240,7 @@ def get_columns(self) -> Iterable[OmnisciProtocolColumn]: ), ) - def select_columns(self, indices: Sequence[int]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by index. - - Returns - ------- - DataFrame - A new DataFrame with selected a subset of columns by index. - """ + def select_columns(self, indices: Sequence[int]) -> "OmnisciProtocolDataframe": if not isinstance(indices, collections.Sequence): raise ValueError("`indices` is not a sequence") @@ -326,15 +249,9 @@ def select_columns(self, indices: Sequence[int]) -> "DataFrame": allow_copy=self._allow_copy, ) - def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": - """ - Create a new DataFrame by selecting a subset of columns by name. - - Returns - ------- - DataFrame - A new DataFrame with selected a subset of columns by name. - """ + def select_columns_by_name( + self, names: Sequence[str] + ) -> "OmnisciProtocolDataframe": if not isinstance(names, collections.Sequence): raise ValueError("`names` is not a sequence") @@ -343,24 +260,9 @@ def select_columns_by_name(self, names: Sequence[str]) -> "DataFrame": allow_copy=self._allow_copy, ) - def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: - """ - Return an iterator yielding the chunks. - - By default ``n_chunks=None``, yields the chunks that the data is stored as by the producer. - If given, ``n_chunks`` must be a multiple of ``self.num_chunks()``, - meaning the producer must subdivide each chunk before yielding it. - - Parameters - ---------- - n_chunks : int, optional - Number of chunks to yield. - - Yields - ------ - DataFrame - A ``DataFrame`` object(s). - """ + def get_chunks( + self, n_chunks: Optional[int] = None + ) -> Iterable["OmnisciProtocolDataframe"]: if n_chunks is None: return self._yield_chunks(self._chunk_slices) @@ -369,9 +271,17 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: "The passed `n_chunks` has to be a multiple of `num_chunks`." 
) + if n_chunks > self.num_rows(): + raise RuntimeError( + "The passed `n_chunks` value is bigger than the amount of rows in the frame." + ) + extra_chunks = n_chunks - self.num_chunks() subdivided_slices = self._chunk_slices.copy() + # The subdividing behavior is a bit different from "subdividing each chunk": + # instead, it subdivides the biggest chunks first, so that the overall chunking is as + # equal as possible for _ in range(extra_chunks): # 1. Find the biggest chunk # 2. Split it in the middle biggest_chunk_idx = np.argmax(np.diff(subdivided_slices)) new_chunk_offset = ( subdivided_slices[biggest_chunk_idx + 1] - subdivided_slices[biggest_chunk_idx] ) // 2 if new_chunk_offset == 0: - raise RuntimeError( - "The passed `n_chunks` value is bigger than the amout of rows in the frame." - ) + raise RuntimeError("No more chunks to subdivide.") subdivided_slices = np.insert( subdivided_slices, biggest_chunk_idx + 1, @@ -392,17 +300,18 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["DataFrame"]: return self._yield_chunks(subdivided_slices) - def _yield_chunks(self, chunk_slices): + def _yield_chunks(self, chunk_slices) -> "OmnisciProtocolDataframe": """ - Yield dataframe chunks according to the passed chunking. + Yield DataFrame chunks according to the passed offsets. Parameters ---------- chunk_slices : list + Chunking offsets. - Yield - ----- - DataFrame + Yields + ------ + OmnisciProtocolDataframe """ for i in range(len(chunk_slices) - 1): yield OmnisciProtocolDataframe( diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py index 3ff6c70e18d..d6b0bc7ddac 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py @@ -11,6 +11,8 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +"""Utility functions of DataFrame exchange protocol implementation for OmnisciOnNative execution.""" + import pyarrow as pa import numpy as np @@ -20,7 +22,20 @@ ) -def arrow_dtype_to_arrow_c(dtype): +def arrow_dtype_to_arrow_c(dtype: pa.DataType) -> str: + """ + Represent PyArrow `dtype` as a format string in Apache Arrow C notation. + + Parameters + ---------- + dtype : pa.DataType + Datatype of PyArrow table to represent. + + Returns + ------- + str + Format string in Apache Arrow C notation of the given `dtype`.
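A self-contained sketch of the subdivision strategy implemented above (the helper name is mine): repeatedly split the biggest chunk in the middle until the requested number of chunks is reached.

import numpy as np

def subdivide(slices, n_chunks):
    slices = np.asarray(slices)
    while len(slices) - 1 < n_chunks:
        # Find the biggest chunk and split it in the middle
        biggest = np.argmax(np.diff(slices))
        mid = slices[biggest] + (slices[biggest + 1] - slices[biggest]) // 2
        if mid == slices[biggest]:
            raise RuntimeError("No more chunks to subdivide.")
        slices = np.insert(slices, biggest + 1, mid)
    return slices

# 10 rows stored as a single chunk, re-exported as 4 roughly equal chunks:
print(subdivide([0, 10], 4))  # [ 0  2  5  7 10]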
+ """ if pa.types.is_timestamp(dtype): return ArrowCTypes.TIMESTAMP.format( resolution=dtype.unit[:1], tz=dtype.tz or "" diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py index 11845f6154b..d8dc1061cf0 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py @@ -16,7 +16,7 @@ import pandas import numpy as np -from modin.experimental.core.execution.native.implementations.omnisci_on_native.exchange.dataframe_protocol.__utils import ( +from modin.core.dataframe.base.exchange.dataframe_protocol.__utils import ( from_dataframe, ) from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import ( From 577fe37102968f86ef3daedfa10cefae85cbb28a Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Tue, 8 Mar 2022 12:11:30 +0300 Subject: [PATCH 07/33] Refactor 'from_dataframe' module Signed-off-by: Dmitry Chigarev --- .../exchange/dataframe_protocol/__utils.py | 334 ------------------ .../dataframe_protocol/omnisci/utils.py | 6 +- 2 files changed, 4 insertions(+), 336 deletions(-) delete mode 100644 modin/core/dataframe/base/exchange/dataframe_protocol/__utils.py diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/__utils.py b/modin/core/dataframe/base/exchange/dataframe_protocol/__utils.py deleted file mode 100644 index cad08b686b5..00000000000 --- a/modin/core/dataframe/base/exchange/dataframe_protocol/__utils.py +++ /dev/null @@ -1,334 +0,0 @@ -# Licensed to Modin Development Team under one or more contributor license agreements. -# See the NOTICE file distributed with this work for additional information regarding -# copyright ownership. The Modin Development Team licenses this file to you under the -# Apache License, Version 2.0 (the "License"); you may not use this file except in -# compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under -# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific language -# governing permissions and limitations under the License. - -""" -This module contains draft implementations of the functions, converting __dataframe__ -object to `pandas.DataFrame`. The location and implementations of the functions is a -subject to change, however, the contract of `from_dataframe` is supposed to stay the same. -""" - -import pandas -import ctypes -import numpy as np - -from typing import Optional -from modin.core.dataframe.base.exchange.dataframe_protocol.utils import DTypeKind -from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( - ProtocolDataframe, - ProtocolColumn, -) - - -def from_dataframe( - df: ProtocolDataframe, allow_copy: bool = True, nchunks: Optional[int] = None -): - """ - Build ``pandas.DataFrame`` from an object supporting DataFrame exchange protocol (__dataframe__). - - Parameters - ---------- - df : ProtocolDataframe - Object supporting the exchange protocol (__dataframe__). - allow_copy : bool, default True - Whether to allow for `df` providing a copy of underlying data. - nchunks : int, optional - Number of chunks to split `df`. 
- - Returns - ------- - pandas.DataFrame - """ - if not hasattr(df, "__dataframe__"): - raise ValueError("`df` does not support __dataframe__") - - df = df.__dataframe__()["dataframe"] - - def _get_pandas_df(df): - # We need a dict of columns here, with each column being a numpy array (at - # least for now, deal with non-numpy dtypes later). - columns = dict() - _k = DTypeKind - _buffers = [] # hold on to buffers, keeps memory alive - for name in df.column_names(): - if not isinstance(name, str): - raise ValueError(f"Column {name} is not a string") - if name in columns: - raise ValueError(f"Column {name} is not unique") - col = df.get_column_by_name(name) - dtype = col.dtype[0] - if dtype in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - # Simple numerical or bool dtype, turn into numpy array - columns[name], _buf = convert_column_to_ndarray(col) - elif dtype == _k.CATEGORICAL: - columns[name], _buf = convert_categorical_column(col) - elif dtype == _k.STRING: - columns[name], _buf = convert_string_column(col) - elif dtype == _k.DATETIME: - columns[name], _buf = convert_datetime_col(col) - else: - raise NotImplementedError(f"Data type {dtype} not handled yet") - - _buffers.append(_buf) - - pandas_df = pandas.DataFrame(columns) - pandas_df._buffers = _buffers - return pandas_df - - pandas_dfs = [] - for chunk in df.get_chunks(nchunks): - pandas_df = _get_pandas_df(chunk) - pandas_dfs.append(pandas_df) - # Can't preserve index for now - pandas_df = pandas.concat(pandas_dfs, axis=0, ignore_index=True) - return pandas_df - - -def convert_datetime_col(col): - if col.describe_null[0] not in (0, 3): - raise NotImplementedError( - "Null values represented as masks or sentinel values not handled yet" - ) - - _, _, fmt, _ = col.dtype - dbuf, dtype = col.get_buffers()["data"] - data = buffer_to_ndarray(dbuf, (DTypeKind.UINT, dtype[1], "u", "="), col.offset) - if fmt.startswith("ts"): - # timestamp ts{unit}:tz - meta = fmt[2:].split(":") - if len(meta) == 1: - unit = meta[0] - tz = "" - else: - unit, tz = meta - if tz != "": - raise NotImplementedError("Timezones are not supported yet") - if unit != "s": - unit += "s" - data = data.astype(f"datetime64[{unit}]") - elif fmt.startswith("td"): - # date td{Days/Ms} - unit = fmt[2:] - if unit == "D": - # to seconds (converting to uint64 to avoid overflow) - data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") - elif unit == "m": - data = data.astype("datetime64[ms]") - else: - raise NotImplementedError(f"Date unit is not supported: {unit}") - else: - raise NotImplementedError(f"Datetime is not supported: {fmt}") - - if col.describe_null[0] == 3: - null_mask = ~bitmask_to_bool_array( - col.get_buffers()["validity"][0], col.offset, col.size - ) - data[null_mask] = None - elif col.describe_null[0] in (0, 1, 2): - pass - else: - raise NotImplementedError( - "Such null kind is not supported for datetime conversion" - ) - - return data, dbuf - - -def convert_column_to_ndarray(col: ProtocolColumn) -> np.ndarray: - """ - Convert an int, uint, float or bool column to a numpy array. 
- """ - - if col.describe_null[0] not in (0, 1, 3): - raise NotImplementedError( - "Null values represented as masks or sentinel values not handled yet" - ) - # breakpoint() - _buffer, _dtype = col.get_buffers()["data"] - data, _bfr = buffer_to_ndarray(_buffer, _dtype, col.offset, col.size), _buffer - - if col.describe_null[0] == 3: - null_pos = ~bitmask_to_bool_array( - col.get_buffers()["validity"][0], col.offset, col.size - ) - if np.any(null_pos): - # convert to null-able type - data = data.astype(float) - data[null_pos] = np.nan - - return data, _bfr - - -def buffer_to_ndarray( - _buffer, _dtype, offset, length=None, allow_none_buffer=False -) -> np.ndarray: - # Handle the dtype - if allow_none_buffer and _buffer is None: - return None - kind = _dtype[0] - bitwidth = _dtype[1] - _k = DTypeKind - if _dtype[0] not in (_k.INT, _k.UINT, _k.FLOAT, _k.BOOL): - raise RuntimeError("Not a boolean, integer or floating-point dtype") - - if bitwidth == 1: - return bitmask_to_bool_array(_buffer, offset, length) - - _ints = {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64} - _uints = {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64} - _floats = {32: np.float32, 64: np.float64} - _np_dtypes = {0: _ints, 1: _uints, 2: _floats, 20: {8: bool}} - column_dtype = _np_dtypes[kind][bitwidth] - - # No DLPack yet, so need to construct a new ndarray from the data pointer - # and size in the buffer plus the dtype on the column - ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) - data_pointer = ctypes.cast( - _buffer.ptr + offset * (bitwidth // 8), ctypes.POINTER(ctypes_type) - ) - - # NOTE: `x` does not own its memory, so the caller of this function must - # either make a copy or hold on to a reference of the column or - # buffer! (not done yet, this is pretty awful ...) - x = np.ctypeslib.as_array(data_pointer, shape=(_buffer.bufsize // (bitwidth // 8),)) - return x - - -def convert_categorical_column(col: ProtocolColumn) -> pandas.Series: - """ - Convert a categorical column to a Series instance. 
- """ - ordered, is_dict, mapping = col.describe_categorical.values() - if not is_dict: - raise NotImplementedError("Non-dictionary categoricals not supported yet") - - categories = np.asarray(list(mapping.values())) - codes_buffer, codes_dtype = col.get_buffers()["data"] - codes = buffer_to_ndarray(codes_buffer, codes_dtype, col.offset) - # Doing module in order to not get IndexError for negative sentinel values in the `codes` - values = categories[codes % len(categories)] - - cat = pandas.Categorical(values, categories=categories, ordered=ordered) - series = pandas.Series(cat) - null_kind = col.describe_null[0] - if null_kind == 2: # sentinel value - sentinel = col.describe_null[1] - series[codes == sentinel] = np.nan - elif null_kind == 3: - null_values = ~bitmask_to_bool_array( - col.get_buffers()["validity"][0], col.offset, col.size - ) - series[null_values] = np.nan - elif null_kind == 0: - pass - else: - raise NotImplementedError( - "Only categorical columns with sentinel value supported at the moment" - ) - - return series, codes_buffer - - -def bitmask_to_bool_array(buffer, offset, mask_length): - ctypes_type = np.ctypeslib.as_ctypes_type(np.uint8) - data_pointer = ctypes.cast((buffer.ptr + offset // 8), ctypes.POINTER(ctypes_type)) - # breakpoint() - first_byte_offset = offset % 8 - x = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) - - null_mask = np.zeros(mask_length, dtype=bool) - # Proccessing the first byte separately as it has its own offset - val = x[0] - mask_idx = 0 - for j in range(min(8 - first_byte_offset, mask_length)): - if val & (1 << (j + first_byte_offset)): - null_mask[mask_idx] = True - mask_idx += 1 - - for i in range(1, mask_length // 8): - val = x[i] - for j in range(8): - if val & (1 << j): - null_mask[mask_idx] = True - mask_idx += 1 - - if len(x) > 1: - # Processing reminder of last byte - val = x[-1] - for j in range(len(null_mask) - mask_idx): - if val & (1 << j): - null_mask[mask_idx] = True - mask_idx += 1 - - return null_mask - - -def convert_string_column(col: ProtocolColumn) -> np.ndarray: - """ - Convert a string column to a NumPy array. 
- """ - # Retrieve the data buffers - # breakpoint() - buffers = col.get_buffers() - - # Retrieve the data buffer containing the UTF-8 code units - dbuffer, bdtype = buffers["data"] - - # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string - obuffer, odtype = buffers["offsets"] - - # Retrieve the mask buffer indicating the presence of missing values - mbuffer, mdtype = buffers["validity"] or (None, None) - # Retrieve the missing value encoding - null_kind, null_value = col.describe_null - - # Convert the buffers to NumPy arrays - dt = ( - DTypeKind.UINT, - 8, - None, - None, - ) # note: in order to go from STRING to an equivalent ndarray, we claim that the buffer is uint8 (i.e., a byte array) - dbuf = buffer_to_ndarray(dbuffer, dt, 0) - # breakpoint() - obuf = buffer_to_ndarray(obuffer, odtype, col.offset) - # breakpoint() - if null_kind == 4: - mbuf = buffer_to_ndarray(mbuffer, mdtype, col.offset, allow_none_buffer=True) - elif null_kind == 3: - mbuf = ~bitmask_to_bool_array(mbuffer, col.offset, col.size) - - # Assemble the strings from the code units - str_list = [] - for i in range(obuf.size - 1): - # Check for missing values - if null_kind == 3 and mbuf[i]: # bit mask - str_list.append(np.nan) - continue - elif null_kind == 4 and mbuf[i] == null_value: # byte mask - str_list.append(np.nan) - continue - - # Extract a range of code units - units = dbuf[obuf[i] : obuf[i + 1]] - - # Convert the list of code units to bytes - b = bytes(units) - - # Create the string - s = b.decode(encoding="utf-8") - - # Add to our list of strings - str_list.append(s) - # breakpoint() - # Convert the string list to a NumPy array - return np.asarray(str_list, dtype="object"), buffers diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py index d8dc1061cf0..063c3b21db7 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py @@ -16,7 +16,7 @@ import pandas import numpy as np -from modin.core.dataframe.base.exchange.dataframe_protocol.__utils import ( +from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import ( from_dataframe, ) from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import ( @@ -131,7 +131,9 @@ def get_all_types(has_nulls=False, exclude_dtypes=None): # string string_data["string"] = np.array( - ["English: test string", " ", "Chinese: 测试字符串", "Russian: тестовая строка"] * 10 + # Test multi-byte characters as well to ensure that the chunking works correctly for them + ["English: test string", " ", "Chinese: 测试字符串", "Russian: тестовая строка"] + * 10 ) if has_nulls: string_data["string_null"] = np.array( From 7fa75ae019f776889a410bb75742d40dcbd0e445 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Tue, 8 Mar 2022 12:16:59 +0300 Subject: [PATCH 08/33] Add import tests Signed-off-by: Dmitry Chigarev --- .../core/storage_formats/omnisci/query_compiler.py | 7 +++++-- .../dataframe_protocol/omnisci/test_protocol.py | 12 +++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/modin/experimental/core/storage_formats/omnisci/query_compiler.py b/modin/experimental/core/storage_formats/omnisci/query_compiler.py index 90678601d9b..f5dbe934565 100644 --- a/modin/experimental/core/storage_formats/omnisci/query_compiler.py +++ b/modin/experimental/core/storage_formats/omnisci/query_compiler.py @@ -208,10 +208,13 @@ def 
to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True) -> di @classmethod def from_dataframe(cls, df, data_cls): - raise NotImplementedError( - "The selected execution does not implement the DataFrame exchange protocol yet." + from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import ( + from_dataframe, ) + pd_df = from_dataframe(df) + return data_cls.from_pandas(pd_df) + # END Dataframe exchange protocol default_to_pandas = PandasQueryCompiler.default_to_pandas diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index dd604fdfa01..73321b2cd2e 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -19,7 +19,7 @@ import pandas from modin.pandas.test.utils import df_equals -from modin.pandas.utils import from_arrow +from modin.pandas.utils import from_arrow, from_dataframe as md_from_dataframe from .utils import get_all_types, split_df_into_chunks, export_frame @@ -153,3 +153,13 @@ def test_export_when_delayed_computations(): exported_df = export_frame(md_res) df_equals(exported_df, pd_res) + + +@pytest.mark.parametrize("data_has_nulls", [True, False]) +def test_simple_import(data_has_nulls): + data = get_all_types(data_has_nulls) + + md_df_source = pd.DataFrame(data) + md_df_consumer = md_from_dataframe(md_df_source._query_compiler._modin_frame) + + df_equals(md_df_source, md_df_consumer) From 17b96dcc76222ec0a491930c86dcc6767690b0c1 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Tue, 8 Mar 2022 12:52:14 +0300 Subject: [PATCH 09/33] Add zero-copy testing Signed-off-by: Dmitry Chigarev --- .../dataframe_protocol/from_dataframe.py | 458 ++++++++++++++++++ .../exchange/dataframe_protocol/dataframe.py | 2 +- .../storage_formats/omnisci/query_compiler.py | 4 +- .../omnisci/test_protocol.py | 41 ++ .../dataframe_protocol/omnisci/utils.py | 10 +- 5 files changed, 512 insertions(+), 3 deletions(-) create mode 100644 modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py new file mode 100644 index 00000000000..6700c7bd526 --- /dev/null +++ b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py @@ -0,0 +1,458 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. 
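As a usage-level sketch of the new import path (mirroring ``test_simple_import`` above; note that reaching the protocol object through ``_query_compiler._modin_frame`` is an internal access pattern used by the tests, not a public API):

import modin.pandas as pd
from modin.pandas.utils import from_dataframe

md_df = pd.DataFrame({"a": [1, 2, 3]})
# Round-trip the frame through the exchange protocol
roundtripped = from_dataframe(md_df._query_compiler._modin_frame)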
+ +"""Module houses functions building a ``pandas.DataFrame`` from DataFrame exchange protocol object.""" + +import pandas +import ctypes +import numpy as np + +from typing import Optional, Tuple, Any, Union +from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( + DTypeKind, + ColumnNullType, +) +from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( + ProtocolDataframe, + ProtocolColumn, + ProtocolBuffer, +) + + +def from_dataframe( + df: ProtocolDataframe, allow_copy: bool = True, nchunks: Optional[int] = None +): + """ + Build ``pandas.DataFrame`` from an object supporting DataFrame exchange protocol (__dataframe__). + + Parameters + ---------- + df : ProtocolDataframe + Object supporting the exchange protocol (__dataframe__). + allow_copy : bool, default: True + Whether to allow for `df` providing a copy of underlying data. + nchunks : int, optional + Number of chunks to split `df`. + + Returns + ------- + pandas.DataFrame + """ + if not hasattr(df, "__dataframe__"): + raise ValueError("`df` does not support __dataframe__") + + df = df.__dataframe__()["dataframe"] + + def get_pandas_df(df): + # We need a dict of columns here, with each column being a numpy array (at + # least for now, deal with non-numpy dtypes later). + columns = dict() + buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DTypeKind.INT, + DTypeKind.UINT, + DTypeKind.FLOAT, + DTypeKind.BOOL, + ): + columns[name], buf = convert_primitive_column_to_ndarray(col) + elif dtype == DTypeKind.CATEGORICAL: + columns[name], buf = convert_categorical_column(col) + elif dtype == DTypeKind.STRING: + columns[name], buf = convert_string_column(col) + elif dtype == DTypeKind.DATETIME: + columns[name], buf = convert_datetime_col(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + buffers.append(buf) + + pandas_df = pandas.DataFrame(columns) + pandas_df._buffers = buffers + return pandas_df + + pandas_dfs = [] + for chunk in df.get_chunks(nchunks): + pandas_df = get_pandas_df(chunk) + pandas_dfs.append(pandas_df) + + pandas_df = pandas.concat(pandas_dfs, axis=0, ignore_index=True) + + if "index" in df.metadata: + pandas_df.index = df.metadata["index"] + + return pandas_df + + +def convert_primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: + """ + Convert Column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. + + Parameters + ---------- + col : ProtocolColumn + + Returns + ------- + tuple + Tuple of numpy.ndarray holding the data and the memory owner object that keeps the memory alive. + """ + # breakpoint() + buffers = col.get_buffers() + + data_buff, data_dtype = buffers["data"] + data = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def convert_categorical_column(col: ProtocolColumn) -> Tuple[pandas.Series, Any]: + """ + Convert Column holding categorical data to a pandas Series. + + Parameters + ---------- + col : ProtocolColumn + + Returns + ------- + tuple + Tuple of pandas.Series holding the data and the memory owner object that keeps the memory alive. 
+ """ + ordered, is_dict, mapping = col.describe_categorical.values() + + if not is_dict: + raise NotImplementedError("Non-dictionary categoricals not supported yet") + + categories = np.array(list(mapping.values())) + buffers = col.get_buffers() + + codes_buff, codes_dtype = buffers["data"] + codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size) + + # Doing module in order to not get IndexError for out-of-bounds sentinel values in `codes` + values = categories[codes % len(categories)] + + cat = pandas.Categorical(values, categories=categories, ordered=ordered) + data = pandas.Series(cat) + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def convert_string_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: + """ + Convert Column holding string data to a NumPy array. + + Parameters + ---------- + col : ProtocolColumn + + Returns + ------- + tuple + Tuple of numpy.ndarray holding the data and the memory owner object that keeps the memory alive. + """ + if col.describe_null[0] not in ( + ColumnNullType.NON_NULLABLE, + ColumnNullType.USE_BITMASK, + ColumnNullType.USE_BYTEMASK, + ): + raise NotImplementedError( + f"{col.describe_null[0]} null kind is not yet supported for string columns." + ) + + buffers = col.get_buffers() + + # Retrieve the data buffer containing the UTF-8 code units + data_buff, _ = buffers["data"] + # Convert the buffers to NumPy arrays, in order to go from STRING to an equivalent ndarray, + # we claim that the buffer is uint8 (i.e., a byte array) + data_dtype = ( + DTypeKind.UINT, + 8, + None, + None, + ) + # Specify zero offset as we don't want to chunk the string data + data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size) + + # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string + offset_buff, offset_dtype = buffers["offsets"] + offsets = buffer_to_ndarray(offset_buff, offset_dtype, col.offset, col.size + 1) + + null_kind, sentinel_val = col.describe_null + null_pos = None + + if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): + valid_buff, valid_dtype = buffers["validity"] + null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) + if sentinel_val == 0: + null_pos = ~null_pos + + # Assemble the strings from the code units + str_list = [] + for i in range(offsets.size - 1): + # Check for missing values + if null_pos is not None and null_pos[i]: + str_list.append(np.nan) + continue + + # Extract a range of code units + units = data[offsets[i] : offsets[i + 1]] + + # Convert the list of code units to bytes + str_bytes = bytes(units) + + # Create the string + string = str_bytes.decode(encoding="utf-8") + + # Add to our list of strings + str_list.append(string) + + # Convert the string list to a NumPy array + return np.asarray(str_list, dtype="object"), buffers + + +def convert_datetime_col(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: + """ + Convert Column holding DateTime data to a NumPy array. + + Parameters + ---------- + col : ProtocolColumn + + Returns + ------- + tuple + Tuple of numpy.ndarray holding the data and the memory owner object that keeps the memory alive. 
+ """ + buffers = col.get_buffers() + + _, _, format_str, _ = col.dtype + dbuf, dtype = buffers["data"] + # Consider dtype being `uint` to get number of units passed since the 01.01.1970 + data = buffer_to_ndarray( + dbuf, (DTypeKind.UINT, dtype[1], "u", "="), col.offset, col.size + ) + + if format_str.startswith("ts"): + # timestamp 'ts{unit}:tz' + meta = format_str[2:].split(":") + if len(meta) == 1: + unit = meta[0] + tz = "" + else: + unit, tz = meta + if tz != "": + raise NotImplementedError("Timezones are not supported yet") + if unit != "s": + # the format string describes only a first letter of the unit, add one extra + # letter to make the unit in numpy-style: 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns' + unit += "s" + data = data.astype(f"datetime64[{unit}]") + elif format_str.startswith("td"): + # date 'td{Days/Ms}' + unit = format_str[2:] + if unit == "D": + # numpy doesn't support DAY unit, so converting days to seconds + # (converting to uint64 to avoid overflow) + data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") + elif unit == "m": + data = data.astype("datetime64[ms]") + else: + raise NotImplementedError(f"Date unit is not supported: {unit}") + else: + raise NotImplementedError(f"DateTime kind is not supported: {format_str}") + + data = set_nulls(data, col, buffers["validity"]) + return data, buffers + + +def buffer_to_ndarray( + buffer: ProtocolBuffer, + dtype: Tuple[DTypeKind, int, str, str], + offset: int = 0, + length: Optional[int] = None, +) -> np.ndarray: + """ + Build a NumPy array from the passed buffer. + + Parameters + ---------- + buffer : ProtocolBuffer + Buffer to build a NumPy array from. + dtype : tuple + Data type of the buffer conforming protocol dtypes format. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + length : int, optional + If the buffer is a bit-mask, specifies a number of bits to read + from the buffer. Has no effect otherwise. + + Returns + ------- + np.ndarray + + Notes + ----- + The returned array doesn't own the memory. A user of the function must keep the memory + owner object alive as long as the returned NumPy array is being used. + """ + kind, bit_width, _, _ = dtype + + if kind not in (DTypeKind.INT, DTypeKind.UINT, DTypeKind.FLOAT, DTypeKind.BOOL): + raise RuntimeError("Not a boolean, integer or floating-point dtype") + + np_kinds = { + DTypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, + DTypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, + DTypeKind.FLOAT: {32: np.float32, 64: np.float64}, + # Consider bitmask to be a uint8 dtype to parse the bits later + DTypeKind.BOOL: {1: np.uint8, 8: bool}, + } + + column_dtype = np_kinds[kind].get(bit_width, None) + if column_dtype is None: + raise NotImplementedError(f"Convertion for {dtype} is not yet supported.") + + # No DLPack yet, so need to construct a new ndarray from the data pointer + # and size in the buffer plus the dtype on the column + ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) + data_pointer = ctypes.cast( + buffer.ptr + (offset) * (bit_width // 8), ctypes.POINTER(ctypes_type) + ) + + if bit_width == 1: + assert length is not None, "`length` must be specified for a bit-mask buffer." 
+        arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,))
+        return bitmask_to_bool_array(arr, length, first_byte_offset=offset % 8)
+    else:
+        return np.ctypeslib.as_array(
+            data_pointer, shape=(buffer.bufsize // (bit_width // 8),)
+        )
+
+
+def bitmask_to_bool_array(
+    bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0
+) -> np.ndarray:
+    """
+    Convert bit-mask to a boolean NumPy array.
+
+    Parameters
+    ----------
+    bitmask : np.ndarray[uint8]
+        NumPy array of uint8 dtype representing the bitmask.
+    mask_length : int
+        Number of elements in the mask to interpret.
+    first_byte_offset : int, default: 0
+        Number of elements to offset from the start of the first byte.
+
+    Returns
+    -------
+    np.ndarray[bool]
+    """
+    if first_byte_offset > 8:
+        raise ValueError(
+            f"First byte offset can't be more than 8, met: {first_byte_offset}"
+        )
+
+    bool_mask = np.zeros(mask_length, dtype=bool)
+
+    # Processing the first byte separately as it has its own offset
+    val = bitmask[0]
+    mask_idx = 0
+    for j in range(min(8 - first_byte_offset, mask_length)):
+        if val & (1 << (j + first_byte_offset)):
+            bool_mask[mask_idx] = True
+        mask_idx += 1
+
+    for i in range(1, mask_length // 8):
+        val = bitmask[i]
+        for j in range(8):
+            if val & (1 << j):
+                bool_mask[mask_idx] = True
+            mask_idx += 1
+
+    if len(bitmask) > 1:
+        # Processing the remainder of the last byte
+        val = bitmask[-1]
+        for j in range(len(bool_mask) - mask_idx):
+            if val & (1 << j):
+                bool_mask[mask_idx] = True
+            mask_idx += 1
+
+    return bool_mask
+
+
+def set_nulls(
+    data: Union[np.ndarray, pandas.Series],
+    col: ProtocolColumn,
+    validity: Tuple[ProtocolBuffer, Tuple[DTypeKind, int, str, str]],
+    allow_modify_inplace: bool = True,
+):
+    """
+    Set null values for the data according to the column null kind.
+
+    Parameters
+    ----------
+    data : numpy.ndarray or pandas.Series
+        Data to set nulls in.
+    col : ProtocolColumn
+        Column object that describes the `data`.
+    validity : tuple(ProtocolBuffer, dtype) or None
+        The return value of ``col.buffers()``. We do not access the ``col.buffers()``
+        here to not take the ownership of the memory of buffer objects.
+    allow_modify_inplace : bool, default: True
+        Whether to modify the `data` inplace when zero-copy is possible (True) or always
+        modify a copy of the `data` (False).
+
+    Returns
+    -------
+    numpy.ndarray of pandas.Series
+        Data with the nulls being set.
+    """
+    null_kind, sentinel_val = col.describe_null
+    null_pos = None
+
+    if null_kind == ColumnNullType.USE_SENTINEL:
+        null_pos = data == sentinel_val
+    elif null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK):
+        valid_buff, valid_dtype = validity
+        null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size)
+        if sentinel_val == 0:
+            null_pos = ~null_pos
+    elif null_kind in (ColumnNullType.NON_NULLABLE, ColumnNullType.USE_NAN):
+        pass
+    else:
+        raise NotImplementedError(f"Null kind {null_kind} is not yet supported.")
+
+    if null_pos is not None and np.any(null_pos):
+        if not allow_modify_inplace:
+            data = data.copy()
+        try:
+            data[null_pos] = None
+        except TypeError:
+            # TypeError happens if the `data` dtype appears to be non-nullable in numpy notation
+            # (bool, int, uint); if that happens, cast the `data` to a nullable float dtype.
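+            # For example (hypothetical values): an int64 array [1, 2, 3] can't hold
+            # None, so it is cast to float64 first and the nulls then appear as NaN,
+            # e.g. [1.0, nan, 3.0].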
+            data = data.astype(float)
+            data[null_pos] = None
+
+    return data
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
index f1a396fe28a..920510a4a5a 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
@@ -160,7 +160,7 @@ def _is_zero_copy_possible(self) -> bool:
             self.__is_zero_copy_possible = False
         else:
             # Check whether the plan for PyArrow can be executed zero-copy
-            self.__is_zero_copy_possible = self._is_zero_copy_op(self._df.op)
+            self.__is_zero_copy_possible = self._is_zero_copy_op(self._df._op)
         return self.__is_zero_copy_possible
 
     @classmethod
diff --git a/modin/experimental/core/storage_formats/omnisci/query_compiler.py b/modin/experimental/core/storage_formats/omnisci/query_compiler.py
index f5dbe934565..44032fa0c03 100644
--- a/modin/experimental/core/storage_formats/omnisci/query_compiler.py
+++ b/modin/experimental/core/storage_formats/omnisci/query_compiler.py
@@ -204,7 +204,9 @@ def from_arrow(cls, at, data_cls):
     # Dataframe exchange protocol
 
     def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True) -> dict:
-        return self._modin_frame.__dataframe__()["dataframe"]
+        return self._modin_frame.__dataframe__(
+            nan_as_null=nan_as_null, allow_copy=allow_copy
+        )["dataframe"]
 
     @classmethod
     def from_dataframe(cls, df, data_cls):
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index 73321b2cd2e..8590f20e9f1 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -21,6 +21,10 @@
 from modin.pandas.test.utils import df_equals
 from modin.pandas.utils import from_arrow, from_dataframe as md_from_dataframe
 
+from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import (
+    convert_primitive_column_to_ndarray,
+)
+
 from .utils import get_all_types, split_df_into_chunks, export_frame
 
 
@@ -139,6 +143,13 @@
 
 
 def test_export_when_delayed_computations():
+    """
+    Test that export works properly when OmnisciOnNative has delayed computations.
+
+    If there are delayed functions and export is required, it has to trigger the execution
+    first, prior to materializing the protocol's buffers, so that the buffers contain the
+    actual result of the computations.
+    """
     # OmniSci can't import 'uint64' as well as booleans, so exclude them
     # issue for bool: https://github.com/modin-project/modin/issues/4299
     data = get_all_types(has_nulls=True, exclude_dtypes=["uint64", "bool"])
@@ -157,9 +168,39 @@
 
 @pytest.mark.parametrize("data_has_nulls", [True, False])
 def test_simple_import(data_has_nulls):
+    """Test that ``modin.pandas.utils.from_dataframe`` works properly."""
     data = get_all_types(data_has_nulls)
 
     md_df_source = pd.DataFrame(data)
     md_df_consumer = md_from_dataframe(md_df_source._query_compiler._modin_frame)
 
     df_equals(md_df_source, md_df_consumer)
+
+
+@pytest.mark.parametrize("data_has_nulls", [True, False])
+def test_zero_copy_export_for_primitives(data_has_nulls):
+    """Test that basic data types can be zero-copy exported from OmnisciOnNative dataframe."""
+    data = get_all_types(
+        has_nulls=data_has_nulls, include_dtypes=["int", "uint", "float"]
+    )
+    at = pa.Table.from_pydict(data)
+
+    md_df = from_arrow(at)
+    protocol_df = md_df.__dataframe__(allow_copy=False)
+
+    for i, col in enumerate(protocol_df.get_columns()):
+        col_arr, memory_owner = convert_primitive_column_to_ndarray(col)
+
+        exported_ptr = col_arr.__array_interface__["data"][0]
+        source_ptr = at.column(i).chunks[0].buffers()[-1].address
+        # Verify that the pointers of source and exported objects point to the same data
+        assert source_ptr == exported_ptr
+
+    # Can't export `md_df` zero-copy anymore as it has a delayed 'fillna' operation
+    md_df = md_df.fillna({"float32": 32.0})
+    non_zero_copy_protocol_df = md_df.__dataframe__(allow_copy=False)
+
+    with pytest.raises(RuntimeError):
+        col_arr, memory_owner = convert_primitive_column_to_ndarray(
+            non_zero_copy_protocol_df.get_column_by_name("float32")
+        )
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
index 063c3b21db7..0de457b4c29 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
@@ -79,7 +79,7 @@ def export_frame(md_df, from_omnisci=False, **kwargs):
     return exported_df
 
 
-def get_all_types(has_nulls=False, exclude_dtypes=None):
+def get_all_types(has_nulls=False, exclude_dtypes=None, include_dtypes=None):
     bool_data = {}
     int_data = {}
     uint_data = {}
@@ -162,6 +162,14 @@ def get_all_types(has_nulls=False, exclude_dtypes=None):
         **category_data,
     }
 
+    if include_dtypes is not None:
+        filtered_keys = (
+            key
+            for key in data.keys()
+            if any(key.startswith(dtype) for dtype in include_dtypes)
+        )
+        data = {key: data[key] for key in filtered_keys}
+
     if exclude_dtypes is not None:
         filtered_keys = (
             key
From b551590de074e0fb03ac5640a093a32618090ce2 Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Tue, 8 Mar 2022 13:31:54 +0300
Subject: [PATCH 10/33] Fix chunking of bitmask

Signed-off-by: Dmitry Chigarev
---
 .../exchange/dataframe_protocol/from_dataframe.py     | 12 +++++++-----
 .../dataframe_protocol/omnisci/test_protocol.py       | 12 ++++++++++++
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py
index 6700c7bd526..c386fcd6b70 100644
--- a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py
+++ b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py
@@ -113,7 +113,6 @@ def convert_primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray
     tuple
         Tuple of numpy.ndarray holding the data and the memory owner object that keeps the memory alive.
     """
-    # breakpoint()
     buffers = col.get_buffers()
 
     data_buff, data_dtype = buffers["data"]
@@ -337,7 +336,7 @@ def buffer_to_ndarray(
     # and size in the buffer plus the dtype on the column
     ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype)
     data_pointer = ctypes.cast(
-        buffer.ptr + (offset) * (bit_width // 8), ctypes.POINTER(ctypes_type)
+        buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type)
     )
 
     if bit_width == 1:
@@ -379,13 +378,16 @@ def bitmask_to_bool_array(
     # Processing the first byte separately as it has its own offset
     val = bitmask[0]
     mask_idx = 0
-    for j in range(min(8 - first_byte_offset, mask_length)):
+    bits_in_first_byte = min(8 - first_byte_offset, mask_length)
+    for j in range(bits_in_first_byte):
         if val & (1 << (j + first_byte_offset)):
             bool_mask[mask_idx] = True
         mask_idx += 1
 
-    for i in range(1, mask_length // 8):
-        val = bitmask[i]
+    # `(mask_length - bits_in_first_byte) // 8` describes how many remaining full bytes to process
+    for i in range((mask_length - bits_in_first_byte) // 8):
+        # doing `+ 1` as we already processed the first byte
+        val = bitmask[i + 1]
         for j in range(8):
             if val & (1 << j):
                 bool_mask[mask_idx] = True
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index 8590f20e9f1..d9de52d715e 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -204,3 +204,15 @@ def test_zero_copy_export_for_primitives(data_has_nulls):
         col_arr, memory_owner = convert_primitive_column_to_ndarray(
             non_zero_copy_protocol_df.get_column_by_name("float32")
         )
+
+
+def test_bitmask_chunking():
+    """Test that making a virtual chunk in the middle of a byte of a bitmask doesn't cause problems."""
+    at = pa.Table.from_pydict({"col": [True, False, True, True, False] * 5})
+    assert at["col"].type.bit_width == 1
+
+    md_df = from_arrow(at)
+    # Column length is 25, nchunks is 2, meaning that the split will occur in the middle
+    # of the second byte
+    exported_df = export_frame(md_df, nchunks=2)
+    df_equals(md_df, exported_df)
From 8711a511f47e735669dd043622d57c7b84b3d17f Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Tue, 8 Mar 2022 14:03:08 +0300
Subject: [PATCH 11/33] Minor formatting fixes

Signed-off-by: Dmitry Chigarev
---
 .../exchange/dataframe_protocol/column.py         |  7 +++----
 .../exchange/dataframe_protocol/dataframe.py      | 17 ++++++++++-------
 .../dataframe_protocol/omnisci/test_protocol.py   |  8 ++++----
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
index d350378a04c..70b6c76ee0b 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
@@ -51,7 +51,7 @@ class OmnisciProtocolColumn(ProtocolColumn):
     Notes
     -----
     The object could be modified inplace due to casting PyArrow buffers to a new dtype:
-    ``_propagate_dtype``, ``_cast_at``), the methods replace the wrapped
+    ``_propagate_dtype``, ``_cast_at`` - the methods replace the wrapped
     ``OmnisciProtocolDataframe`` object with the new one holding the
casted PyArrow table. """ @@ -170,7 +170,7 @@ def describe_categorical(self) -> Dict[str, Any]: # Although we can retrieve codes from pandas dtype, they're unsynced with # the actual PyArrow data most of the time. So getting the mapping directly - # from materialized PyArrow table. + # from the materialized PyArrow table. col = self._pyarrow_table.column(0) if len(col.chunks) > 1: if not self._col._allow_copy: @@ -355,7 +355,7 @@ def _get_validity_buffer( ------- tuple or None Tuple of OmnisciProtocolBuffer and protocol dtype representation of the buffer's underlying data. - None if column is non-nullable (``self.describe_null == `ColumnNullType.NON_NULLABLE``). + None if column is non-nullable (``self.describe_null == ColumnNullType.NON_NULLABLE``). """ validity_buffer = arr.buffers()[0] if validity_buffer is None: @@ -365,7 +365,6 @@ def _get_validity_buffer( data_size = self._get_buffer_size(bit_width=1) return ( OmnisciProtocolBuffer(validity_buffer, data_size), - # self._dtype_from_primitive_pandas(np.dtype("uint8")), (DTypeKind.BOOL, 1, "b", "="), ) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py index 920510a4a5a..300c1691acc 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py @@ -66,7 +66,7 @@ def __init__( @property def metadata(self) -> Dict[str, Any]: - # TODO: as the frame's index is stored as a separate column inside pyarrow table + # TODO: as the frame's index is stored as a separate column inside PyArrow table # we may want to return the column's name here instead of materialized index. # This will require the internal index column to be visible in the protocol's column # accessor methods. @@ -77,8 +77,7 @@ def num_columns(self) -> int: return len(self._df.columns) def num_rows(self) -> int: - if not self._allow_copy and not self._is_zero_copy_possible: - raise RuntimeError("Copy required with 'allow_copy=False'") + self._maybe_raise_if_materialize() return len(self._df.index) def num_chunks(self) -> int: @@ -119,6 +118,7 @@ def _chunk_slices(self) -> np.ndarray: """ if self.__chunk_slices is None: at = self._pyarrow_table + # What we need to do is to union offsets of all the columns col_slices = set({0}) for col in at.columns: col_slices = col_slices.union( @@ -151,11 +151,11 @@ def _is_zero_copy_possible(self) -> bool: """ if self.__is_zero_copy_possible is None: if self._df._has_arrow_table(): - # If PyArrow is already materialized table then we can - # retrieve the data zero-copy + # If PyArrow table is already materialized then we can + # retrieve data zero-copy self.__is_zero_copy_possible = True elif not self._df._can_execute_arrow(): - # When not able to execute the plan via Arrow means + # When not able to execute the plan via PyArrow means # that we have to involve OmniSci, so no zero-copy. 
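+                # (e.g. a delayed `fillna` is known to force OmniSci execution,
+                # see `test_zero_copy_export_for_primitives`)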
                self.__is_zero_copy_possible = False
         else:
@@ -236,6 +237,7 @@ def get_columns(self) -> Iterable[OmnisciProtocolColumn]:
             yield OmnisciProtocolColumn(
                 OmnisciProtocolDataframe(
                     self._df.mask(col_labels=[name]),
+                    nan_as_null=self._nan_as_null,
                     allow_copy=self._allow_copy,
                 ),
             )
@@ -246,6 +247,7 @@
 
         return OmnisciProtocolDataframe(
             self._df.mask(col_positions=list(indices)),
+            nan_as_null=self._nan_as_null,
             allow_copy=self._allow_copy,
         )
@@ -257,6 +259,7 @@
 
         return OmnisciProtocolDataframe(
             self._df.mask(col_labels=list(names)),
+            nan_as_null=self._nan_as_null,
             allow_copy=self._allow_copy,
         )
@@ -318,6 +321,6 @@ def _yield_chunks(self, chunk_slices) -> "OmnisciProtocolDataframe":
                 df=self._df.mask(
                     row_positions=range(chunk_slices[i], chunk_slices[i + 1])
                 ),
-                allow_copy=self._allow_copy,
                 nan_as_null=self._nan_as_null,
+                allow_copy=self._allow_copy,
             )
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index d9de52d715e..2d0ae335bfb 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -57,8 +57,8 @@ def test_simple_export(data_has_nulls, from_omnisci):
 @pytest.mark.parametrize("nchunks", [2, 4, 7])
 @pytest.mark.parametrize("data_has_nulls", [True, False])
 def test_export_aligned_at_chunks(nchunks, data_has_nulls):
-    """Test export from DataFrame exchange protocol when internal arrow table is equally chunked."""
-    # Modin DataFrame constructor can't process pyarrow's category, so exclude it
+    """Test export from DataFrame exchange protocol when internal PyArrow table is equally chunked."""
+    # Modin DataFrame constructor can't process PyArrow's category, so exclude it
     data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"])
     pd_df = pandas.DataFrame(data)
     pd_chunks = split_df_into_chunks(pd_df, nchunks)
@@ -86,13 +86,13 @@
 @pytest.mark.parametrize("data_has_nulls", [True, False])
 def test_export_unaligned_at_chunks(data_has_nulls):
     """
-    Test export from DataFrame exchange protocol when internal arrow table's chunks are unaligned.
+    Test export from DataFrame exchange protocol when internal PyArrow table's chunks are unaligned.
 
     Arrow table allows for its columns to be chunked independently. Unaligned chunking
     means that each column has its individual chunking and so some preprocessing is required
    in order to emulate equally chunked columns in the protocol.
""" - # Modin DataFrame constructor can't process pyarrow's category, so exclude it + # Modin DataFrame constructor can't process PyArrow's category, so exclude it data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) # divide columns in 3 groups: unchunked, 2-chunked, 7-chunked From 80f819adf1950b67a71fc7b705d481a489dc3bb9 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Mon, 14 Mar 2022 19:26:43 +0300 Subject: [PATCH 12/33] Apply suggestions from code review Co-authored-by: Yaroslav Igoshev --- .../exchange/dataframe_protocol/buffer.py | 3 +-- .../exchange/dataframe_protocol/column.py | 2 +- .../exchange/dataframe_protocol/dataframe.py | 9 ++++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py index 2dcaafbf38d..583097872cd 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/buffer.py @@ -14,13 +14,12 @@ """The module houses OmnisciOnNative implementation of the Buffer class of DataFrame exchange protocol.""" import pyarrow as pa - from typing import Tuple, Optional + from modin.core.dataframe.base.exchange.dataframe_protocol.utils import DlpackDeviceType from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( ProtocolBuffer, ) - from modin.utils import _inherit_docstrings diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py index 70b6c76ee0b..f384c40b42f 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -16,9 +16,9 @@ import pyarrow as pa import pandas import numpy as np - from typing import Any, Optional, Tuple, Dict, Iterable from math import ceil + from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( DTypeKind, ColumnNullType, diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py index 300c1691acc..1a714549804 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py @@ -16,14 +16,13 @@ import collections import numpy as np import pyarrow as pa - from typing import Optional, Iterable, Sequence, Dict, Any + from modin.experimental.core.execution.native.implementations.omnisci_on_native.dataframe.dataframe import ( OmnisciOnNativeDataframe, ) from modin.core.dataframe.base.exchange.dataframe_protocol import ProtocolDataframe from modin.utils import _inherit_docstrings - from modin.experimental.core.execution.native.implementations.omnisci_on_native.df_algebra import 
( MaskNode, FrameNode, @@ -36,7 +35,7 @@ @_inherit_docstrings(ProtocolDataframe) class OmnisciProtocolDataframe(ProtocolDataframe): """ - Implement DataFrame exchange protocol class for OmniSciOnNative execution. + Implement the DataFrame exchange protocol class for ``OmnisciOnNative`` execution. Parameters ---------- @@ -71,7 +70,7 @@ def metadata(self) -> Dict[str, Any]: # This will require the internal index column to be visible in the protocol's column # accessor methods. self._maybe_raise_if_materialize() - return {"index": self._df.index} + return {"modin.index": self._df.index} def num_columns(self) -> int: return len(self._df.columns) @@ -164,7 +163,7 @@ def _is_zero_copy_possible(self) -> bool: return self.__is_zero_copy_possible @classmethod - def _is_zero_copy_op(cls, op) -> bool: + def _is_zero_copy_arrow_op(cls, op) -> bool: """ Check whether the passed node of the delayed computation tree could be executed zero-copy via PyArrow execution. From 037c03303a1f7a7501191d7e4eca9651ac11a8e4 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 14 Mar 2022 19:48:34 +0300 Subject: [PATCH 13/33] Apply review suggestions Signed-off-by: Dmitry Chigarev --- .../exchange/dataframe_protocol/from_dataframe.py | 4 +++- .../omnisci_on_native/dataframe/dataframe.py | 15 ++++++--------- .../exchange/dataframe_protocol/dataframe.py | 15 ++++++++++----- .../storage_formats/omnisci/query_compiler.py | 4 ++-- .../exchange/dataframe_protocol/omnisci/utils.py | 6 ++---- 5 files changed, 23 insertions(+), 21 deletions(-) diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py index c386fcd6b70..fcb236a4cfe 100644 --- a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py @@ -51,7 +51,9 @@ def from_dataframe( if not hasattr(df, "__dataframe__"): raise ValueError("`df` does not support __dataframe__") - df = df.__dataframe__()["dataframe"] + df = df.__dataframe__() + if isinstance(df, dict): + df = df["dataframe"] def get_pandas_df(df): # We need a dict of columns here, with each column being a numpy array (at diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py index 0481a79bed8..6c8d9f146b8 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py @@ -2037,7 +2037,7 @@ def _get_columns(self): """ return super(OmnisciOnNativeDataframe, self)._get_columns() - def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> dict: + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): """ Get a DataFrame exchange protocol object representing data of the Modin DataFrame. @@ -2057,8 +2057,8 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> d Returns ------- - dict - A dictionary object following the dataframe protocol specification. + ProtocolDataframe + A dataframe object following the dataframe protocol specification. 
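+
+        Notes
+        -----
+        A minimal consumer-side sketch (hypothetical consumer code): given
+        ``protocol_df = modin_frame.__dataframe__()``, a consumer can call
+        ``protocol_df.num_columns()`` and iterate over ``protocol_df.get_columns()``.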
""" if self._has_unsupported_data: pd_df = self.to_pandas() @@ -2071,12 +2071,9 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> d from ..exchange.dataframe_protocol import OmnisciProtocolDataframe - return { - "dataframe": OmnisciProtocolDataframe( - self, nan_as_null=nan_as_null, allow_copy=allow_copy - ), - "version": 0, - } + return OmnisciProtocolDataframe( + self, nan_as_null=nan_as_null, allow_copy=allow_copy + ) columns = property(_get_columns) index = property(_get_index) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py index 1a714549804..c5735bf917d 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py @@ -23,6 +23,7 @@ ) from modin.core.dataframe.base.exchange.dataframe_protocol import ProtocolDataframe from modin.utils import _inherit_docstrings +from modin.error_message import ErrorMessage from modin.experimental.core.execution.native.implementations.omnisci_on_native.df_algebra import ( MaskNode, FrameNode, @@ -159,7 +160,7 @@ def _is_zero_copy_possible(self) -> bool: self.__is_zero_copy_possible = False else: # Check whether the plan for PyArrow can be executed zero-copy - self.__is_zero_copy_possible = self._is_zero_copy_op(self._df._op) + self.__is_zero_copy_possible = self._is_zero_copy_arrow_op(self._df._op) return self.__is_zero_copy_possible @classmethod @@ -188,14 +189,14 @@ def _is_zero_copy_arrow_op(cls, op) -> bool: is_zero_copy_op = True return is_zero_copy_op and all( # Walk the computation tree - cls._is_zero_copy_op(_op) + cls._is_zero_copy_arrow_op(_op) for _op in getattr(op, "inputs", []) ) @property def _pyarrow_table(self) -> pa.Table: """ - Get PyArrow table representing the column. + Get PyArrow table representing the DataFrame. 
Returns ------- @@ -279,6 +280,8 @@ def get_chunks( ) extra_chunks = n_chunks - self.num_chunks() + # `._chunk_slices` is a cached property, we don't want to modify the property's + # array inplace, so doing a copy here subdivided_slices = self._chunk_slices.copy() # The subdividing behavior is a bit different from "subdividing each chunk", @@ -292,8 +295,10 @@ def get_chunks( subdivided_slices[biggest_chunk_idx + 1] - subdivided_slices[biggest_chunk_idx] ) // 2 - if new_chunk_offset == 0: - raise RuntimeError("No more chunks to subdivide.") + ErrorMessage.catch_bugs_and_request_email( + failure_condition=new_chunk_offset == 0, + extra_log="No more chunks to subdivide", + ) subdivided_slices = np.insert( subdivided_slices, biggest_chunk_idx + 1, diff --git a/modin/experimental/core/storage_formats/omnisci/query_compiler.py b/modin/experimental/core/storage_formats/omnisci/query_compiler.py index 44032fa0c03..9564fc7ecce 100644 --- a/modin/experimental/core/storage_formats/omnisci/query_compiler.py +++ b/modin/experimental/core/storage_formats/omnisci/query_compiler.py @@ -203,10 +203,10 @@ def from_arrow(cls, at, data_cls): # Dataframe exchange protocol - def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True) -> dict: + def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True): return self._modin_frame.__dataframe__( nan_as_null=nan_as_null, allow_copy=allow_copy - )["dataframe"] + ) @classmethod def from_dataframe(cls, df, data_cls): diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py index 0de457b4c29..c1060caa775 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py @@ -68,13 +68,11 @@ def export_frame(md_df, from_omnisci=False, **kwargs): pandas.DataFrame """ if not from_omnisci: - return from_dataframe(md_df._query_compiler._modin_frame, **kwargs) + return from_dataframe(md_df, **kwargs) with ForceOmnisciImport(md_df) as instance: md_df_exported = instance.export_frames()[0] - exported_df = from_dataframe( - md_df_exported._query_compiler._modin_frame, **kwargs - ) + exported_df = from_dataframe(md_df_exported, **kwargs) return exported_df From 333917296f06a1cfe31fe417fcbe422236ba54b3 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 14 Mar 2022 19:54:19 +0300 Subject: [PATCH 14/33] Align imports with current master branch Signed-off-by: Dmitry Chigarev --- .../base/exchange/dataframe_protocol/from_dataframe.py | 5 +++-- .../exchange/dataframe_protocol/dataframe.py | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py index fcb236a4cfe..f6cca28baea 100644 --- a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py @@ -96,8 +96,9 @@ def get_pandas_df(df): pandas_df = pandas.concat(pandas_dfs, axis=0, ignore_index=True) - if "index" in df.metadata: - pandas_df.index = df.metadata["index"] + index_obj = df.metadata.get("modin.index", df.metadata.get("pandas.index", None)) + if index_obj is not None: + pandas_df.index = index_obj return pandas_df diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py 
b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
index f384c40b42f..50919ab4672 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
@@ -50,9 +50,10 @@ class OmnisciProtocolColumn(ProtocolColumn):
 
     Notes
     -----
-    The object could be modified inplace due to casting PyArrow buffers to a new dtype:
-    ``_propagate_dtype``, ``_cast_at`` - the methods replace the wrapped
-    ``OmnisciProtocolDataframe`` object with the new one holding the casted PyArrow table.
+    The object could be modified inplace due to either casting PyArrow buffers to a new dtype
+    or combining physical chunks into a single contiguous buffer:
+    ``_propagate_dtype``, ``_cast_at``, ``_combine_chunks`` - the methods replace the wrapped
+    ``OmnisciProtocolDataframe`` object with the new one holding the modified PyArrow table.
     """
 
     def __init__(self, column: "OmnisciProtocolDataframe") -> None:
@@ -244,10 +245,6 @@ def get_chunks(self, n_chunks: Optional[int] = None) -> Iterable["Column"]:
             yield OmnisciProtocolColumn(chunk)
 
     def get_buffers(self) -> Dict[str, Any]:
-        if self.num_chunks() != 1:
-            # TODO: do chunks combining
-            raise NotImplementedError()
-
         self._materialize_actual_buffers()
         at = self._pyarrow_table
         pyarrow_array = at.column(0).chunks[0]
@@ -264,11 +261,18 @@ def _materialize_actual_buffers(self):
         Materialize PyArrow table's buffers that can be zero-copy returned to a consumer, if they aren't already materialized.
 
         Besides materializing PyArrow table itself (if there were some delayed computations)
-        the function also propagates external dtypes to the PyArrow table. For example,
-        if ``self.dtype`` is a string kind, but internal PyArrow dtype is a dictionary
-        (if the table were just exported from OmniSci), then the dictionary will be casted
-        to string dtype.
+        the function also may do the following if required:
+        1. Propagate external dtypes to the PyArrow table. For example,
+        if ``self.dtype`` is a string kind, but internal PyArrow dtype is a dictionary
+        (if the table were just exported from OmniSci), then the dictionary will be casted
+        to string dtype.
+        2. Combine physical chunks of PyArrow table into a single contiguous buffer.
         """
+        if self.num_chunks() != 1:
+            if not self._col._allow_copy:
+                raise RuntimeError("Copy required with 'allow_copy=False' flag")
+            self._combine_chunks()
+
         external_dtype = self.dtype
         internal_dtype = self._dtype_from_pyarrow(self._arrow_dtype)
@@ -486,3 +490,19 @@ def _cast_at(self, new_schema: pa.Schema):
             self._col._nan_as_null,
             self._col._allow_copy,
         )
+
+    def _combine_chunks(self):
+        """
+        Combine physical chunks of underlying PyArrow table.
+
+        Notes
+        -----
+        This method modifies the column inplace by replacing the wrapped ``OmnisciProtocolDataframe``
+        with the new one holding PyArrow table with the column's data placed in a single contiguous buffer.
+        """
+        contiguous_at = self._pyarrow_table.combine_chunks()
+        self._col = type(self._col)(
+            self._col._df.from_arrow(contiguous_at),
+            self._col._nan_as_null,
+            self._col._allow_copy,
+        )
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index 2d0ae335bfb..22b85a8d811 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -17,12 +17,16 @@
 import modin.pandas as pd
 import pyarrow as pa
 import pandas
+import numpy as np
 
 from modin.pandas.test.utils import df_equals
+from modin.test.test_utils import warns_that_defaulting_to_pandas
 from modin.pandas.utils import from_arrow, from_dataframe as md_from_dataframe
 
 from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import (
     convert_primitive_column_to_ndarray,
+    buffer_to_ndarray,
+    set_nulls,
 )
 
 from .utils import get_all_types, split_df_into_chunks, export_frame
@@ -216,3 +220,47 @@ def test_bitmask_chunking():
     # of the second byte
     exported_df = export_frame(md_df, nchunks=2)
     df_equals(md_df, exported_df)
+
+
+@pytest.mark.parametrize("data_has_nulls", [True, False])
+@pytest.mark.parametrize("nchunks", [2, 9])
+def test_buffer_of_chunked_at(data_has_nulls, nchunks):
+    """Test that getting buffers of a physically chunked column works properly."""
+    data = get_all_types(
+        # For the simplicity of the test include only primitive types, so the test can use
+        # only one function to export a column instead of if-elsing to find a type-appropriate one
+        has_nulls=data_has_nulls,
+        include_dtypes=["bool", "int", "uint", "float"],
+    )
+
+    pd_df = pandas.DataFrame(data)
+    pd_chunks = split_df_into_chunks(pd_df, nchunks)
+
+    chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks])
+    md_df = from_arrow(chunked_at)
+
+    protocol_df = md_df.__dataframe__()
+    for i, col in enumerate(protocol_df.get_columns()):
+        assert col.num_chunks() > 1
+        assert len(col._pyarrow_table.column(0).chunks) > 1
+
+        buffers = col.get_buffers()
+        data_buff, data_dtype = buffers["data"]
+        result = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size)
+        result = set_nulls(result, col, buffers["validity"])
+
+        # Our configuration in pytest.ini requires that we explicitly catch all
+        # instances of defaulting to pandas, this one raises a warning on `.to_numpy()`
+        with warns_that_defaulting_to_pandas():
+            reference = md_df.iloc[:, i].to_numpy()
+
+        np.testing.assert_array_equal(reference, result)
+
+    protocol_df = md_df.__dataframe__(allow_copy=False)
+    for i, col in enumerate(protocol_df.get_columns()):
+        assert col.num_chunks() > 1
+        assert len(col._pyarrow_table.column(0).chunks) > 1
+
+        # Catch the exception raised on the copy attempt caused by chunks combining
+        with pytest.raises(RuntimeError):
+            col.get_buffers()
From 854c91a4638a385b6ab23a6bfa89221e92d0921d Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Tue, 15 Mar 2022 16:34:26 +0300
Subject: [PATCH 16/33] Apply review suggestions

Signed-off-by: Dmitry Chigarev
---
 .../omnisci_on_native/dataframe/dataframe.py      |  2 +-
 .../exchange/dataframe_protocol/__init__.py       |  4 ---
 .../exchange/dataframe_protocol/column.py         | 19 ++++++++---
 .../exchange/dataframe_protocol/utils.py          |  2 +-
 .../omnisci/test_protocol.py                      | 28 +++++++++---------
 .../dataframe_protocol/omnisci/utils.py           | 28 +++++++++++++++++-
 6 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
index 6c8d9f146b8..23f39701303 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
@@ -2069,7 +2069,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
             + "that are unsupported by OmniSci."
         )
 
-        from ..exchange.dataframe_protocol import OmnisciProtocolDataframe
+        from ..exchange.dataframe_protocol.dataframe import OmnisciProtocolDataframe
 
         return OmnisciProtocolDataframe(
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py
index 42f5b7d53b3..cae6413e559 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/__init__.py
@@ -10,7 +10,3 @@
 # the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific language
 # governing permissions and limitations under the License.
-
-from .dataframe import OmnisciProtocolDataframe
-
-__all__ = ["OmnisciProtocolDataframe"]
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
index 50919ab4672..252c11948f4 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
@@ -160,7 +160,8 @@
 
         if dtype != "category":
             raise RuntimeError(
-                f"Column 'dtype' has to be categorical to be able to dectribe categiries, met: {dtype}"
+                "`describe_categorical` only works on a column with "
+                + "categorical dtype!"
) ordered = dtype.ordered @@ -324,7 +325,7 @@ def _get_data_buffer( Returns ------- tuple - Tuple of OmnisciProtocolBuffer and protocol dtype representation of the buffer's underlying data. + Tuple of ``OmnisciProtocolBuffer`` and protocol dtype representation of the buffer's underlying data. """ if self.dtype[0] == DTypeKind.CATEGORICAL: # For dictionary data the buffer has to return categories codes @@ -340,6 +341,9 @@ def _get_data_buffer( ) return ( + # According to the Arrow's memory layout, the validity buffer is always present + # at the last position of `.buffers()`: + # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout OmnisciProtocolBuffer(arr.buffers()[-1], buff_size), arrow_type, ) @@ -358,9 +362,11 @@ def _get_validity_buffer( Returns ------- tuple or None - Tuple of OmnisciProtocolBuffer and protocol dtype representation of the buffer's underlying data. + Tuple of ``OmnisciProtocolBuffer`` and protocol dtype representation of the buffer's underlying data. None if column is non-nullable (``self.describe_null == ColumnNullType.NON_NULLABLE``). """ + # According to the Arrow's memory layout, the validity buffer is always present at zero position: + # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout validity_buffer = arr.buffers()[0] if validity_buffer is None: return validity_buffer @@ -386,10 +392,15 @@ def _get_offsets_buffer( Returns ------- tuple or None - Tuple of OmnisciProtocolBuffer and protocol dtype representation of the buffer's underlying data. + Tuple of ``OmnisciProtocolBuffer`` and protocol dtype representation of the buffer's underlying data. None if the column's dtype is fixed-size. """ buffs = arr.buffers() + # According to the Arrow's memory layout, the offsets buffer is always at the second position + # of `.buffers()` if present. Considering the support of only Primitive, Variable-length binary, + # and Dict-encoded types from the layout table, we can assume that there's no offsets buffer + # if there are fewer than 3 buffers available. + # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout if len(buffs) < 3: return None diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py index d6b0bc7ddac..f51a932d9c5 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py @@ -11,7 +11,7 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. 
-"""Utility functions of DataFrame exchange protocol implementation for OmnisciOnNative execution.""" +"""Utility functions for the DataFrame exchange protocol implementation for ``OmnisciOnNative`` execution.""" import pyarrow as pa import numpy as np diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index 22b85a8d811..3f4964c3474 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -14,22 +14,20 @@ """Dataframe exchange protocol tests that are specific for OmniSci implementation.""" import pytest -import modin.pandas as pd import pyarrow as pa import pandas import numpy as np -from modin.pandas.test.utils import df_equals -from modin.test.test_utils import warns_that_defaulting_to_pandas -from modin.pandas.utils import from_arrow, from_dataframe as md_from_dataframe - +import modin.pandas as pd from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import ( convert_primitive_column_to_ndarray, buffer_to_ndarray, set_nulls, ) - -from .utils import get_all_types, split_df_into_chunks, export_frame +from modin.pandas.utils import from_arrow, from_dataframe as md_from_dataframe +from modin.pandas.test.utils import df_equals +from modin.test.test_utils import warns_that_defaulting_to_pandas +from .utils import get_data_of_all_types, split_df_into_chunks, export_frame @pytest.mark.parametrize("data_has_nulls", [True, False]) @@ -42,7 +40,9 @@ def test_simple_export(data_has_nulls, from_omnisci): else: exclude_dtypes = None - data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=exclude_dtypes) + data = get_data_of_all_types( + has_nulls=data_has_nulls, exclude_dtypes=exclude_dtypes + ) md_df = pd.DataFrame(data) exported_df = export_frame(md_df, from_omnisci) @@ -63,7 +63,7 @@ def test_simple_export(data_has_nulls, from_omnisci): def test_export_aligned_at_chunks(nchunks, data_has_nulls): """Test export from DataFrame exchange protocol when internal PyArrow table is equaly chunked.""" # Modin DataFrame constructor can't process PyArrow's category, so exclude it - data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) + data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) pd_chunks = split_df_into_chunks(pd_df, nchunks) @@ -97,7 +97,7 @@ def test_export_unaligned_at_chunks(data_has_nulls): to emulate equaly chunked columns in the protocol. 
""" # Modin DataFrame constructor can't process PyArrow's category, so exclude it - data = get_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) + data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) # divide columns in 3 groups: unchunked, 2-chunked, 7-chunked chunk_groups = [1, 2, 7] @@ -156,7 +156,7 @@ def test_export_when_delayed_computations(): """ # OmniSci can't import 'uint64' as well as booleans, so exclude them # issue for bool: https://github.com/modin-project/modin/issues/4299 - data = get_all_types(has_nulls=True, exclude_dtypes=["uint64", "bool"]) + data = get_data_of_all_types(has_nulls=True, exclude_dtypes=["uint64", "bool"]) md_df = pd.DataFrame(data) pd_df = pandas.DataFrame(data) @@ -173,7 +173,7 @@ def test_export_when_delayed_computations(): @pytest.mark.parametrize("data_has_nulls", [True, False]) def test_simple_import(data_has_nulls): """Test that ``modin.pandas.utils.from_dataframe`` works properly.""" - data = get_all_types(data_has_nulls) + data = get_data_of_all_types(data_has_nulls) md_df_source = pd.DataFrame(data) md_df_consumer = md_from_dataframe(md_df_source._query_compiler._modin_frame) @@ -184,7 +184,7 @@ def test_simple_import(data_has_nulls): @pytest.mark.parametrize("data_has_nulls", [True, False]) def test_zero_copy_export_for_primitives(data_has_nulls): """Test that basic data types can be zero-copy exported from OmnisciOnNative dataframe.""" - data = get_all_types( + data = get_data_of_all_types( has_nulls=data_has_nulls, include_dtypes=["int", "uint", "float"] ) at = pa.Table.from_pydict(data) @@ -226,7 +226,7 @@ def test_bitmask_chunking(): @pytest.mark.parametrize("nchunks", [2, 9]) def test_buffer_of_chunked_at(data_has_nulls, nchunks): """Test that getting buffers of physically chunked column works properly.""" - data = get_all_types( + data = get_data_of_all_types( # For the simplicity of the test include only primitive types, so the test can use # only one function to export a column instead of if-elsing to find a type-according one has_nulls=data_has_nulls, diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py index c1060caa775..ca7c7f19fe3 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py @@ -15,6 +15,7 @@ import pandas import numpy as np +from typing import Dict from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import ( from_dataframe, @@ -77,7 +78,32 @@ def export_frame(md_df, from_omnisci=False, **kwargs): return exported_df -def get_all_types(has_nulls=False, exclude_dtypes=None, include_dtypes=None): +def get_data_of_all_types( + has_nulls=False, exclude_dtypes=None, include_dtypes=None +) -> Dict[str, np.ndarray]: + """ + Generate a dictionary containing every datatype that is supported by Omnisci implementation of the exchange protocol. + + Parameters + ---------- + has_nulls : bool, default: False + Whether to include columns containing null values. + exclude_dtypes : list, optional + List of type prefixes to exclude in the dictionary. For example, + passing ``["int", "float"]`` excludes all of the signed integer (``int16``, + ``int32``, ``int64``) and float (``float32``, ``float64``) types. + include_dtypes : list, optional + List of type prefixes to include in the dictionary. 
+        passing ``["int", "float"]`` will include ONLY signed integer (``int16``,
+        ``int32``, ``int64``) and float (``float32``, ``float64``) types.
+
+    Returns
+    -------
+    dict
+        Dictionary to pass to a DataFrame constructor. The keys are string column names
+        that are equal to the type name of the corresponding column. Columns containing
+        null values have a ``"_null"`` suffix in their names.
+    """
     bool_data = {}
     int_data = {}
     uint_data = {}
From c66191b9250a3aa0ab5ccc77ae0f63a8ec843d5b Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Wed, 16 Mar 2022 14:43:57 +0300
Subject: [PATCH 17/33] Remove redundant 'null_count' check

Signed-off-by: Dmitry Chigarev
---
 .../omnisci_on_native/exchange/dataframe_protocol/column.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
index 252c11948f4..d0d3eb49771 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
@@ -198,8 +198,7 @@
 
     @property
     def null_count(self) -> int:
-        ncount = self._pyarrow_table.column(0).null_count
-        return ncount if ncount >= 0 else None
+        return self._pyarrow_table.column(0).null_count
 
     @property
     def metadata(self) -> Dict[str, Any]:
From 3e1ca03400e8a4a02ee071f2b329e3e500d758a3 Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Wed, 16 Mar 2022 15:01:12 +0300
Subject: [PATCH 18/33] Move 'from_dataframe' logic to core dataframe level

Signed-off-by: Dmitry Chigarev
---
 .../omnisci_on_native/dataframe/dataframe.py  | 34 +++++++++++++++++
 .../storage_formats/omnisci/query_compiler.py |  7 +---
 .../omnisci/test_protocol.py                  |  2 +-
 3 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
index 23f39701303..d33ae2415bc 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
@@ -2075,6 +2075,40 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True):
             self, nan_as_null=nan_as_null, allow_copy=allow_copy
         )
 
+    @classmethod
+    def from_dataframe(cls, df: "ProtocolDataframe") -> "OmnisciOnNativeDataframe":
+        """
+        Convert a DataFrame implementing the dataframe exchange protocol to a Core Modin Dataframe.
+
+        See more about the protocol in https://data-apis.org/dataframe-protocol/latest/index.html.
+
+        Parameters
+        ----------
+        df : ProtocolDataframe
+            The DataFrame object supporting the dataframe exchange protocol.
+
+        Returns
+        -------
+        OmnisciOnNativeDataframe
+            A new Core Modin Dataframe object.
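+
+        Notes
+        -----
+        As the TODO in the body below notes, the conversion currently round-trips
+        through pandas rather than building a PyArrow table directly, so it is not
+        zero-copy yet.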
+        """
+        if isinstance(df, cls):
+            return df
+
+        if not hasattr(df, "__dataframe__"):
+            raise ValueError(
+                "`df` does not support DataFrame exchange protocol (``__dataframe__``)"
+            )
+
+        from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import (
+            from_dataframe,
+        )
+
+        # TODO: build PyArrow table instead of pandas DataFrame from the protocol object
+        # as it's possible to do zero-copy with `cls.from_arrow`
+        pd_df = from_dataframe(df)
+        return cls.from_pandas(pd_df)
+
     columns = property(_get_columns)
     index = property(_get_index)
diff --git a/modin/experimental/core/storage_formats/omnisci/query_compiler.py b/modin/experimental/core/storage_formats/omnisci/query_compiler.py
index 9564fc7ecce..7abdda1c15a 100644
--- a/modin/experimental/core/storage_formats/omnisci/query_compiler.py
+++ b/modin/experimental/core/storage_formats/omnisci/query_compiler.py
@@ -210,12 +210,7 @@ def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True):
 
     @classmethod
     def from_dataframe(cls, df, data_cls):
-        from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import (
-            from_dataframe,
-        )
-
-        pd_df = from_dataframe(df)
-        return data_cls.from_pandas(pd_df)
+        return data_cls.from_dataframe(df)
 
     # END Dataframe exchange protocol
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index 3f4964c3474..50b3c9b1a6c 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -176,7 +176,7 @@ def test_simple_import(data_has_nulls):
     data = get_data_of_all_types(data_has_nulls)
 
     md_df_source = pd.DataFrame(data)
-    md_df_consumer = md_from_dataframe(md_df_source._query_compiler._modin_frame)
+    md_df_consumer = md_from_dataframe(md_df_source)
 
     df_equals(md_df_source, md_df_consumer)
From a0219f48f5d7ab4b29f501fd5565c782c193e46e Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Wed, 16 Mar 2022 19:12:50 +0300
Subject: [PATCH 19/33] Apply review suggestions

Signed-off-by: Dmitry Chigarev
---
 .../base/exchange/dataframe_protocol/from_dataframe.py | 8 ++++----
 .../omnisci_on_native/dataframe/dataframe.py           | 4 ++--
 .../dataframe_protocol/omnisci/test_protocol.py        | 4 ++--
 .../exchange/dataframe_protocol/omnisci/utils.py       | 6 +++---
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py
index f6cca28baea..2ceb3f80daa 100644
--- a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py
+++ b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py
@@ -29,7 +29,7 @@
 )
 
 
-def from_dataframe(
+def from_dataframe_to_pandas(
     df: ProtocolDataframe, allow_copy: bool = True, nchunks: Optional[int] = None
 ):
     """
@@ -79,7 +79,7 @@ def get_pandas_df(df):
         elif dtype == DTypeKind.STRING:
             columns[name], buf = convert_string_column(col)
         elif dtype == DTypeKind.DATETIME:
-            columns[name], buf = convert_datetime_col(col)
+            columns[name], buf = convert_datetime_column(col)
         else:
             raise NotImplementedError(f"Data type {dtype} not handled yet")
 
@@ -149,7 +149,7 @@ def convert_categorical_column(col: ProtocolColumn) -> Tuple[pandas.Series, Any]
     codes_buff, codes_dtype = buffers["data"]
     codes = buffer_to_ndarray(codes_buff, codes_dtype, col.offset, col.size)
 
-    # Doing modulo in order to not get IndexError for out-of-bounds sentinel values in `codes`
+    # Doing modulo in order to not get ``IndexError`` for out-of-bounds sentinel values in `codes`
     values = categories[codes % len(categories)]
 
     cat = pandas.Categorical(values, categories=categories, ordered=ordered)
@@ -233,7 +233,7 @@ def convert_string_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]:
     return np.asarray(str_list, dtype="object"), buffers
 
 
-def convert_datetime_col(col: ProtocolColumn) -> Tuple[np.ndarray, Any]:
+def convert_datetime_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]:
     """
     Convert Column holding DateTime data to a NumPy array.
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
index d33ae2415bc..33e44798ed1 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py
@@ -2101,12 +2101,12 @@ def from_dataframe(cls, df: "ProtocolDataframe") -> "OmnisciOnNativeDataframe":
         )
 
         from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import (
-            from_dataframe,
+            from_dataframe_to_pandas,
         )
 
         # TODO: build PyArrow table instead of pandas DataFrame from the protocol object
         # as it's possible to do zero-copy with `cls.from_arrow`
-        pd_df = from_dataframe(df)
+        pd_df = from_dataframe_to_pandas(df)
         return cls.from_pandas(pd_df)
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index 50b3c9b1a6c..fe4464e9ed2 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -24,7 +24,7 @@
-from modin.pandas.utils import from_arrow, from_dataframe as md_from_dataframe
+from modin.pandas.utils import from_arrow, from_dataframe
 from modin.pandas.test.utils import df_equals
 from modin.test.test_utils import warns_that_defaulting_to_pandas
@@ -176,7 +176,7 @@ def test_simple_import(data_has_nulls):
 
     md_df_source = pd.DataFrame(data)
-    md_df_consumer = md_from_dataframe(md_df_source)
+    md_df_consumer = from_dataframe(md_df_source)
 
     df_equals(md_df_source, md_df_consumer)
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
index ca7c7f19fe3..e0ccecd705f 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
@@ -18,7 +18,7 @@
 from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import (
-    from_dataframe,
+    from_dataframe_to_pandas,
 )
 from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import (
     ForceOmnisciImport,
@@ -68,13 +68,11 @@ def export_frame(md_df, from_omnisci=False, **kwargs):
     pandas.DataFrame
     """
     if not from_omnisci:
-        return from_dataframe(md_df, **kwargs)
+        return from_dataframe_to_pandas(md_df, **kwargs)
 
    with ForceOmnisciImport(md_df) as
instance: md_df_exported = instance.export_frames()[0] - exported_df = from_dataframe(md_df_exported, **kwargs) + exported_df = from_dataframe_to_pandas(md_df_exported, **kwargs) return exported_df From e6908c67270531b523ea786660544454896ec52a Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Wed, 16 Mar 2022 19:58:47 +0300 Subject: [PATCH 20/33] Apply suggestions from code review Co-authored-by: Yaroslav Igoshev --- .../base/exchange/dataframe_protocol/from_dataframe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py index 2ceb3f80daa..3f77031cb46 100644 --- a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py @@ -114,7 +114,7 @@ def convert_primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray Returns ------- tuple - Tuple of numpy.ndarray holding the data and the memory owner object that keeps the memory alive. + Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. """ buffers = col.get_buffers() @@ -170,7 +170,7 @@ def convert_string_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: Returns ------- tuple - Tuple of numpy.ndarray holding the data and the memory owner object that keeps the memory alive. + Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. """ if col.describe_null[0] not in ( ColumnNullType.NON_NULLABLE, @@ -244,7 +244,7 @@ def convert_datetime_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: Returns ------- tuple - Tuple of numpy.ndarray holding the data and the memory owner object that keeps the memory alive. + Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. """ buffers = col.get_buffers() @@ -418,7 +418,7 @@ def set_nulls( Parameters ---------- - data : numpy.ndarray or pandas.Series + data : np.ndarray or pandas.Series Data to set nulls in. col : ProtocolColumn Column object that describes the `data`. @@ -431,7 +431,7 @@ def set_nulls( Returns ------- - numpy.ndarray or pandas.Series + np.ndarray or pandas.Series Data with the nulls being set. 
""" null_kind, sentinel_val = col.describe_null From e4e36e6acc9dce78aad82db227d246e4cb62bf01 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Wed, 16 Mar 2022 21:55:08 +0300 Subject: [PATCH 21/33] Move 'from_dataframe.py' from base to pandas Signed-off-by: Dmitry Chigarev --- .../exchange/dataframe_protocol/from_dataframe.py | 0 .../implementations/omnisci_on_native/dataframe/dataframe.py | 2 +- modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py | 2 +- modin/test/exchange/dataframe_protocol/omnisci/utils.py | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename modin/core/dataframe/{base => pandas}/exchange/dataframe_protocol/from_dataframe.py (100%) diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py similarity index 100% rename from modin/core/dataframe/base/exchange/dataframe_protocol/from_dataframe.py rename to modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py index 33e44798ed1..5db94b09c83 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py @@ -2100,7 +2100,7 @@ def from_dataframe(cls, df: "ProtocolDataframe") -> "OmnisciOnNativeDataframe": "`df` does not support DataFrame exchange protocol (``__dataframe__``)" ) - from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import ( + from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import ( from_dataframe_to_pandas, ) diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index fe4464e9ed2..ab9952caf02 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -19,7 +19,7 @@ import numpy as np import modin.pandas as pd -from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import ( +from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import ( convert_primitive_column_to_ndarray, buffer_to_ndarray, set_nulls, diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py index e0ccecd705f..7e1b73e2718 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py @@ -17,7 +17,7 @@ import numpy as np from typing import Dict -from modin.core.dataframe.base.exchange.dataframe_protocol.from_dataframe import ( +from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import ( from_dataframe_to_pandas, ) from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import ( From ada742b336f7afa25bfd2e8dcfea064b5c3d50b8 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev <62142979+dchigarev@users.noreply.github.com> Date: Thu, 17 Mar 2022 18:18:28 +0300 Subject: [PATCH 22/33] Apply suggestions from code review Co-authored-by: Yaroslav Igoshev --- .../dataframe_protocol/from_dataframe.py | 26 +++++++++---------- .../omnisci_on_native/dataframe/dataframe.py | 6 ++--- 2 files changed, 16 
insertions(+), 16 deletions(-) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index 3f77031cb46..b20603f7acb 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -11,13 +11,13 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. -"""Module houses functions building a ``pandas.DataFrame`` from DataFrame exchange protocol object.""" +"""Module houses the functions building a ``pandas.DataFrame`` from a DataFrame exchange protocol object.""" import pandas -import ctypes import numpy as np - +import ctypes from typing import Optional, Tuple, Any, Union + from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( DTypeKind, ColumnNullType, @@ -30,15 +30,15 @@ def from_dataframe_to_pandas( - df: ProtocolDataframe, allow_copy: bool = True, nchunks: Optional[int] = None + df: ProtocolDataframe, nchunks: Optional[int] = None ): """ - Build ``pandas.DataFrame`` from an object supporting DataFrame exchange protocol (__dataframe__). + Build a ``pandas.DataFrame`` from an object supporting the DataFrame exchange protocol, i.e. `__dataframe__` method. Parameters ---------- df : ProtocolDataframe - Object supporting the exchange protocol (__dataframe__). + Object supporting the exchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True Whether to allow for `df` providing a copy of underlying data. nchunks : int, optional @@ -56,8 +56,8 @@ def from_dataframe_to_pandas( df = df["dataframe"] def get_pandas_df(df): - # We need a dict of columns here, with each column being a numpy array (at - # least for now, deal with non-numpy dtypes later). + # We need a dict of columns here, with each column being a NumPy array (at + # least for now, deal with non-NumPy dtypes later). columns = dict() buffers = [] # hold on to buffers, keeps memory alive for name in df.column_names(): @@ -105,7 +105,7 @@ def get_pandas_df(df): def convert_primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ - Convert Column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. + Convert a column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. Parameters ---------- @@ -127,7 +127,7 @@ def convert_primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray def convert_categorical_column(col: ProtocolColumn) -> Tuple[pandas.Series, Any]: """ - Convert Column holding categorical data to a pandas Series. + Convert a column holding categorical data to a pandas Series. Parameters ---------- @@ -161,7 +161,7 @@ def convert_categorical_column(col: ProtocolColumn) -> Tuple[pandas.Series, Any] def convert_string_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ - Convert Column holding string data to a NumPy array. + Convert a column holding string data to a NumPy array. Parameters ---------- @@ -235,7 +235,7 @@ def convert_string_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: def convert_datetime_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ - Convert Column holding DateTime data to a NumPy array. + Convert a column holding DateTime data to a NumPy array. 
Parameters ---------- @@ -274,7 +274,7 @@ def convert_datetime_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: # date 'td{Days/Ms}' unit = format_str[2:] if unit == "D": - # numpy doesn't support DAY unit, so converting days to seconds + # NumPy doesn't support DAY unit, so converting days to seconds # (converting to uint64 to avoid overflow) data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]") elif unit == "m": diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py index 5db94b09c83..7261d3d1e39 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py @@ -2058,7 +2058,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): Returns ------- ProtocolDataframe - A dataframe object following the dataframe protocol specification. + A dataframe object following the dataframe exchange protocol specification. """ if self._has_unsupported_data: pd_df = self.to_pandas() @@ -2097,14 +2097,14 @@ def from_dataframe(cls, df: "ProtocolDataframe") -> "OmnisciOnNativeDataframe": if not hasattr(df, "__dataframe__"): raise ValueError( - "`df` does not support DataFrame exchange protocol (``__dataframe__``)" + "`df` does not support DataFrame exchange protocol, i.e. `__dataframe__` method" ) from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import ( from_dataframe_to_pandas, ) - # TODO: build PyArrow table instead of pandas DataFrame from the protocol object + # TODO: build a PyArrow table instead of a pandas DataFrame from the protocol object # as it's possible to do zero-copy with `cls.from_arrow` pd_df = from_dataframe_to_pandas(df) return cls.from_pandas(pd_df) From 172a59a3d7aa22b9d2ab88367c754d87ff5d294c Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 17 Mar 2022 18:24:30 +0300 Subject: [PATCH 23/33] Align var name 'nchunks' -> 'n_chunks' Signed-off-by: Dmitry Chigarev --- .../dataframe_protocol/from_dataframe.py | 8 ++-- .../exchange/dataframe_protocol/column.py | 2 +- .../omnisci/test_protocol.py | 46 +++++++++---------- .../dataframe_protocol/omnisci/utils.py | 12 ++--- 4 files changed, 33 insertions(+), 35 deletions(-) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index b20603f7acb..c70309f751d 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -29,9 +29,7 @@ ) -def from_dataframe_to_pandas( - df: ProtocolDataframe, nchunks: Optional[int] = None -): +def from_dataframe_to_pandas(df: ProtocolDataframe, n_chunks: Optional[int] = None): """ Build a ``pandas.DataFrame`` from an object supporting the DataFrame exchange protocol, i.e. `__dataframe__` method. @@ -41,7 +39,7 @@ def from_dataframe_to_pandas( Object supporting the exchange protocol, i.e. `__dataframe__` method. allow_copy : bool, default: True Whether to allow for `df` providing a copy of underlying data. - nchunks : int, optional + n_chunks : int, optional Number of chunks to split `df`. 
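For orientation, a minimal consumer-side sketch of driving this entry point. The data is hypothetical, and the snippet assumes an OmniSci-enabled Modin environment plus the import path from the hunks above:

    import pandas
    import modin.pandas as pd
    from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import (
        from_dataframe_to_pandas,
    )

    # Any producer exposing `__dataframe__` can be passed here, mirroring
    # how `export_frame` in the test utils drives this function.
    md_df = pd.DataFrame({"a": [1, 2, 3], "b": [1.5, None, 3.0]})
    pd_df = from_dataframe_to_pandas(md_df)  # or n_chunks=<multiple of the producer's chunk count>
    assert isinstance(pd_df, pandas.DataFrame)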
Returns @@ -90,7 +88,7 @@ def get_pandas_df(df): return pandas_df pandas_dfs = [] - for chunk in df.get_chunks(nchunks): + for chunk in df.get_chunks(n_chunks): pandas_df = get_pandas_df(chunk) pandas_dfs.append(pandas_df) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py index d0d3eb49771..40b980df8bc 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -475,7 +475,7 @@ def _propagate_dtype(self, dtype: Tuple[DTypeKind, int, str, str]): ) # TODO: currently, each column chunk casts its buffers independently which results - # in an `NCHUNKS - 1` amount of redundant casts. We can make the PyArrow table + # in an `N_CHUNKS - 1` amount of redundant casts. We can make the PyArrow table # being shared across all the chunks, so the cast being triggered in a single chunk # propagate to all of them. self._cast_at(schema_to_cast) diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index ab9952caf02..51c79212e97 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -48,42 +48,42 @@ def test_simple_export(data_has_nulls, from_omnisci): exported_df = export_frame(md_df, from_omnisci) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, from_omnisci, nchunks=3) + exported_df = export_frame(md_df, from_omnisci, n_chunks=3) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, from_omnisci, nchunks=5) + exported_df = export_frame(md_df, from_omnisci, n_chunks=5) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, from_omnisci, nchunks=12) + exported_df = export_frame(md_df, from_omnisci, n_chunks=12) df_equals(md_df, exported_df) -@pytest.mark.parametrize("nchunks", [2, 4, 7]) +@pytest.mark.parametrize("n_chunks", [2, 4, 7]) @pytest.mark.parametrize("data_has_nulls", [True, False]) -def test_export_aligned_at_chunks(nchunks, data_has_nulls): +def test_export_aligned_at_chunks(n_chunks, data_has_nulls): """Test export from DataFrame exchange protocol when internal PyArrow table is equaly chunked.""" # Modin DataFrame constructor can't process PyArrow's category, so exclude it data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) - pd_chunks = split_df_into_chunks(pd_df, nchunks) + pd_chunks = split_df_into_chunks(pd_df, n_chunks) chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks]) md_df = from_arrow(chunked_at) assert ( len(md_df._query_compiler._modin_frame._partitions[0][0].get().column(0).chunks) - == nchunks + == n_chunks ) exported_df = export_frame(md_df) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, nchunks=nchunks) + exported_df = export_frame(md_df, n_chunks=n_chunks) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, nchunks=nchunks * 2) + exported_df = export_frame(md_df, n_chunks=n_chunks * 2) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, nchunks=nchunks * 3) + exported_df = export_frame(md_df, n_chunks=n_chunks * 3) df_equals(md_df, exported_df) @@ 
-110,8 +110,8 @@ def test_export_unaligned_at_chunks(data_has_nulls): ] pd_chunk_groups = [ - split_df_into_chunks(pd_df.iloc[:, cols], nchunks) - for nchunks, cols in zip(chunk_groups, chunk_col_ilocs) + split_df_into_chunks(pd_df.iloc[:, cols], n_chunks) + for n_chunks, cols in zip(chunk_groups, chunk_col_ilocs) ] at_chunk_groups = [ pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in chunk_group]) @@ -127,22 +127,22 @@ def test_export_unaligned_at_chunks(data_has_nulls): # verify that test generated the correct chunking internal_at = md_df._query_compiler._modin_frame._partitions[0][0].get() - for nchunks_group, cols in zip(chunk_groups, chunk_col_ilocs): + for n_chunks_group, cols in zip(chunk_groups, chunk_col_ilocs): for col in internal_at.select(range(cols.start, cols.stop)).columns: - assert len(col.chunks) == nchunks_group + assert len(col.chunks) == n_chunks_group - nchunks = md_df.__dataframe__().num_chunks() + n_chunks = md_df.__dataframe__().num_chunks() exported_df = export_frame(md_df) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, nchunks=nchunks) + exported_df = export_frame(md_df, n_chunks=n_chunks) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, nchunks=nchunks * 2) + exported_df = export_frame(md_df, n_chunks=n_chunks * 2) df_equals(md_df, exported_df) - exported_df = export_frame(md_df, nchunks=nchunks * 3) + exported_df = export_frame(md_df, n_chunks=n_chunks * 3) df_equals(md_df, exported_df) @@ -216,15 +216,15 @@ def test_bitmask_chunking(): assert at["col"].type.bit_width == 1 md_df = from_arrow(at) - # Column length is 25, nchunks is 2, meaning that the split will occur in the middle + # Column length is 25, n_chunks is 2, meaning that the split will occur in the middle # of the second byte - exported_df = export_frame(md_df, nchunks=2) + exported_df = export_frame(md_df, n_chunks=2) df_equals(md_df, exported_df) @pytest.mark.parametrize("data_has_nulls", [True, False]) -@pytest.mark.parametrize("nchunks", [2, 9]) -def test_buffer_of_chunked_at(data_has_nulls, nchunks): +@pytest.mark.parametrize("n_chunks", [2, 9]) +def test_buffer_of_chunked_at(data_has_nulls, n_chunks): """Test that getting buffers of physically chunked column works properly.""" data = get_data_of_all_types( # For the simplicity of the test include only primitive types, so the test can use @@ -234,7 +234,7 @@ def test_buffer_of_chunked_at(data_has_nulls, nchunks): ) pd_df = pandas.DataFrame(data) - pd_chunks = split_df_into_chunks(pd_df, nchunks) + pd_chunks = split_df_into_chunks(pd_df, n_chunks) chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks]) md_df = from_arrow(chunked_at) diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py index 7e1b73e2718..18bb295245f 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py @@ -25,15 +25,15 @@ ) -def split_df_into_chunks(df, nchunks): +def split_df_into_chunks(df, n_chunks): """ - Split passed DataFrame into `nchunks` along row axis. + Split passed DataFrame into `n_chunks` along row axis. Parameters ---------- df : DataFrame DataFrame to split into chunks. - nchunks : int + n_chunks : int Number of chunks to split `df` into. 
Returns @@ -41,9 +41,9 @@ def split_df_into_chunks(df, nchunks): list of DataFrames """ chunks = [] - for i in range(nchunks): - start = i * len(df) // nchunks - end = (i + 1) * len(df) // nchunks + for i in range(n_chunks): + start = i * len(df) // n_chunks + end = (i + 1) * len(df) // n_chunks chunks.append(df.iloc[start:end]) return chunks From db9e32a600a1bad6e5a9d74ddc323bc888eb3db3 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 17 Mar 2022 18:31:06 +0300 Subject: [PATCH 24/33] Align convertion function to 'smt_to_smt' format Signed-off-by: Dmitry Chigarev --- .../dataframe_protocol/from_dataframe.py | 20 +++++++++---------- .../omnisci/test_protocol.py | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index c70309f751d..f596aea0888 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -71,13 +71,13 @@ def get_pandas_df(df): DTypeKind.FLOAT, DTypeKind.BOOL, ): - columns[name], buf = convert_primitive_column_to_ndarray(col) + columns[name], buf = primitive_column_to_ndarray(col) elif dtype == DTypeKind.CATEGORICAL: - columns[name], buf = convert_categorical_column(col) + columns[name], buf = categorical_column_to_series(col) elif dtype == DTypeKind.STRING: - columns[name], buf = convert_string_column(col) + columns[name], buf = string_column_to_ndarray(col) elif dtype == DTypeKind.DATETIME: - columns[name], buf = convert_datetime_column(col) + columns[name], buf = datetime_column_to_ndarray(col) else: raise NotImplementedError(f"Data type {dtype} not handled yet") @@ -101,7 +101,7 @@ def get_pandas_df(df): return pandas_df -def convert_primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: +def primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ Convert a column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. @@ -123,7 +123,7 @@ def convert_primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray return data, buffers -def convert_categorical_column(col: ProtocolColumn) -> Tuple[pandas.Series, Any]: +def categorical_column_to_series(col: ProtocolColumn) -> Tuple[pandas.Series, Any]: """ Convert a column holding categorical data to a pandas Series. @@ -157,7 +157,7 @@ def convert_categorical_column(col: ProtocolColumn) -> Tuple[pandas.Series, Any] return data, buffers -def convert_string_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: +def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ Convert a column holding string data to a NumPy array. @@ -231,7 +231,7 @@ def convert_string_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: return np.asarray(str_list, dtype="object"), buffers -def convert_datetime_column(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: +def datetime_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ Convert a column holding DateTime data to a NumPy array. @@ -343,14 +343,14 @@ def buffer_to_ndarray( if bit_width == 1: assert length is not None, "`length` must be specified for a bit-mask buffer." 
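Backing up to the slicing math of `split_df_into_chunks` above, a quick worked sketch on a hypothetical 10-row frame. Floor division makes the bounds land unevenly, so the last chunk absorbs the remainder:

    import pandas

    df = pandas.DataFrame({"a": range(10)})
    n_chunks = 3
    # The same bounds `split_df_into_chunks` computes per iteration
    bounds = [
        (i * len(df) // n_chunks, (i + 1) * len(df) // n_chunks)
        for i in range(n_chunks)
    ]
    assert bounds == [(0, 3), (3, 6), (6, 10)]
    assert [len(df.iloc[start:end]) for start, end in bounds] == [3, 3, 4]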
arr = np.ctypeslib.as_array(data_pointer, shape=(buffer.bufsize,)) - return bitmask_to_bool_array(arr, length, first_byte_offset=offset % 8) + return bitmask_to_bool_ndarray(arr, length, first_byte_offset=offset % 8) else: return np.ctypeslib.as_array( data_pointer, shape=(buffer.bufsize // (bit_width // 8),) ) -def bitmask_to_bool_array( +def bitmask_to_bool_ndarray( bitmask: np.ndarray, mask_length: int, first_byte_offset: int = 0 ) -> np.ndarray: """ diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index 51c79212e97..4ba8b6e6b4d 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -20,7 +20,7 @@ import modin.pandas as pd from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import ( - convert_primitive_column_to_ndarray, + primitive_column_to_ndarray, buffer_to_ndarray, set_nulls, ) @@ -193,7 +193,7 @@ def test_zero_copy_export_for_primitives(data_has_nulls): protocol_df = md_df.__dataframe__(allow_copy=False) for i, col in enumerate(protocol_df.get_columns()): - col_arr, memory_owner = convert_primitive_column_to_ndarray(col) + col_arr, memory_owner = primitive_column_to_ndarray(col) exported_ptr = col_arr.__array_interface__["data"][0] source_ptr = at.column(i).chunks[0].buffers()[-1].address @@ -205,7 +205,7 @@ def test_zero_copy_export_for_primitives(data_has_nulls): non_zero_copy_protocol_df = md_df.__dataframe__(allow_copy=False) with pytest.raises(RuntimeError): - col_arr, memory_owner = convert_primitive_column_to_ndarray( + col_arr, memory_owner = primitive_column_to_ndarray( non_zero_copy_protocol_df.get_column_by_name("float32") ) From f035d3c0806ca92a45aceb5776907f1822ee6904 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 17 Mar 2022 18:38:18 +0300 Subject: [PATCH 25/33] Apply formatting suggestions Signed-off-by: Dmitry Chigarev --- .../dataframe_protocol/from_dataframe.py | 8 ++--- .../omnisci/test_protocol.py | 30 +++++++------------ .../dataframe_protocol/omnisci/utils.py | 2 +- 3 files changed, 16 insertions(+), 24 deletions(-) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index f596aea0888..37a8867fce5 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -170,13 +170,15 @@ def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: tuple Tuple of np.ndarray holding the data and the memory owner object that keeps the memory alive. """ - if col.describe_null[0] not in ( + null_kind, sentinel_val = col.describe_null + + if null_kind not in ( ColumnNullType.NON_NULLABLE, ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK, ): raise NotImplementedError( - f"{col.describe_null[0]} null kind is not yet supported for string columns." + f"{null_kind} null kind is not yet supported for string columns." 
) buffers = col.get_buffers() @@ -198,9 +200,7 @@ def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: offset_buff, offset_dtype = buffers["offsets"] offsets = buffer_to_ndarray(offset_buff, offset_dtype, col.offset, col.size + 1) - null_kind, sentinel_val = col.describe_null null_pos = None - if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): valid_buff, valid_dtype = buffers["validity"] null_pos = buffer_to_ndarray(valid_buff, valid_dtype, col.offset, col.size) diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index 4ba8b6e6b4d..42dc69c9705 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -32,7 +32,8 @@ @pytest.mark.parametrize("data_has_nulls", [True, False]) @pytest.mark.parametrize("from_omnisci", [True, False]) -def test_simple_export(data_has_nulls, from_omnisci): +@pytest.mark.parametrize("n_chunks", [None, 3, 5, 12]) +def test_simple_export(data_has_nulls, from_omnisci, n_chunks): if from_omnisci: # OmniSci can't import 'uint64' as well as booleans # issue for bool: https://github.com/modin-project/modin/issues/4299 @@ -45,16 +46,7 @@ def test_simple_export(data_has_nulls, from_omnisci): ) md_df = pd.DataFrame(data) - exported_df = export_frame(md_df, from_omnisci) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, from_omnisci, n_chunks=3) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, from_omnisci, n_chunks=5) - df_equals(md_df, exported_df) - - exported_df = export_frame(md_df, from_omnisci, n_chunks=12) + exported_df = export_frame(md_df, from_omnisci, n_chunks=n_chunks) df_equals(md_df, exported_df) @@ -62,7 +54,7 @@ def test_simple_export(data_has_nulls, from_omnisci): @pytest.mark.parametrize("data_has_nulls", [True, False]) def test_export_aligned_at_chunks(n_chunks, data_has_nulls): """Test export from DataFrame exchange protocol when internal PyArrow table is equaly chunked.""" - # Modin DataFrame constructor can't process PyArrow's category, so exclude it + # Modin DataFrame constructor can't process PyArrow's category when using ``from_arrow``, so exclude it data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) pd_chunks = split_df_into_chunks(pd_df, n_chunks) @@ -96,7 +88,7 @@ def test_export_unaligned_at_chunks(data_has_nulls): each column has its individual chunking and so some preprocessing is required in order to emulate equaly chunked columns in the protocol. 
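To make the offsets/data traversal of `string_column_to_ndarray` above concrete, a self-contained toy decode over hypothetical buffers, NumPy only:

    import numpy as np

    # UTF-8 code units and the demarcating offsets, as the protocol lays them out
    data = np.frombuffer(b"foobarbaz", dtype=np.uint8)
    offsets = np.array([0, 3, 6, 9], dtype=np.int32)  # col.size == 3, hence col.size + 1 offsets
    null_pos = np.array([False, True, False])  # the second value is null

    str_list = [None] * (len(offsets) - 1)
    for i in range(len(str_list)):
        if null_pos[i]:
            str_list[i] = np.nan
            continue
        # Extract a range of code units and decode it
        str_list[i] = bytes(data[offsets[i] : offsets[i + 1]]).decode("utf-8")

    assert str_list[0] == "foo" and str_list[2] == "baz"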
""" - # Modin DataFrame constructor can't process PyArrow's category, so exclude it + # Modin DataFrame constructor can't process PyArrow's category when using ``from_arrow``, so exclude it data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) # divide columns in 3 groups: unchunked, 2-chunked, 7-chunked @@ -175,10 +167,10 @@ def test_simple_import(data_has_nulls): """Test that ``modin.pandas.utils.from_dataframe`` works properly.""" data = get_data_of_all_types(data_has_nulls) - md_df_source = pd.DataFrame(data) - md_df_consumer = from_dataframe(md_df_source) + md_df_producer = pd.DataFrame(data) + md_df_consumer = from_dataframe(md_df_producer) - df_equals(md_df_source, md_df_consumer) + df_equals(md_df_producer, md_df_consumer) @pytest.mark.parametrize("data_has_nulls", [True, False]) @@ -196,9 +188,9 @@ def test_zero_copy_export_for_primitives(data_has_nulls): col_arr, memory_owner = primitive_column_to_ndarray(col) exported_ptr = col_arr.__array_interface__["data"][0] - source_ptr = at.column(i).chunks[0].buffers()[-1].address - # Verify that the pointers of source and exported objects point to the same data - assert source_ptr == exported_ptr + producer_ptr = at.column(i).chunks[0].buffers()[-1].address + # Verify that the pointers of produce and exported objects point to the same data + assert producer_ptr == exported_ptr # Can't export `md_df` zero-copy no more as it has delayed 'fillna' operation md_df = md_df.fillna({"float32": 32.0}) diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py index 18bb295245f..0a830c0d579 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py @@ -62,7 +62,7 @@ def export_frame(md_df, from_omnisci=False, **kwargs): data into OmniSci and then export it back, so the origin for underlying `md_df` data is OmniSci. **kwargs : dict - Additional parameters to pass to the ``from_dataframe`` function. + Additional parameters to pass to the ``from_dataframe_to_pandas`` function. 
Returns ------- From 0029209a410247ddaff2796389a845004a7f5d37 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 17 Mar 2022 18:52:25 +0300 Subject: [PATCH 26/33] Add extra 'num_chunks' protocol check Signed-off-by: Dmitry Chigarev --- modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index 42dc69c9705..72896955584 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -63,6 +63,7 @@ def test_export_aligned_at_chunks(n_chunks, data_has_nulls): md_df = from_arrow(chunked_at) assert ( len(md_df._query_compiler._modin_frame._partitions[0][0].get().column(0).chunks) + == md_df.__dataframe__().num_chunks() == n_chunks ) From 5104f7db952f4fe78b88c9ecb2f336108beb2099 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 17 Mar 2022 20:27:31 +0300 Subject: [PATCH 27/33] Apply review suggestions Signed-off-by: Dmitry Chigarev --- .../pandas/exchange/dataframe_protocol/from_dataframe.py | 2 -- .../test/exchange/dataframe_protocol/omnisci/test_protocol.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index 37a8867fce5..0f1cec49fb8 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -37,8 +37,6 @@ def from_dataframe_to_pandas(df: ProtocolDataframe, n_chunks: Optional[int] = No ---------- df : ProtocolDataframe Object supporting the exchange protocol, i.e. `__dataframe__` method. - allow_copy : bool, default: True - Whether to allow for `df` providing a copy of underlying data. n_chunks : int, optional Number of chunks to split `df`. diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py index 72896955584..cb52312b5ee 100644 --- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py +++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py @@ -54,7 +54,7 @@ def test_simple_export(data_has_nulls, from_omnisci, n_chunks): @pytest.mark.parametrize("data_has_nulls", [True, False]) def test_export_aligned_at_chunks(n_chunks, data_has_nulls): """Test export from DataFrame exchange protocol when internal PyArrow table is equaly chunked.""" - # Modin DataFrame constructor can't process PyArrow's category when using ``from_arrow``, so exclude it + # Modin DataFrame constructor can't process PyArrow's category when using `from_arrow`, so exclude it data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) pd_chunks = split_df_into_chunks(pd_df, n_chunks) @@ -89,7 +89,7 @@ def test_export_unaligned_at_chunks(data_has_nulls): each column has its individual chunking and so some preprocessing is required in order to emulate equaly chunked columns in the protocol. 
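A small sketch of the invariant behind this extra check, using a hypothetical frame and assuming the OmniSci-backed protocol object from these patches:

    import modin.pandas as pd

    md_df = pd.DataFrame({"a": range(10)})
    protocol_df = md_df.__dataframe__()

    n = protocol_df.num_chunks()
    # `n_chunks` passed to `get_chunks` must be a multiple of `num_chunks()`
    chunks = list(protocol_df.get_chunks(n * 2))
    assert len(chunks) == 2 * n
    assert sum(chunk.num_rows() for chunk in chunks) == protocol_df.num_rows()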
""" - # Modin DataFrame constructor can't process PyArrow's category when using ``from_arrow``, so exclude it + # Modin DataFrame constructor can't process PyArrow's category when using `from_arrow`, so exclude it data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"]) pd_df = pandas.DataFrame(data) # divide columns in 3 groups: unchunked, 2-chunked, 7-chunked From 8e846d81a370f2c1f6a46b7c18f504244e3e9eba Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Thu, 17 Mar 2022 20:29:46 +0300 Subject: [PATCH 28/33] Add release note Signed-off-by: Dmitry Chigarev --- docs/release_notes/release_notes-0.14.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release_notes/release_notes-0.14.0.rst b/docs/release_notes/release_notes-0.14.0.rst index 0bf78cbbcb8..a13762e740b 100644 --- a/docs/release_notes/release_notes-0.14.0.rst +++ b/docs/release_notes/release_notes-0.14.0.rst @@ -47,6 +47,7 @@ Key Features and Updates * * Developer API enhancements * FEAT-#4245: Define base interface for dataframe exchange protocol (#4246) + * FEAT-#4244: Implement dataframe exchange protocol for OmnisciOnNative execution (#4269) * Update testing suite * TEST-#3628: Report coverage data for `test-internals` CI job (#4198) * TEST-#3938: Test tutorial notebooks in CI (#4145) From 718414d2f20d0910e2a23789103d7ca59cedb4e4 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Tue, 22 Mar 2022 15:11:23 +0300 Subject: [PATCH 29/33] Apply vnlitvinov's suggestions Signed-off-by: Dmitry Chigarev --- .../base/exchange/dataframe_protocol/utils.py | 25 +++ .../dataframe_protocol/from_dataframe.py | 204 ++++++++++-------- .../omnisci_on_native/dataframe/dataframe.py | 2 + .../exchange/dataframe_protocol/column.py | 67 +++--- .../exchange/dataframe_protocol/dataframe.py | 94 ++++---- .../exchange/dataframe_protocol/utils.py | 46 +++- .../storage_formats/omnisci/query_compiler.py | 2 +- .../omnisci/test_protocol.py | 51 ++++- .../dataframe_protocol/omnisci/utils.py | 44 +++- 9 files changed, 372 insertions(+), 163 deletions(-) diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py b/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py index 96369662758..87412d4a357 100644 --- a/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py +++ b/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py @@ -127,6 +127,15 @@ class ArrowCTypes: TIME = "tt{resolution}" +class Edianness: + """Enum indicating the byte-order of a data-type.""" + + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + + def pandas_dtype_to_arrow_c(dtype) -> str: """ Represent pandas `dtype` as a format string in Apache Arrow C notation. @@ -159,3 +168,19 @@ def pandas_dtype_to_arrow_c(dtype) -> str: raise NotImplementedError( f"Convertion of {dtype} to Arrow C format string is not implemented." ) + + +def raise_copy_alert(copy_reason=None): + """ + Raise a ``RuntimeError`` mentioning that there's a copy required. + + Parameters + ---------- + copy_reason : str, optional + The reason of making a copy. Should fit to the following format: + 'The copy occured due to {copy_reason}.'. + """ + msg = "Copy required but 'allow_copy=False' is set." + if copy_reason: + msg += f" The copy occured due to {copy_reason}." 
+ raise RuntimeError(msg) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index 0f1cec49fb8..0aa08258832 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -16,11 +16,14 @@ import pandas import numpy as np import ctypes +import re from typing import Optional, Tuple, Any, Union from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( DTypeKind, ColumnNullType, + ArrowCTypes, + Edianness, ) from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( ProtocolDataframe, @@ -29,6 +32,15 @@ ) +np_types_map = { + DTypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64}, + DTypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64}, + DTypeKind.FLOAT: {32: np.float32, 64: np.float64}, + # Consider bitmask to be a uint8 dtype to parse the bits later + DTypeKind.BOOL: {1: np.uint8, 8: bool}, +} + + def from_dataframe_to_pandas(df: ProtocolDataframe, n_chunks: Optional[int] = None): """ Build a ``pandas.DataFrame`` from an object supporting the DataFrame exchange protocol, i.e. `__dataframe__` method. @@ -51,43 +63,9 @@ def from_dataframe_to_pandas(df: ProtocolDataframe, n_chunks: Optional[int] = No if isinstance(df, dict): df = df["dataframe"] - def get_pandas_df(df): - # We need a dict of columns here, with each column being a NumPy array (at - # least for now, deal with non-NumPy dtypes later). - columns = dict() - buffers = [] # hold on to buffers, keeps memory alive - for name in df.column_names(): - if not isinstance(name, str): - raise ValueError(f"Column {name} is not a string") - if name in columns: - raise ValueError(f"Column {name} is not unique") - col = df.get_column_by_name(name) - dtype = col.dtype[0] - if dtype in ( - DTypeKind.INT, - DTypeKind.UINT, - DTypeKind.FLOAT, - DTypeKind.BOOL, - ): - columns[name], buf = primitive_column_to_ndarray(col) - elif dtype == DTypeKind.CATEGORICAL: - columns[name], buf = categorical_column_to_series(col) - elif dtype == DTypeKind.STRING: - columns[name], buf = string_column_to_ndarray(col) - elif dtype == DTypeKind.DATETIME: - columns[name], buf = datetime_column_to_ndarray(col) - else: - raise NotImplementedError(f"Data type {dtype} not handled yet") - - buffers.append(buf) - - pandas_df = pandas.DataFrame(columns) - pandas_df._buffers = buffers - return pandas_df - pandas_dfs = [] for chunk in df.get_chunks(n_chunks): - pandas_df = get_pandas_df(chunk) + pandas_df = protocol_df_chunk_to_pandas(chunk) pandas_dfs.append(pandas_df) pandas_df = pandas.concat(pandas_dfs, axis=0, ignore_index=True) @@ -99,6 +77,52 @@ def get_pandas_df(df): return pandas_df +def protocol_df_chunk_to_pandas(df): + """ + Convert exchange protocol chunk to ``pandas.DataFrame``. + + Parameters + ---------- + df : ProtocolDataframe + + Returns + ------- + pandas.DataFrame + """ + # We need a dict of columns here, with each column being a NumPy array (at + # least for now, deal with non-NumPy dtypes later). 
+ columns = dict() + buffers = [] # hold on to buffers, keeps memory alive + for name in df.column_names(): + if not isinstance(name, str): + raise ValueError(f"Column {name} is not a string") + if name in columns: + raise ValueError(f"Column {name} is not unique") + col = df.get_column_by_name(name) + dtype = col.dtype[0] + if dtype in ( + DTypeKind.INT, + DTypeKind.UINT, + DTypeKind.FLOAT, + DTypeKind.BOOL, + ): + columns[name], buf = primitive_column_to_ndarray(col) + elif dtype == DTypeKind.CATEGORICAL: + columns[name], buf = categorical_column_to_series(col) + elif dtype == DTypeKind.STRING: + columns[name], buf = string_column_to_ndarray(col) + elif dtype == DTypeKind.DATETIME: + columns[name], buf = datetime_column_to_ndarray(col) + else: + raise NotImplementedError(f"Data type {dtype} not handled yet") + + buffers.append(buf) + + pandas_df = pandas.DataFrame(columns) + pandas_df._buffers = buffers + return pandas_df + + def primitive_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: """ Convert a column holding one of the primitive dtypes (int, uint, float or bool) to a NumPy array. @@ -182,21 +206,27 @@ def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: buffers = col.get_buffers() # Retrieve the data buffer containing the UTF-8 code units - data_buff, _ = buffers["data"] + data_buff, protocol_data_dtype = buffers["data"] + # We're going to reinterpret the buffer as uint8, so making sure we can do it safely + assert protocol_data_dtype[1] == 8 # bitwidth == 8 + assert protocol_data_dtype[2] == ArrowCTypes.STRING # format_str == utf-8 # Convert the buffers to NumPy arrays, in order to go from STRING to an equivalent ndarray, # we claim that the buffer is uint8 (i.e., a byte array) data_dtype = ( DTypeKind.UINT, 8, - None, - None, + ArrowCTypes.UINT8, + Edianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size) # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string offset_buff, offset_dtype = buffers["offsets"] - offsets = buffer_to_ndarray(offset_buff, offset_dtype, col.offset, col.size + 1) + # As the offsets buffer size is greater than the data size do `col.size + 1` here + offsets = buffer_to_ndarray( + offset_buff, offset_dtype, col.offset, length=col.size + 1 + ) null_pos = None if null_kind in (ColumnNullType.USE_BITMASK, ColumnNullType.USE_BYTEMASK): @@ -206,11 +236,11 @@ def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: null_pos = ~null_pos # Assemble the strings from the code units - str_list = [] - for i in range(offsets.size - 1): + str_list = [None] * col.size + for i in range(col.size): # Check for missing values if null_pos is not None and null_pos[i]: - str_list.append(np.nan) + str_list[i] = np.nan continue # Extract a range of code units @@ -223,7 +253,7 @@ def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: string = str_bytes.decode(encoding="utf-8") # Add to our list of strings - str_list.append(string) + str_list[i] = string # Convert the string list to a NumPy array return np.asarray(str_list, dtype="object"), buffers @@ -248,38 +278,49 @@ def datetime_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: dbuf, dtype = buffers["data"] # Consider dtype being `uint` to get number of units passed since the 01.01.1970 data = buffer_to_ndarray( - dbuf, (DTypeKind.UINT, dtype[1], "u", "="), col.offset, col.size + 
dbuf,
+        (
+            DTypeKind.UINT,
+            dtype[1],
+            getattr(ArrowCTypes, f"UINT{dtype[1]}"),
+            Edianness.NATIVE,
+        ),
+        col.offset,
+        col.size,
     )
 
-    if format_str.startswith("ts"):
+    def parse_format_str(format_str, data):
+        """Parse datetime `format_str` to interpret the `data`."""
         # timestamp 'ts{unit}:tz'
-        meta = format_str[2:].split(":")
-        if len(meta) == 1:
-            unit = meta[0]
-            tz = ""
-        else:
-            unit, tz = meta
-        if tz != "":
-            raise NotImplementedError("Timezones are not supported yet")
-        if unit != "s":
-            # the format string describes only a first letter of the unit, add one extra
-            # letter to make the unit in numpy-style: 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
-            unit += "s"
-        data = data.astype(f"datetime64[{unit}]")
-    elif format_str.startswith("td"):
+        timestamp_meta = re.findall(r"ts([smun]):(.*)", format_str)
+        if timestamp_meta:
+            unit, tz = timestamp_meta[0]
+            if tz != "":
+                raise NotImplementedError("Timezones are not supported yet")
+            if unit != "s":
+                # the format string describes only a first letter of the unit, add one extra
+                # letter to make the unit in numpy-style: 'm' -> 'ms', 'u' -> 'us', 'n' -> 'ns'
+                unit += "s"
+            data = data.astype(f"datetime64[{unit}]")
+            return data
+
         # date 'td{Days/Ms}'
-        unit = format_str[2:]
-        if unit == "D":
-            # NumPy doesn't support DAY unit, so converting days to seconds
-            # (converting to uint64 to avoid overflow)
-            data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
-        elif unit == "m":
-            data = data.astype("datetime64[ms]")
-        else:
-            raise NotImplementedError(f"Date unit is not supported: {unit}")
-    else:
+        date_meta = re.findall(r"td([Dm])", format_str)
+        if date_meta:
+            unit = date_meta[0]
+            if unit == "D":
+                # NumPy doesn't support DAY unit, so converting days to seconds
+                # (converting to uint64 to avoid overflow)
+                data = (data.astype(np.uint64) * (24 * 60 * 60)).astype("datetime64[s]")
+            elif unit == "m":
+                data = data.astype("datetime64[ms]")
+            else:
+                raise NotImplementedError(f"Date unit is not supported: {unit}")
+            return data
+
         raise NotImplementedError(f"DateTime kind is not supported: {format_str}")
 
+    data = parse_format_str(format_str, data)
     data = set_nulls(data, col, buffers["validity"])
     return data, buffers
 
@@ -316,23 +357,13 @@ def buffer_to_ndarray(
     """
     kind, bit_width, _, _ = dtype
 
-    if kind not in (DTypeKind.INT, DTypeKind.UINT, DTypeKind.FLOAT, DTypeKind.BOOL):
-        raise RuntimeError("Not a boolean, integer or floating-point dtype")
-
-    np_kinds = {
-        DTypeKind.INT: {8: np.int8, 16: np.int16, 32: np.int32, 64: np.int64},
-        DTypeKind.UINT: {8: np.uint8, 16: np.uint16, 32: np.uint32, 64: np.uint64},
-        DTypeKind.FLOAT: {32: np.float32, 64: np.float64},
-        # Consider bitmask to be a uint8 dtype to parse the bits later
-        DTypeKind.BOOL: {1: np.uint8, 8: bool},
-    }
-
-    column_dtype = np_kinds[kind].get(bit_width, None)
+    column_dtype = np_types_map.get(kind, {}).get(bit_width, None)
    if column_dtype is None:
        raise NotImplementedError(f"Convertion for {dtype} is not yet supported.")
 
-    # No DLPack yet, so need to construct a new ndarray from the data pointer
-    # and size in the buffer plus the dtype on the column
+    # TODO: No DLPack yet, so need to construct a new ndarray from the data pointer
+    # and size in the buffer plus the dtype on the column.
Use DLPack as NumPy supports + # it since https://github.com/numpy/numpy/pull/19083 ctypes_type = np.ctypeslib.as_ctypes_type(column_dtype) data_pointer = ctypes.cast( buffer.ptr + (offset * bit_width // 8), ctypes.POINTER(ctypes_type) @@ -367,10 +398,9 @@ def bitmask_to_bool_ndarray( ------- np.ndarray[bool] """ - if first_byte_offset > 8: - raise ValueError( - f"First byte offset can't be more than 8, met: {first_byte_offset}" - ) + bytes_to_skip = first_byte_offset // 8 + bitmask = bitmask[bytes_to_skip:] + first_byte_offset = first_byte_offset % 8 bool_mask = np.zeros(mask_length, dtype=bool) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py index 7261d3d1e39..cc7cc1dedfa 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/dataframe/dataframe.py @@ -2061,6 +2061,7 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): A dataframe object following the dataframe exchange protocol specification. """ if self._has_unsupported_data: + ErrorMessage.default_to_pandas(message="`__dataframe__`") pd_df = self.to_pandas() if hasattr(pd_df, "__dataframe__"): return pd_df.__dataframe__() @@ -2106,6 +2107,7 @@ def from_dataframe(cls, df: "ProtocolDataframe") -> "OmnisciOnNativeDataframe": # TODO: build a PyArrow table instead of a pandas DataFrame from the protocol object # as it's possible to do zero-copy with `cls.from_arrow` + ErrorMessage.default_to_pandas(message="`from_dataframe`") pd_df = from_dataframe_to_pandas(df) return cls.from_pandas(pd_df) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py index 40b980df8bc..37c4c5221af 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -22,14 +22,17 @@ from modin.core.dataframe.base.exchange.dataframe_protocol.utils import ( DTypeKind, ColumnNullType, + ArrowCTypes, + Edianness, pandas_dtype_to_arrow_c, + raise_copy_alert, ) from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( ProtocolColumn, ) from modin.utils import _inherit_docstrings from .buffer import OmnisciProtocolBuffer -from .utils import arrow_dtype_to_arrow_c +from .utils import arrow_dtype_to_arrow_c, arrow_types_map @_inherit_docstrings(ProtocolColumn) @@ -76,16 +79,21 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]: dtype = self._pandas_dtype if pandas.api.types.is_bool_dtype(dtype): - return (DTypeKind.BOOL, 1, pandas_dtype_to_arrow_c(np.dtype("bool")), "=") + return (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Edianness.NATIVE) elif pandas.api.types.is_datetime64_dtype( dtype ) or pandas.api.types.is_categorical_dtype(dtype): # For these types we have to use internal arrow dtype to get proper metadata return self._dtype_from_pyarrow(self._arrow_dtype) elif pandas.api.types.is_string_dtype(dtype): - return (DTypeKind.STRING, 8, pandas_dtype_to_arrow_c(dtype), "=") + return ( + DTypeKind.STRING, + 8, + pandas_dtype_to_arrow_c(dtype), + Edianness.NATIVE, + ) else: - 
return self._dtype_from_primitive_pandas(dtype) + return self._dtype_from_primitive_numpy(dtype) def _dtype_from_pyarrow(self, dtype): """ @@ -119,17 +127,19 @@ def _dtype_from_pyarrow(self, dtype): bit_width = dtype.bit_width if kind is not None: - return (kind, bit_width, arrow_dtype_to_arrow_c(dtype), "=") + return (kind, bit_width, arrow_dtype_to_arrow_c(dtype), Edianness.NATIVE) else: - return self._dtype_from_primitive_pandas(np.dtype(dtype.to_pandas_dtype())) + return self._dtype_from_primitive_numpy(np.dtype(dtype.to_pandas_dtype())) - def _dtype_from_primitive_pandas(self, dtype) -> Tuple[DTypeKind, int, str, str]: + def _dtype_from_primitive_numpy( + self, dtype: np.dtype + ) -> Tuple[DTypeKind, int, str, str]: """ Build protocol dtype from primitive pandas dtype. Parameters ---------- - dtype : {np.int, np.uint, np.float, np.bool} + dtype : np.dtype Data type to convert from. Returns @@ -176,11 +186,13 @@ def describe_categorical(self) -> Dict[str, Any]: col = self._pyarrow_table.column(0) if len(col.chunks) > 1: if not self._col._allow_copy: - raise RuntimeError("Copy required but 'allow_copy=False'") + raise_copy_alert( + copy_reason="physical chunks combining due to contiguous buffer materialization" + ) col = col.combine_chunks() col = col.chunks[0] - mapping = {index: value for index, value in enumerate(col.dictionary.tolist())} + mapping = dict(enumerate(col.dictionary.tolist())) return { "is_ordered": ordered, @@ -270,7 +282,9 @@ def _materialize_actual_buffers(self): """ if self.num_chunks() != 1: if not self._col._allow_copy: - raise RuntimeError("Copy required with 'allow_copy=False' flag") + raise_copy_alert( + copy_reason="physical chunks combining due to contiguous buffer materialization" + ) self._combine_chunks() external_dtype = self.dtype @@ -340,7 +354,7 @@ def _get_data_buffer( ) return ( - # According to the Arrow's memory layout, the validity buffer is always present + # According to the Arrow's memory layout, the data buffer is always present # at the last position of `.buffers()`: # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout OmnisciProtocolBuffer(arr.buffers()[-1], buff_size), @@ -368,13 +382,13 @@ def _get_validity_buffer( # https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout validity_buffer = arr.buffers()[0] if validity_buffer is None: - return validity_buffer + return None # If exist, validity buffer is always a bit-mask. data_size = self._get_buffer_size(bit_width=1) return ( OmnisciProtocolBuffer(validity_buffer, data_size), - (DTypeKind.BOOL, 1, "b", "="), + (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Edianness.NATIVE), ) def _get_offsets_buffer( @@ -405,7 +419,7 @@ def _get_offsets_buffer( offset_buff = buffs[1] # According to Arrow's data layout, the offset buffer type is "int32" - dtype = self._dtype_from_primitive_pandas(np.dtype("int32")) + dtype = self._dtype_from_primitive_numpy(np.dtype("int32")) return ( OmnisciProtocolBuffer( offset_buff, @@ -427,25 +441,10 @@ def _propagate_dtype(self, dtype: Tuple[DTypeKind, int, str, str]): Data type conforming protocol dtypes format to cast underlying PyArrow table. 
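As a side note on the `dict(enumerate(...))` trick used by `describe_categorical` above, a tiny PyArrow illustration with a hypothetical column:

    import pyarrow as pa

    col = pa.array(["low", "high", "low"]).dictionary_encode()
    # Maps each dictionary code to its category value
    mapping = dict(enumerate(col.dictionary.tolist()))
    assert mapping == {0: "low", 1: "high"}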
""" if not self._col._allow_copy: - raise RuntimeError("Copy required with 'allow_copy=False' flag") - - arrow_types_map = { - DTypeKind.BOOL: {8: pa.bool_()}, - DTypeKind.INT: { - 8: pa.int8(), - 16: pa.int16(), - 32: pa.int32(), - 64: pa.int64(), - }, - DTypeKind.UINT: { - 8: pa.uint8(), - 16: pa.uint16(), - 32: pa.uint32(), - 64: pa.uint64(), - }, - DTypeKind.FLOAT: {16: pa.float16(), 32: pa.float32(), 64: pa.float64()}, - DTypeKind.STRING: {8: pa.string()}, - } + raise_copy_alert( + copy_reason="casting to align pandas and PyArrow data types" + ) + kind, bit_width, format_str, _ = dtype arrow_type = None diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py index 4a686e1bac9..98389ca2e44 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py @@ -33,6 +33,7 @@ UnionNode, ) from .column import OmnisciProtocolColumn +from .utils import raise_copy_alert_if_materialize @_inherit_docstrings(ProtocolDataframe) @@ -67,22 +68,24 @@ def __init__( self._allow_copy = allow_copy @property + @raise_copy_alert_if_materialize def metadata(self) -> Dict[str, Any]: # TODO: as the frame's index is stored as a separate column inside PyArrow table # we may want to return the column's name here instead of materialized index. # This will require the internal index column to be visible in the protocol's column # accessor methods. - self._maybe_raise_if_materialize() return {"modin.index": self._df.index} def num_columns(self) -> int: return len(self._df.columns) + @raise_copy_alert_if_materialize def num_rows(self) -> int: - self._maybe_raise_if_materialize() return len(self._df.index) def num_chunks(self) -> int: + # `._ chunk_slices` describe chunk offsets (start-stop indices of the chunks) + # meaning that there are actually `len(self._chunk_slices) - 1` amount of chunks return len(self._chunk_slices) - 1 __chunk_slices = None @@ -132,11 +135,6 @@ def _chunk_slices(self) -> np.ndarray: return self.__chunk_slices - def _maybe_raise_if_materialize(self): - """Raise a ``RuntimeError`` if the way of retrieving the data violates the ``allow_copy`` flag.""" - if not self._allow_copy and not self._is_zero_copy_possible: - raise RuntimeError("Copy required with 'allow_copy=False'") - __is_zero_copy_possible = None @property @@ -196,6 +194,7 @@ def _is_zero_copy_arrow_op(cls, op) -> bool: ) @property + @raise_copy_alert_if_materialize def _pyarrow_table(self) -> pa.Table: """ Get PyArrow table representing the DataFrame. 
@@ -204,8 +203,6 @@ def _pyarrow_table(self) -> pa.Table:
         -------
         pyarrow.Table
         """
-        self._maybe_raise_if_materialize()
-
         if not self._df._has_arrow_table():
             self._df._execute()
 
@@ -214,8 +211,7 @@ def _pyarrow_table(self) -> pa.Table:
         return at
 
     def column_names(self) -> Iterable[str]:
-        for col in self._df.columns:
-            yield col
+        return self._df.columns
 
     def get_column(self, i: int) -> OmnisciProtocolColumn:
         return OmnisciProtocolColumn(
@@ -268,7 +264,7 @@ def select_columns_by_name(
     def get_chunks(
         self, n_chunks: Optional[int] = None
     ) -> Iterable["OmnisciProtocolDataframe"]:
-        if n_chunks is None:
+        if n_chunks is None or n_chunks == self.num_chunks():
             return self._yield_chunks(self._chunk_slices)
 
         if n_chunks % self.num_chunks() != 0:
@@ -281,31 +277,55 @@ def get_chunks(
                 "The passed `n_chunks` value is bigger than the amount of rows in the frame."
             )
 
-        extra_chunks = n_chunks - self.num_chunks()
-        # `._chunk_slices` is a cached property, we don't want to modify the property's
-        # array inplace, so doing a copy here
-        subdivided_slices = self._chunk_slices.copy()
-
-        # The subdividing behavior is a bit different from "subdividing each chunk",
-        # instead it subdivides the biggest chunks first, so overall chunking be as
-        # equal as possible
-        for _ in range(extra_chunks):
-            # 1. Find the biggest chunk
-            # 2. Split it in the middle
-            biggest_chunk_idx = np.argmax(np.diff(subdivided_slices))
-            new_chunk_offset = (
-                subdivided_slices[biggest_chunk_idx + 1]
-                - subdivided_slices[biggest_chunk_idx]
-            ) // 2
-            ErrorMessage.catch_bugs_and_request_email(
-                failure_condition=new_chunk_offset == 0,
-                extra_log="No more chunks to subdivide",
-            )
-            subdivided_slices = np.insert(
-                subdivided_slices,
-                biggest_chunk_idx + 1,
-                subdivided_slices[biggest_chunk_idx] + new_chunk_offset,
-            )
+        extra_chunks = 0
+        to_subdivide = n_chunks // self.num_chunks()
+        subdivided_slices = []
+
+        # The loop subdivides each chunk into `to_subdivide` chunks if possible
+        for i in range(len(self._chunk_slices) - 1):
+            chunk_length = self._chunk_slices[i + 1] - self._chunk_slices[i]
+            step = chunk_length // to_subdivide
+            if step == 0:
+                # Bad case: we're requested to subdivide a chunk into more pieces than it has rows in it.
+                # This means that there is a bigger chunk that we can subdivide into more pieces to get
+                # the required number of chunks. For now, subdivide the current chunk into the maximum
+                # possible number of pieces (TODO: maybe we should subdivide it into `sqrt(chunk_length)`
+                # chunks to make this more optimal?) and write the number of missing pieces into the
+                # `extra_chunks` variable to extract them from bigger chunks later.
+                step = 1
+                extra_chunks += to_subdivide - chunk_length
+                to_subdivide_chunk = chunk_length
+            else:
+                to_subdivide_chunk = to_subdivide
+
+            for j in range(to_subdivide_chunk):
+                subdivided_slices.append(self._chunk_slices[i] + step * j)
+        subdivided_slices.append(self._chunk_slices[-1])
+
+        if extra_chunks != 0:
+            # Make more pieces out of the big chunks to get the required `n_chunks`
+            for _ in range(extra_chunks):
+                # 1. Find the biggest chunk
+                # 2. Split it in the middle
+                biggest_chunk_idx = np.argmax(np.diff(subdivided_slices))
+                new_chunk_offset = (
+                    subdivided_slices[biggest_chunk_idx + 1]
+                    - subdivided_slices[biggest_chunk_idx]
+                ) // 2
+                ErrorMessage.catch_bugs_and_request_email(
+                    failure_condition=new_chunk_offset == 0,
+                    extra_log="No more chunks to subdivide",
+                )
+                subdivided_slices = np.insert(
+                    subdivided_slices,
+                    biggest_chunk_idx + 1,
+                    subdivided_slices[biggest_chunk_idx] + new_chunk_offset,
+                )
+
+        ErrorMessage.catch_bugs_and_request_email(
+            failure_condition=len(subdivided_slices) != n_chunks + 1,
+            extra_log=f"Chunks were incorrectly split: {len(subdivided_slices)} != {n_chunks + 1}",
+        )
 
         return self._yield_chunks(subdivided_slices)
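Note (illustrative sketch, not part of the patch): the subdivision scheme implemented in
`get_chunks` above can be condensed to the following standalone function; the input
values are hypothetical:

    import numpy as np

    def subdivide(chunk_slices, n_chunks):
        # Split each chunk into `n_chunks // num_chunks` pieces; chunks that are
        # too small to be split donate their deficit to the biggest chunks.
        num_chunks = len(chunk_slices) - 1
        to_subdivide = n_chunks // num_chunks
        slices, extra_chunks = [], 0
        for i in range(num_chunks):
            chunk_length = chunk_slices[i + 1] - chunk_slices[i]
            step = chunk_length // to_subdivide
            pieces = to_subdivide
            if step == 0:  # can't split finer than row-by-row
                step, pieces = 1, chunk_length
                extra_chunks += to_subdivide - chunk_length
            slices += [chunk_slices[i] + step * j for j in range(pieces)]
        slices.append(chunk_slices[-1])
        for _ in range(extra_chunks):  # split the biggest chunks in the middle
            i = int(np.argmax(np.diff(slices)))
            slices.insert(i + 1, slices[i] + (slices[i + 1] - slices[i]) // 2)
        return slices

    subdivide([0, 1, 11], n_chunks=4)  # [0, 1, 3, 6, 11]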
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
index f51a932d9c5..77bfd2d5a75 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
@@ -15,13 +15,35 @@
 
 import pyarrow as pa
 import numpy as np
+import functools
 
 from modin.core.dataframe.base.exchange.dataframe_protocol.utils import (
     ArrowCTypes,
     pandas_dtype_to_arrow_c,
+    raise_copy_alert,
+    DTypeKind,
 )
 
 
+arrow_types_map = {
+    DTypeKind.BOOL: {8: pa.bool_()},
+    DTypeKind.INT: {
+        8: pa.int8(),
+        16: pa.int16(),
+        32: pa.int32(),
+        64: pa.int64(),
+    },
+    DTypeKind.UINT: {
+        8: pa.uint8(),
+        16: pa.uint16(),
+        32: pa.uint32(),
+        64: pa.uint64(),
+    },
+    DTypeKind.FLOAT: {16: pa.float16(), 32: pa.float32(), 64: pa.float64()},
+    DTypeKind.STRING: {8: pa.string()},
+}
+
+
 def arrow_dtype_to_arrow_c(dtype: pa.DataType) -> str:
     """
     Represent PyArrow `dtype` as a format string in Apache Arrow C notation.
@@ -46,8 +68,30 @@ def arrow_dtype_to_arrow_c(dtype: pa.DataType) -> str:
         # TODO: for some reason `time32` type doesn't have a `unit` attribute,
         # always return "s" for now.
         # return ArrowCTypes.TIME.format(resolution=dtype.unit[:1])
-        return ArrowCTypes.TIME.format(resolution="s")
+        return ArrowCTypes.TIME.format(resolution=getattr(dtype, "unit", "s")[:1])
     elif pa.types.is_dictionary(dtype):
         return arrow_dtype_to_arrow_c(dtype.index_type)
     else:
         return pandas_dtype_to_arrow_c(np.dtype(dtype.to_pandas_dtype()))
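Note (illustrative sketch, not part of the patch): with the module-level `arrow_types_map`
added above, `_propagate_dtype` can resolve a PyArrow type from the protocol's
(kind, bit-width) pair with a plain dictionary lookup:

    from modin.core.dataframe.base.exchange.dataframe_protocol.utils import DTypeKind
    from modin.experimental.core.execution.native.implementations.omnisci_on_native.exchange.dataframe_protocol.utils import (
        arrow_types_map,
    )

    kind, bit_width = DTypeKind.INT, 32
    arrow_type = arrow_types_map.get(kind, {}).get(bit_width)  # pa.int32()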
+
+
+def raise_copy_alert_if_materialize(fn):
+    """
+    Do docstring.
+
+    Parameters
+    ----------
+    fn : callable
+
+    Returns
+    -------
+    callable
+    """
+
+    @functools.wraps(fn)
+    def method(self, *args, **kwargs):
+        if not self._allow_copy and not self._is_zero_copy_possible:
+            raise_copy_alert()
+        return fn(self, *args, **kwargs)
+
+    return method
diff --git a/modin/experimental/core/storage_formats/omnisci/query_compiler.py b/modin/experimental/core/storage_formats/omnisci/query_compiler.py
index 7abdda1c15a..a674d1c9561 100644
--- a/modin/experimental/core/storage_formats/omnisci/query_compiler.py
+++ b/modin/experimental/core/storage_formats/omnisci/query_compiler.py
@@ -210,7 +210,7 @@ def to_dataframe(self, nan_as_null: bool = False, allow_copy: bool = True):
 
     @classmethod
     def from_dataframe(cls, df, data_cls):
-        return data_cls.from_dataframe(df)
+        return cls(data_cls.from_dataframe(df))
 
     # END Dataframe exchange protocol
 
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index cb52312b5ee..104a542ceb9 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -139,6 +139,40 @@ def test_export_unaligned_at_chunks(data_has_nulls):
     df_equals(md_df, exported_df)
 
 
+@pytest.mark.parametrize("data_has_nulls", [True, False])
+def test_export_bad_chunking(data_has_nulls):
+    """
+    Test ``.get_chunks(n_chunks)`` when the internal PyArrow table is 'badly chunked'.
+
+    The setup for the test is a PyArrow table having one of its chunks consist of a single row,
+    meaning that the chunk can't be subdivided.
+    """
+    data = get_data_of_all_types(has_nulls=data_has_nulls, exclude_dtypes=["category"])
+    pd_df = pandas.DataFrame(data)
+    pd_chunks = (pd_df.iloc[:1], pd_df.iloc[1:])
+
+    chunked_at = pa.concat_tables([pa.Table.from_pandas(pd_df) for pd_df in pd_chunks])
+    md_df = from_arrow(chunked_at)
+    assert (
+        len(md_df._query_compiler._modin_frame._partitions[0][0].get().column(0).chunks)
+        == md_df.__dataframe__().num_chunks()
+        == 2
+    )
+    # Meaning that we can't subdivide the first chunk
+    np.testing.assert_array_equal(
+        md_df.__dataframe__()._chunk_slices, [0, 1, len(pd_df)]
+    )
+
+    exported_df = export_frame(md_df, n_chunks=2)
+    df_equals(md_df, exported_df)
+
+    exported_df = export_frame(md_df, n_chunks=4)
+    df_equals(md_df, exported_df)
+
+    exported_df = export_frame(md_df, n_chunks=40)
+    df_equals(md_df, exported_df)
+
+
 def test_export_when_delayed_computations():
     """
     Test that export works properly when OmnisciOnNative has delayed computations.
@@ -169,7 +203,22 @@ def test_simple_import(data_has_nulls):
     data = get_data_of_all_types(data_has_nulls)
 
     md_df_producer = pd.DataFrame(data)
-    md_df_consumer = from_dataframe(md_df_producer)
+    # Our configuration in pytest.ini requires that we explicitly catch all
+    # instances of defaulting to pandas; this one raises a warning on `.from_dataframe`
+    with warns_that_defaulting_to_pandas():
+        md_df_consumer = from_dataframe(md_df_producer)
+
+    # TODO: the following assertions verify that `from_dataframe` doesn't return
+    # the same object untouched due to optimization branching. It actually should
+    # do so, but the logic is not implemented yet, so the assertions pass for now.
+    # The producer's type should be replaced with a different one in this test once
+    # we have some other implementation of the protocol, as the assertions may
+    # start failing then.
+    assert md_df_producer is not md_df_consumer
+    assert (
+        md_df_producer._query_compiler._modin_frame
+        is not md_df_consumer._query_compiler._modin_frame
+    )
 
     df_equals(md_df_producer, md_df_consumer)
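Note (illustrative sketch, not part of the patch): the consumer path exercised by
`test_simple_import` is a plain round trip through the protocol; the import location
of `from_dataframe` is assumed here:

    import modin.pandas as pd
    from modin.pandas.utils import from_dataframe  # assumed import location

    producer = pd.DataFrame({"a": [1, 2, 3]})
    # The consumer rebuilds its own frame from `producer.__dataframe__()`,
    # currently defaulting to pandas under the hood.
    consumer = from_dataframe(producer)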
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/utils.py b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
index 0a830c0d579..ccc713a70b8 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/utils.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/utils.py
@@ -19,6 +19,7 @@
 
 from modin.core.dataframe.pandas.exchange.dataframe_protocol.from_dataframe import (
     from_dataframe_to_pandas,
+    protocol_df_chunk_to_pandas,
 )
 from modin.experimental.core.execution.native.implementations.omnisci_on_native.test.utils import (
     ForceOmnisciImport,
@@ -69,15 +70,54 @@ def export_frame(md_df, from_omnisci=False, **kwargs):
     pandas.DataFrame
     """
     if not from_omnisci:
-        return from_dataframe_to_pandas(md_df, **kwargs)
+        return from_dataframe_to_pandas_assert_chunking(md_df, **kwargs)
 
     with ForceOmnisciImport(md_df) as instance:
         md_df_exported = instance.export_frames()[0]
-        exported_df = from_dataframe_to_pandas(md_df_exported, **kwargs)
+        exported_df = from_dataframe_to_pandas_assert_chunking(md_df_exported, **kwargs)
 
     return exported_df
 
 
+def from_dataframe_to_pandas_assert_chunking(df, n_chunks=None, **kwargs):
+    """
+    Build a ``pandas.DataFrame`` from a `__dataframe__` object, splitting it into `n_chunks`.
+
+    The function asserts that `df` was split into exactly `n_chunks` before converting the chunks to pandas.
+
+    Parameters
+    ----------
+    df : DataFrame
+        Object supporting the exchange protocol, i.e. having a `__dataframe__` method.
+    n_chunks : int, optional
+        Number of chunks to split `df` into.
+
+    Returns
+    -------
+    pandas.DataFrame
+    """
+    if n_chunks is None:
+        return from_dataframe_to_pandas(df, n_chunks=n_chunks, **kwargs)
+
+    protocol_df = df.__dataframe__()
+    chunks = list(protocol_df.get_chunks(n_chunks))
+    assert len(chunks) == n_chunks
+
+    pd_chunks = [None] * len(chunks)
+    for i in range(len(chunks)):
+        pd_chunks[i] = protocol_df_chunk_to_pandas(chunks[i], **kwargs)
+
+    pd_df = pandas.concat(pd_chunks, axis=0, ignore_index=True)
+
+    index_obj = protocol_df.metadata.get(
+        "modin.index", protocol_df.metadata.get("pandas.index", None)
+    )
+    if index_obj is not None:
+        pd_df.index = index_obj
+
+    return pd_df
+
+
 def get_data_of_all_types(
     has_nulls=False, exclude_dtypes=None, include_dtypes=None
 ) -> Dict[str, np.ndarray]:

From eebcbe6a426f6fbb26d9831134c4839b5ba1e29b Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Tue, 22 Mar 2022 16:08:46 +0300
Subject: [PATCH 30/33] Fix date parsing

Signed-off-by: Dmitry Chigarev
---
 .../pandas/exchange/dataframe_protocol/from_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py
index 0aa08258832..07e85dd27b7 100644
--- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py
+++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py
@@ -305,7 +305,7 @@ def parse_format_str(format_str, data):
         return data
 
     # date 'td{Days/Ms}'
-    date_meta = re.findall(r"td([Dm])")
+    date_meta = re.findall(r"td([Dm])", format_str)
     if date_meta:
         unit = date_meta[0]
         if unit == "D":

From bae7b11d74875bce68d506177ce330c27e704ca0 Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Wed, 23 Mar 2022 14:20:17 +0300
Subject: [PATCH 31/33] Add more
comments&doc-strings Signed-off-by: Dmitry Chigarev --- .../dataframe_protocol/from_dataframe.py | 16 +++++----- .../exchange/dataframe_protocol/column.py | 4 ++- .../exchange/dataframe_protocol/dataframe.py | 31 +++++++++++++++++++ .../exchange/dataframe_protocol/utils.py | 3 +- .../omnisci/test_protocol.py | 4 +-- 5 files changed, 47 insertions(+), 11 deletions(-) diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index 07e85dd27b7..c6653835779 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -163,7 +163,7 @@ def categorical_column_to_series(col: ProtocolColumn) -> Tuple[pandas.Series, An if not is_dict: raise NotImplementedError("Non-dictionary categoricals not supported yet") - categories = np.array(list(mapping.values())) + categories = np.array(tuple(mapping.values())) buffers = col.get_buffers() codes_buff, codes_dtype = buffers["data"] @@ -223,7 +223,9 @@ def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: # Retrieve the offsets buffer containing the index offsets demarcating the beginning and end of each string offset_buff, offset_dtype = buffers["offsets"] - # As the offsets buffer size is greater than the data size do `col.size + 1` here + # Offsets buffer contains start-stop positions of strings in the data buffer, + # meaning that it has more elements than in the data buffer, do `col.size + 1` here + # to pass a proper offsets buffer size offsets = buffer_to_ndarray( offset_buff, offset_dtype, col.offset, length=col.size + 1 ) @@ -292,9 +294,9 @@ def datetime_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: def parse_format_str(format_str, data): """Parse datetime `format_str` to interpret the `data`.""" # timestamp 'ts{unit}:tz' - timestamp_meta = re.findall(r"ts([smun]):(.*)", format_str) + timestamp_meta = re.match(r"ts([smun]):(.*)", format_str) if timestamp_meta: - unit, tz = timestamp_meta[0] + unit, tz = timestamp_meta.group(1), timestamp_meta.group(2) if tz != "": raise NotImplementedError("Timezones are not supported yet") if unit != "s": @@ -305,9 +307,9 @@ def parse_format_str(format_str, data): return data # date 'td{Days/Ms}' - date_meta = re.findall(r"td([Dm])", format_str) + date_meta = re.match(r"td([Dm])", format_str) if date_meta: - unit = date_meta[0] + unit = date_meta.group(1) if unit == "D": # NumPy doesn't support DAY unit, so converting days to seconds # (converting to uint64 to avoid overflow) @@ -400,7 +402,7 @@ def bitmask_to_bool_ndarray( """ bytes_to_skip = first_byte_offset // 8 bitmask = bitmask[bytes_to_skip:] - first_byte_offset = first_byte_offset % 8 + first_byte_offset %= 8 bool_mask = np.zeros(mask_length, dtype=bool) diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py index 37c4c5221af..edd1ea8e9a8 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -83,7 +83,9 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]: elif pandas.api.types.is_datetime64_dtype( dtype ) or 
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
index 37c4c5221af..edd1ea8e9a8 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py
@@ -83,7 +83,9 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]:
         elif pandas.api.types.is_datetime64_dtype(
             dtype
         ) or pandas.api.types.is_categorical_dtype(dtype):
-            # For these types we have to use internal arrow dtype to get proper metadata
+            # We can't fully describe an actual underlying type's metadata from the pandas dtype,
+            # so use `._arrow_dtype` for the missing parts of the information, like datetime
+            # resolution, dictionary metadata, etc.
             return self._dtype_from_pyarrow(self._arrow_dtype)
         elif pandas.api.types.is_string_dtype(dtype):
             return (
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
index 98389ca2e44..b2e5aab84bd 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
@@ -264,6 +264,37 @@ def select_columns_by_name(
     def get_chunks(
         self, n_chunks: Optional[int] = None
     ) -> Iterable["OmnisciProtocolDataframe"]:
+        """
+        Return an iterator yielding the chunks.
+
+        If `n_chunks` is not specified, yields the chunks that the data is physically stored in.
+        If given, `n_chunks` must be a multiple of ``self.num_chunks()``, meaning that each physical
+        chunk is going to be split into ``n_chunks // self.num_chunks()`` virtual chunks that are
+        backed by the same physical buffers but have different ``.offset`` values.
+
+        Parameters
+        ----------
+        n_chunks : int, optional
+            Number of chunks to yield.
+
+        Returns
+        -------
+        Iterable["OmnisciProtocolDataframe"]
+            An iterator yielding ``OmnisciProtocolDataframe`` objects.
+
+        Raises
+        ------
+        ``RuntimeError`` if ``n_chunks`` is not a multiple of ``self.num_chunks()`` or if ``n_chunks``
+        is greater than ``self.num_rows()``.
+
+        Notes
+        -----
+        Variable-sized columns (i.e. strings) are special-cased when virtually chunked.
+        In order to make the offsets buffer be valid for each virtual chunk, the data buffer shouldn't be
+        chunked at all, meaning that ``.get_buffers()["data"]`` always returns a buffer owning the whole
+        physical chunk and the consumer must always interpret it with zero offset (validity and offsets
+        buffers must be interpret respecting the column's offset value).
+        """
         if n_chunks is None or n_chunks == self.num_chunks():
             return self._yield_chunks(self._chunk_slices)
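Note (illustrative sketch, not part of the patch): a consumer-side walk over virtual
chunks, assuming `md_frame` is a hypothetical `OmnisciOnNativeDataframe` instance:

    protocol_df = md_frame.__dataframe__(allow_copy=True)["dataframe"]
    n_chunks = 2 * protocol_df.num_chunks()  # must be a multiple of `num_chunks()`
    for chunk in protocol_df.get_chunks(n_chunks=n_chunks):
        col = chunk.get_column(0)
        # The "data" buffer owns the whole physical chunk; validity and offsets
        # buffers are read respecting `col.offset`.
        buffers = col.get_buffers()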
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
index 77bfd2d5a75..17729141fe5 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
@@ -77,11 +77,12 @@ def arrow_dtype_to_arrow_c(dtype: pa.DataType) -> str:
 
 def raise_copy_alert_if_materialize(fn):
     """
-    Do docstring.
+    Decorate ``OmnisciProtocolDataframe`` method with a check raising a copy-alert in case of disability to retrieve the data zero-copy.
 
     Parameters
     ----------
     fn : callable
+        ``OmnisciProtocolDataframe`` method.
 
     Returns
     -------
diff --git a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
index 104a542ceb9..21142716a2e 100644
--- a/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
+++ b/modin/test/exchange/dataframe_protocol/omnisci/test_protocol.py
@@ -140,9 +140,9 @@ def test_export_unaligned_at_chunks(data_has_nulls):
 
 
 @pytest.mark.parametrize("data_has_nulls", [True, False])
-def test_export_bad_chunking(data_has_nulls):
+def test_export_indivisible_chunking(data_has_nulls):
     """
-    Test ``.get_chunks(n_chunks)`` when the internal PyArrow table is 'badly chunked'.
+    Test ``.get_chunks(n_chunks)`` when the internal PyArrow table is 'indivisibly chunked'.
 
     The setup for the test is a PyArrow table having one of its chunks consist of a single row,
     meaning that the chunk can't be subdivided.

From d8053a18be3b6fe673184bc8659df550fa53af74 Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Wed, 23 Mar 2022 14:58:23 +0300
Subject: [PATCH 32/33] Apply suggestions from code review

Co-authored-by: Vasily Litvinov
---
 .../omnisci_on_native/exchange/dataframe_protocol/dataframe.py | 2 +-
 .../omnisci_on_native/exchange/dataframe_protocol/utils.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
index b2e5aab84bd..fd796e69082 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/dataframe.py
@@ -293,7 +293,7 @@ def get_chunks(
         In order to make the offsets buffer be valid for each virtual chunk, the data buffer shouldn't be
         chunked at all, meaning that ``.get_buffers()["data"]`` always returns a buffer owning the whole
         physical chunk and the consumer must always interpret it with zero offset (validity and offsets
-        buffers must be interpret respecting the column's offset value).
+        buffers have to be interpreted respecting the column's offset value).
         """
         if n_chunks is None or n_chunks == self.num_chunks():
             return self._yield_chunks(self._chunk_slices)
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
index 17729141fe5..2ebf7221105 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/utils.py
@@ -77,7 +77,7 @@ def arrow_dtype_to_arrow_c(dtype: pa.DataType) -> str:
 
 def raise_copy_alert_if_materialize(fn):
     """
-    Decorate ``OmnisciProtocolDataframe`` method with a check raising a copy-alert in case of disability to retrieve the data zero-copy.
+    Decorate ``OmnisciProtocolDataframe`` method with a check raising a copy-alert if it's impossible to retrieve the data in a zero-copy way.
Parameters ---------- From 6b11631f4c777d8d4e23f30a9de9250f922de30f Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Wed, 23 Mar 2022 19:20:00 +0300 Subject: [PATCH 33/33] Fix spelling 'Edianness' -> 'Endianness' Signed-off-by: Dmitry Chigarev --- .../base/exchange/dataframe_protocol/utils.py | 2 +- .../exchange/dataframe_protocol/from_dataframe.py | 6 +++--- .../exchange/dataframe_protocol/column.py | 10 +++++----- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py b/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py index 87412d4a357..a83fa15cb21 100644 --- a/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py +++ b/modin/core/dataframe/base/exchange/dataframe_protocol/utils.py @@ -127,7 +127,7 @@ class ArrowCTypes: TIME = "tt{resolution}" -class Edianness: +class Endianness: """Enum indicating the byte-order of a data-type.""" LITTLE = "<" diff --git a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py index c6653835779..499168d9511 100644 --- a/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py +++ b/modin/core/dataframe/pandas/exchange/dataframe_protocol/from_dataframe.py @@ -23,7 +23,7 @@ DTypeKind, ColumnNullType, ArrowCTypes, - Edianness, + Endianness, ) from modin.core.dataframe.base.exchange.dataframe_protocol.dataframe import ( ProtocolDataframe, @@ -216,7 +216,7 @@ def string_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: DTypeKind.UINT, 8, ArrowCTypes.UINT8, - Edianness.NATIVE, + Endianness.NATIVE, ) # Specify zero offset as we don't want to chunk the string data data = buffer_to_ndarray(data_buff, data_dtype, offset=0, length=col.size) @@ -285,7 +285,7 @@ def datetime_column_to_ndarray(col: ProtocolColumn) -> Tuple[np.ndarray, Any]: DTypeKind.UINT, dtype[1], getattr(ArrowCTypes, f"UINT{dtype[1]}"), - Edianness.NATIVE, + Endianness.NATIVE, ), col.offset, col.size, diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py index edd1ea8e9a8..f7e64366522 100644 --- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py +++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/exchange/dataframe_protocol/column.py @@ -23,7 +23,7 @@ DTypeKind, ColumnNullType, ArrowCTypes, - Edianness, + Endianness, pandas_dtype_to_arrow_c, raise_copy_alert, ) @@ -79,7 +79,7 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]: dtype = self._pandas_dtype if pandas.api.types.is_bool_dtype(dtype): - return (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Edianness.NATIVE) + return (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE) elif pandas.api.types.is_datetime64_dtype( dtype ) or pandas.api.types.is_categorical_dtype(dtype): @@ -92,7 +92,7 @@ def dtype(self) -> Tuple[DTypeKind, int, str, str]: DTypeKind.STRING, 8, pandas_dtype_to_arrow_c(dtype), - Edianness.NATIVE, + Endianness.NATIVE, ) else: return self._dtype_from_primitive_numpy(dtype) @@ -129,7 +129,7 @@ def _dtype_from_pyarrow(self, dtype): bit_width = dtype.bit_width if kind is not None: - return (kind, bit_width, arrow_dtype_to_arrow_c(dtype), Edianness.NATIVE) + return (kind, bit_width, arrow_dtype_to_arrow_c(dtype), Endianness.NATIVE) else: 
return self._dtype_from_primitive_numpy(np.dtype(dtype.to_pandas_dtype())) @@ -390,7 +390,7 @@ def _get_validity_buffer( data_size = self._get_buffer_size(bit_width=1) return ( OmnisciProtocolBuffer(validity_buffer, data_size), - (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Edianness.NATIVE), + (DTypeKind.BOOL, 1, ArrowCTypes.BOOL, Endianness.NATIVE), ) def _get_offsets_buffer(