diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e8834bd509bf0..516d37a483ce4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -132,3 +132,22 @@ jobs: - name: Upload dev docs run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev if: github.event_name == 'push' + + data_manager: + name: Test experimental data manager + runs-on: ubuntu-latest + steps: + + - name: Setting conda path + run: echo "${HOME}/miniconda3/bin" >> $GITHUB_PATH + + - name: Checkout + uses: actions/checkout@v1 + + - name: Setup environment and build pandas + run: ci/setup_env.sh + + - name: Run tests + run: | + source activate pandas-dev + pytest pandas/tests/frame/methods --array-manager diff --git a/pandas/_typing.py b/pandas/_typing.py index 91cb01dac76fb..9b957ab4d0686 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -39,6 +39,7 @@ from pandas.core.generic import NDFrame # noqa: F401 from pandas.core.groupby.generic import DataFrameGroupBy, SeriesGroupBy from pandas.core.indexes.base import Index + from pandas.core.internals import ArrayManager, BlockManager from pandas.core.resample import Resampler from pandas.core.series import Series from pandas.core.window.rolling import BaseWindow @@ -159,3 +160,6 @@ ColspaceArgType = Union[ str, int, Sequence[Union[str, int]], Mapping[Hashable, Union[str, int]] ] + +# internals +Manager = Union["ArrayManager", "BlockManager"] diff --git a/pandas/conftest.py b/pandas/conftest.py index e30a55cef3166..45d545a522fc7 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -75,6 +75,19 @@ def pytest_addoption(parser): action="store_true", help="Fail if a test is skipped for missing data file.", ) + parser.addoption( + "--array-manager", + "--am", + action="store_true", + help="Use the experimental ArrayManager as default data manager.", + ) + + +def pytest_sessionstart(session): + # Note: we need to set the option here and not in 
pytest_runtest_setup below + # to ensure this is run before creating fixture data + if session.config.getoption("--array-manager"): + pd.options.mode.data_manager = "array" def pytest_runtest_setup(item): @@ -1454,3 +1467,11 @@ def indexer_si(request): Parametrize over __setitem__, iloc.__setitem__ """ return request.param + + +@pytest.fixture +def using_array_manager(request): + """ + Fixture to check if the array manager is being used. + """ + return pd.options.mode.data_manager == "array" diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index fba82ae499e90..56ef1ea28ed1b 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -483,6 +483,12 @@ def use_inf_as_na_cb(key): cf.register_option( "use_inf_as_null", False, use_inf_as_null_doc, cb=use_inf_as_na_cb ) + cf.register_option( + "data_manager", + "block", + "Internal data manager type", + validator=is_one_of_factory(["block", "array"]), + ) cf.deprecate_option( "mode.use_inf_as_null", msg=use_inf_as_null_doc, rkey="mode.use_inf_as_na" diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 344e5d6667074..36ccd0b8a2f7d 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -62,6 +62,7 @@ IndexKeyFunc, IndexLabel, Level, + Manager, PythonFuncType, Renamer, StorageOptions, @@ -137,13 +138,14 @@ ) from pandas.core.indexes.multi import MultiIndex, maybe_droplevels from pandas.core.indexing import check_bool_indexer, convert_to_index_sliceable -from pandas.core.internals import BlockManager +from pandas.core.internals import ArrayManager, BlockManager from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, init_dict, init_ndarray, masked_rec_array_to_mgr, + mgr_to_mgr, nested_data_to_arrays, reorder_arrays, sanitize_index, @@ -523,7 +525,7 @@ def __init__( if isinstance(data, DataFrame): data = data._mgr - if isinstance(data, BlockManager): + if isinstance(data, (BlockManager, ArrayManager)): if index is None and columns is None 
and dtype is None and copy is False: # GH#33357 fastpath NDFrame.__init__(self, data) @@ -601,8 +603,31 @@ def __init__( values, index, columns, dtype=values.dtype, copy=False ) + # ensure correct Manager type according to settings + manager = get_option("mode.data_manager") + mgr = mgr_to_mgr(mgr, typ=manager) + NDFrame.__init__(self, mgr) + def _as_manager(self, typ: str) -> DataFrame: + """ + Private helper function to create a DataFrame with specific manager. + + Parameters + ---------- + typ : {"block", "array"} + + Returns + ------- + DataFrame + New DataFrame using specified manager type. Is not guaranteed + to be a copy or not. + """ + new_mgr: Manager + new_mgr = mgr_to_mgr(self._mgr, typ=typ) + # fastpath of passing a manager doesn't check the option/manager class + return DataFrame(new_mgr) + # ---------------------------------------------------------------------- @property @@ -675,6 +700,8 @@ def _is_homogeneous_type(self) -> bool: ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type False """ + if isinstance(self._mgr, ArrayManager): + return len({arr.dtype for arr in self._mgr.arrays}) == 1 if self._mgr.any_extension_types: return len({block.dtype for block in self._mgr.blocks}) == 1 else: @@ -685,6 +712,8 @@ def _can_fast_transpose(self) -> bool: """ Can we transpose this DataFrame without creating any new array objects. 
""" + if isinstance(self._mgr, ArrayManager): + return False if self._mgr.any_extension_types: # TODO(EA2D) special case would be unnecessary with 2D EAs return False @@ -5506,7 +5535,7 @@ def sort_values( # type: ignore[override] ) if ignore_index: - new_data.axes[1] = ibase.default_index(len(indexer)) + new_data.set_axis(1, ibase.default_index(len(indexer))) result = self._constructor(new_data) if inplace: @@ -6051,7 +6080,10 @@ def _dispatch_frame_op(self, right, func, axis: Optional[int] = None): # fails in cases with empty columns reached via # _frame_arith_method_with_reindex - bm = self._mgr.operate_blockwise(right._mgr, array_op) + # TODO operate_blockwise expects a manager of the same type + bm = self._mgr.operate_blockwise( + right._mgr, array_op # type: ignore[arg-type] + ) return type(self)(bm) elif isinstance(right, Series) and axis == 1: @@ -8894,11 +8926,11 @@ def func(values: np.ndarray): # We only use this in the case that operates on self.values return op(values, axis=axis, skipna=skipna, **kwds) - def blk_func(values): + def blk_func(values, axis=1): if isinstance(values, ExtensionArray): return values._reduce(name, skipna=skipna, **kwds) else: - return op(values, axis=1, skipna=skipna, **kwds) + return op(values, axis=axis, skipna=skipna, **kwds) def _get_data() -> DataFrame: if filter_type is None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0daeed0e393e6..9e3f6f8e36175 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -45,6 +45,7 @@ IndexLabel, JSONSerializable, Level, + Manager, NpDtype, Renamer, StorageOptions, @@ -102,7 +103,7 @@ RangeIndex, ensure_index, ) -from pandas.core.internals import BlockManager +from pandas.core.internals import ArrayManager, BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.shared_docs import _shared_docs @@ -179,7 +180,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): ) 
_metadata: List[str] = [] _is_copy = None - _mgr: BlockManager + _mgr: Manager _attrs: Dict[Optional[Hashable], Any] _typ: str @@ -188,7 +189,7 @@ class NDFrame(PandasObject, SelectionMixin, indexing.IndexingMixin): def __init__( self, - data: BlockManager, + data: Manager, copy: bool = False, attrs: Optional[Mapping[Optional[Hashable], Any]] = None, ): @@ -207,7 +208,7 @@ def __init__( @classmethod def _init_mgr( cls, mgr, axes, dtype: Optional[Dtype] = None, copy: bool = False - ) -> BlockManager: + ) -> Manager: """ passed a manager and a axes dict """ for a, axe in axes.items(): if axe is not None: @@ -220,7 +221,13 @@ def _init_mgr( mgr = mgr.copy() if dtype is not None: # avoid further copies if we can - if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: + if ( + isinstance(mgr, BlockManager) + and len(mgr.blocks) == 1 + and mgr.blocks[0].values.dtype == dtype + ): + pass + else: mgr = mgr.astype(dtype=dtype) return mgr @@ -4544,11 +4551,11 @@ def sort_index( new_data = self._mgr.take(indexer, axis=baxis, verify=False) # reconstruct axis if needed - new_data.axes[baxis] = new_data.axes[baxis]._sort_levels_monotonic() + new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic()) if ignore_index: axis = 1 if isinstance(self, ABCDataFrame) else 0 - new_data.axes[axis] = ibase.default_index(len(indexer)) + new_data.set_axis(axis, ibase.default_index(len(indexer))) result = self._constructor(new_data) @@ -5521,6 +5528,8 @@ def _protect_consolidate(self, f): Consolidate _mgr -- if the blocks have changed, then clear the cache """ + if isinstance(self._mgr, ArrayManager): + return f() blocks_before = len(self._mgr.blocks) result = f() if len(self._mgr.blocks) != blocks_before: @@ -5710,11 +5719,13 @@ def _to_dict_of_blocks(self, copy: bool_t = True): Return a dict of dtype -> Constructor Types that each is a homogeneous dtype. 
- Internal ONLY + Internal ONLY - only works for BlockManager """ + mgr = self._mgr + mgr = cast(BlockManager, mgr) return { k: self._constructor(v).__finalize__(self) - for k, v, in self._mgr.to_dict(copy=copy).items() + for k, v, in mgr.to_dict(copy=copy).items() } def astype( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 174a2f4052b06..c561204c1c125 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1086,10 +1086,12 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike: # in the operation. We un-split here. result = result._consolidate() assert isinstance(result, (Series, DataFrame)) # for mypy - assert len(result._mgr.blocks) == 1 + mgr = result._mgr + assert isinstance(mgr, BlockManager) + assert len(mgr.blocks) == 1 # unwrap DataFrame to get array - result = result._mgr.blocks[0].values + result = mgr.blocks[0].values return result def blk_func(bvalues: ArrayLike) -> ArrayLike: diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index fbccac1c2af67..e71143224556b 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,3 +1,5 @@ +from pandas.core.internals.array_manager import ArrayManager +from pandas.core.internals.base import DataManager from pandas.core.internals.blocks import ( # io.pytables, io.packers Block, BoolBlock, @@ -35,6 +37,8 @@ "TimeDeltaBlock", "safe_reshape", "make_block", + "DataManager", + "ArrayManager", "BlockManager", "SingleBlockManager", "concatenate_block_managers", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py new file mode 100644 index 0000000000000..134bf59ed7f9c --- /dev/null +++ b/pandas/core/internals/array_manager.py @@ -0,0 +1,892 @@ +""" +Experimental manager based on storing a collection of 1D arrays +""" +from typing import TYPE_CHECKING, Any, Callable, List, Optional, Tuple, TypeVar, Union + +import numpy as np + +from pandas._libs import 
algos as libalgos, lib +from pandas._typing import ArrayLike, DtypeObj, Hashable +from pandas.util._validators import validate_bool_kwarg + +from pandas.core.dtypes.cast import find_common_type, infer_dtype_from_scalar +from pandas.core.dtypes.common import ( + is_bool_dtype, + is_dtype_equal, + is_extension_array_dtype, + is_numeric_dtype, +) +from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna + +import pandas.core.algorithms as algos +from pandas.core.arrays import ExtensionArray, PandasDtype +from pandas.core.arrays.sparse import SparseDtype +from pandas.core.construction import extract_array +from pandas.core.indexers import maybe_convert_indices +from pandas.core.indexes.api import Index, ensure_index +from pandas.core.internals.base import DataManager +from pandas.core.internals.blocks import make_block + +if TYPE_CHECKING: + from pandas.core.internals.managers import SingleBlockManager + + +T = TypeVar("T", bound="ArrayManager") + + +class ArrayManager(DataManager): + """ + Core internal data structure to implement DataFrame and Series. + + Alternative to the BlockManager, storing a list of 1D arrays instead of + Blocks. 
+ + This is *not* a public API class + + Parameters + ---------- + arrays : Sequence of arrays + axes : Sequence of Index + do_integrity_check : bool, default True + + """ + + __slots__ = [ + "_axes", # private attribute, because 'axes' has different order, see below + "arrays", + ] + + arrays: List[Union[np.ndarray, ExtensionArray]] + _axes: List[Index] + + def __init__( + self, + arrays: List[Union[np.ndarray, ExtensionArray]], + axes: List[Index], + do_integrity_check: bool = True, + ): + # Note: we are storing the axes in "_axes" in the (rows, columns) order, + # which contrasts with the order in which they are stored in BlockManager + self._axes = axes + self.arrays = arrays + + if do_integrity_check: + self._axes = [ensure_index(ax) for ax in axes] + self._verify_integrity() + + def make_empty(self: T, axes=None) -> T: + """Return an empty ArrayManager with the items axis of len 0 (no columns)""" + if axes is None: + axes = [self.axes[1:], Index([])] + + arrays: List[Union[np.ndarray, ExtensionArray]] = [] + return type(self)(arrays, axes) + + @property + def items(self) -> Index: + return self._axes[1] + + @property + def axes(self) -> List[Index]: # type: ignore[override] + # mypy doesn't work to override attribute with property + # see https://github.com/python/mypy/issues/4125 + """Axes in BlockManager-compatible order (columns, rows)""" + return [self._axes[1], self._axes[0]] + + @property + def shape(self) -> Tuple[int, ...]: + # this still gives the BlockManager-compatible transposed shape + return tuple(len(ax) for ax in self.axes) + + @property + def shape_proper(self) -> Tuple[int, ...]: + # this returns (n_rows, n_columns) + return tuple(len(ax) for ax in self._axes) + + @staticmethod + def _normalize_axis(axis): + # switch axis + axis = 1 if axis == 0 else 0 + return axis + + # TODO can be shared + def set_axis(self, axis: int, new_labels: Index) -> None: + # Caller is responsible for ensuring we have an Index object. 
+ axis = self._normalize_axis(axis) + old_len = len(self._axes[axis]) + new_len = len(new_labels) + + if new_len != old_len: + raise ValueError( + f"Length mismatch: Expected axis has {old_len} elements, new " + f"values have {new_len} elements" + ) + + self._axes[axis] = new_labels + + def consolidate(self) -> "ArrayManager": + return self + + def is_consolidated(self) -> bool: + return True + + def _consolidate_inplace(self) -> None: + pass + + def get_dtypes(self): + return np.array([arr.dtype for arr in self.arrays], dtype="object") + + # TODO setstate getstate + + def __repr__(self) -> str: + output = type(self).__name__ + output += f"\nIndex: {self._axes[0]}" + output += f"\nColumns: {self._axes[1]}" + output += f"\n{len(self.arrays)} arrays:" + for arr in self.arrays: + output += f"\n{arr.dtype}" + return output + + def _verify_integrity(self) -> None: + n_rows, n_columns = self.shape_proper + if not len(self.arrays) == n_columns: + raise ValueError( + "Number of passed arrays must equal the size of the column Index: " + f"{len(self.arrays)} arrays vs {n_columns} columns." 
+ ) + for arr in self.arrays: + if not len(arr) == n_rows: + raise ValueError( + "Passed arrays should have the same length as the rows Index: " + f"{len(arr)} vs {n_rows} rows" + ) + if not isinstance(arr, (np.ndarray, ExtensionArray)): + raise ValueError( + "Passed arrays should be np.ndarray or ExtensionArray instances, " + f"got {type(arr)} instead" + ) + + def reduce( + self: T, func: Callable, ignore_failures: bool = False + ) -> Tuple[T, np.ndarray]: + # TODO this still fails because `func` expects to work on 2D arrays + # TODO implement ignore_failures + assert self.ndim == 2 + + res_arrays = [] + for arr in self.arrays: + res = func(arr, axis=0) + res_arrays.append(np.array([res])) + + index = Index([None]) # placeholder + new_mgr = type(self)(res_arrays, [index, self.items]) + indexer = np.arange(self.shape[0]) + return new_mgr, indexer + + def operate_blockwise(self, other: "ArrayManager", array_op) -> "ArrayManager": + """ + Apply array_op array-by-array with another (aligned) ArrayManager. + """ + # TODO what if `other` is BlockManager ? + left_arrays = self.arrays + right_arrays = other.arrays + result_arrays = [ + array_op(left, right) for left, right in zip(left_arrays, right_arrays) + ] + return type(self)(result_arrays, self._axes) + + def apply( + self: T, + f, + align_keys: Optional[List[str]] = None, + ignore_failures: bool = False, + **kwargs, + ) -> T: + """ + Iterate over the arrays, collect and create a new ArrayManager. + + Parameters + ---------- + f : str or callable + Name of the Array method to apply. 
+ align_keys: List[str] or None, default None + ignore_failures: bool, default False + **kwargs + Keywords to pass to `f` + + Returns + ------- + ArrayManager + """ + assert "filter" not in kwargs + + align_keys = align_keys or [] + result_arrays: List[np.ndarray] = [] + result_indices: List[int] = [] + # fillna: Series/DataFrame is responsible for making sure value is aligned + + aligned_args = {k: kwargs[k] for k in align_keys} + + if f == "apply": + f = kwargs.pop("func") + + for i, arr in enumerate(self.arrays): + + if aligned_args: + + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[i] + else: + kwargs[k] = obj.iloc[:, i]._values + else: + # otherwise we have an array-like + kwargs[k] = obj[i] + + try: + if callable(f): + applied = f(arr, **kwargs) + else: + applied = getattr(arr, f)(**kwargs) + except (TypeError, NotImplementedError): + if not ignore_failures: + raise + continue + # if not isinstance(applied, ExtensionArray): + # # TODO not all EA operations return new EAs (eg astype) + # applied = array(applied) + result_arrays.append(applied) + result_indices.append(i) + + new_axes: List[Index] + if ignore_failures: + # TODO copy? 
+ new_axes = [self._axes[0], self._axes[1][result_indices]] + else: + new_axes = self._axes + + if len(result_arrays) == 0: + return self.make_empty(new_axes) + + return type(self)(result_arrays, new_axes) + + def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: + + align_keys = align_keys or [] + aligned_args = {k: kwargs[k] for k in align_keys} + + result_arrays = [] + + for i, arr in enumerate(self.arrays): + + if aligned_args: + for k, obj in aligned_args.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj.iloc[[i]] + else: + kwargs[k] = obj.iloc[:, [i]]._values + else: + # otherwise we have an ndarray + kwargs[k] = obj[[i]] + + if hasattr(arr, "tz") and arr.tz is None: # type: ignore[union-attr] + # DatetimeArray needs to be converted to ndarray for DatetimeBlock + arr = arr._data # type: ignore[union-attr] + elif arr.dtype.kind == "m": + # TimedeltaArray needs to be converted to ndarray for TimedeltaBlock + arr = arr._data # type: ignore[union-attr] + if isinstance(arr, np.ndarray): + arr = np.atleast_2d(arr) + block = make_block(arr, placement=slice(0, 1, 1), ndim=2) + applied = getattr(block, f)(**kwargs) + if isinstance(applied, list): + applied = applied[0] + arr = applied.values + if isinstance(arr, np.ndarray): + arr = arr[0, :] + result_arrays.append(arr) + + return type(self)(result_arrays, self._axes) + + # TODO quantile + + def isna(self, func) -> "ArrayManager": + return self.apply("apply", func=func) + + def where(self, other, cond, align: bool, errors: str, axis: int) -> "ArrayManager": + if align: + align_keys = ["other", "cond"] + else: + align_keys = ["cond"] + other = extract_array(other, extract_numpy=True) + + return self.apply_with_block( + "where", + align_keys=align_keys, + other=other, + cond=cond, + errors=errors, + axis=axis, + ) + + # TODO what is this used for? 
+ # def setitem(self, indexer, value) -> "ArrayManager": + # return self.apply_with_block("setitem", indexer=indexer, value=value) + + def putmask(self, mask, new, align: bool = True, axis: int = 0): + + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + return self.apply_with_block( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + axis=axis, + ) + + def diff(self, n: int, axis: int) -> "ArrayManager": + return self.apply_with_block("diff", n=n, axis=axis) + + def interpolate(self, **kwargs) -> "ArrayManager": + return self.apply_with_block("interpolate", **kwargs) + + def shift(self, periods: int, axis: int, fill_value) -> "ArrayManager": + if fill_value is lib.no_default: + fill_value = None + + if axis == 0 and self.ndim == 2: + # TODO column-wise shift + raise NotImplementedError + + return self.apply_with_block( + "shift", periods=periods, axis=axis, fill_value=fill_value + ) + + def fillna(self, value, limit, inplace: bool, downcast) -> "ArrayManager": + # TODO implement downcast + inplace = validate_bool_kwarg(inplace, "inplace") + + def array_fillna(array, value, limit, inplace): + + mask = isna(array) + if limit is not None: + limit = libalgos.validate_limit(None, limit=limit) + mask[mask.cumsum() > limit] = False + + # TODO could optimize for arrays that cannot hold NAs + # (like _can_hold_na on Blocks) + if not inplace: + array = array.copy() + + # np.putmask(array, mask, value) + if np.any(mask): + # TODO allow invalid value if there is nothing to fill? 
+ array[mask] = value + return array + + return self.apply(array_fillna, value=value, limit=limit, inplace=inplace) + + def downcast(self) -> "ArrayManager": + return self.apply_with_block("downcast") + + def astype( + self, dtype, copy: bool = False, errors: str = "raise" + ) -> "ArrayManager": + return self.apply("astype", dtype=dtype, copy=copy) # , errors=errors) + + def convert( + self, + copy: bool = True, + datetime: bool = True, + numeric: bool = True, + timedelta: bool = True, + ) -> "ArrayManager": + return self.apply_with_block( + "convert", + copy=copy, + datetime=datetime, + numeric=numeric, + timedelta=timedelta, + ) + + def replace(self, value, **kwargs) -> "ArrayManager": + assert np.ndim(value) == 0, value + # TODO "replace" is right now implemented on the blocks, we should move + # it to general array algos so it can be reused here + return self.apply_with_block("replace", value=value, **kwargs) + + def replace_list( + self: T, + src_list: List[Any], + dest_list: List[Any], + inplace: bool = False, + regex: bool = False, + ) -> T: + """ do a list replace """ + inplace = validate_bool_kwarg(inplace, "inplace") + + return self.apply_with_block( + "_replace_list", + src_list=src_list, + dest_list=dest_list, + inplace=inplace, + regex=regex, + ) + + def to_native_types(self, **kwargs): + return self.apply_with_block("to_native_types", **kwargs) + + @property + def is_mixed_type(self) -> bool: + return True + + @property + def is_numeric_mixed_type(self) -> bool: + return False + + @property + def any_extension_types(self) -> bool: + """Whether any of the blocks in this manager are extension blocks""" + return False # any(block.is_extension for block in self.blocks) + + @property + def is_view(self) -> bool: + """ return a boolean if we are a single block and are a view """ + # TODO what is this used for? 
+ return False + + @property + def is_single_block(self) -> bool: + return False + + def get_bool_data(self, copy: bool = False) -> "ArrayManager": + """ + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + mask = np.array([is_bool_dtype(t) for t in self.get_dtypes()], dtype="object") + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) + + def get_numeric_data(self, copy: bool = False) -> "ArrayManager": + """ + Parameters + ---------- + copy : bool, default False + Whether to copy the blocks + """ + mask = np.array([is_numeric_dtype(t) for t in self.get_dtypes()]) + arrays = [self.arrays[i] for i in np.nonzero(mask)[0]] + # TODO copy? + new_axes = [self._axes[0], self._axes[1][mask]] + return type(self)(arrays, new_axes) + + def copy(self: T, deep=True) -> T: + """ + Make deep or shallow copy of ArrayManager + + Parameters + ---------- + deep : bool or string, default True + If False, return shallow copy (do not copy data) + If 'all', copy data and a deep copy of the index + + Returns + ------- + ArrayManager + """ + # this preserves the notion of view copying of axes + if deep: + # hit in e.g. tests.io.json.test_pandas + + def copy_func(ax): + return ax.copy(deep=True) if deep == "all" else ax.view() + + new_axes = [copy_func(ax) for ax in self._axes] + else: + new_axes = list(self._axes) + + if deep: + new_arrays = [arr.copy() for arr in self.arrays] + else: + new_arrays = self.arrays + return type(self)(new_arrays, new_axes) + + def as_array( + self, + transpose: bool = False, + dtype=None, + copy: bool = False, + na_value=lib.no_default, + ) -> np.ndarray: + """ + Convert the manager data into a numpy array. + + Parameters + ---------- + transpose : bool, default False + If True, transpose the return array. + dtype : object, default None + Data type of the return array. 
+ copy : bool, default False + If True then guarantee that a copy is returned. A value of + False does not guarantee that the underlying data is not + copied. + na_value : object, default lib.no_default + Value to be used as the missing value sentinel. + + Returns + ------- + arr : ndarray + """ + if len(self.arrays) == 0: + arr = np.empty(self.shape, dtype=float) + return arr.transpose() if transpose else arr + + # We want to copy when na_value is provided to avoid + # mutating the original object + copy = copy or na_value is not lib.no_default + + if not dtype: + dtype = _interleaved_dtype(self.arrays) + + if isinstance(dtype, SparseDtype): + dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + dtype = "object" + elif is_dtype_equal(dtype, str): + dtype = "object" + + result = np.empty(self.shape_proper, dtype=dtype) + + for i, arr in enumerate(self.arrays): + arr = arr.astype(dtype, copy=copy) + result[:, i] = arr + + if na_value is not lib.no_default: + result[isna(result)] = na_value + + return result + # return arr.transpose() if transpose else arr + + def get_slice(self, slobj: slice, axis: int = 0) -> "ArrayManager": + axis = self._normalize_axis(axis) + + if axis == 0: + arrays = [arr[slobj] for arr in self.arrays] + elif axis == 1: + arrays = self.arrays[slobj] + + new_axes = list(self._axes) + new_axes[axis] = new_axes[axis][slobj] + + return type(self)(arrays, new_axes, do_integrity_check=False) + + def fast_xs(self, loc: int) -> ArrayLike: + """ + Return the array corresponding to `frame.iloc[loc]`. 
+ + Parameters + ---------- + loc : int + + Returns + ------- + np.ndarray or ExtensionArray + """ + dtype = _interleaved_dtype(self.arrays) + + if isinstance(dtype, SparseDtype): + temp_dtype = dtype.subtype + elif isinstance(dtype, PandasDtype): + temp_dtype = dtype.numpy_dtype + elif is_extension_array_dtype(dtype): + temp_dtype = "object" + elif is_dtype_equal(dtype, str): + temp_dtype = "object" + else: + temp_dtype = dtype + + result = np.array([arr[loc] for arr in self.arrays], dtype=temp_dtype) + if isinstance(dtype, ExtensionDtype): + result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) + return result + + def iget(self, i: int) -> "SingleBlockManager": + """ + Return the data as a SingleBlockManager. + """ + from pandas.core.internals.managers import SingleBlockManager + + values = self.arrays[i] + block = make_block(values, placement=slice(0, len(values)), ndim=1) + + return SingleBlockManager(block, self._axes[0]) + + def iget_values(self, i: int) -> ArrayLike: + """ + Return the data for column i as the values (ndarray or ExtensionArray). + """ + return self.arrays[i] + + def idelete(self, indexer): + """ + Delete selected locations in-place (new block and array, same BlockManager) + """ + to_keep = np.ones(self.shape[0], dtype=np.bool_) + to_keep[indexer] = False + + self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] + self._axes = [self._axes[0], self._axes[1][to_keep]] + + def iset(self, loc: Union[int, slice, np.ndarray], value): + """ + Set new item in-place. Does not consolidate. Adds new Block if not + contained in the current set of items + """ + if lib.is_integer(loc): + # TODO normalize array -> this should in theory not be needed? 
+ value = extract_array(value, extract_numpy=True) + if isinstance(value, np.ndarray) and value.ndim == 2: + value = value[0, :] + + assert isinstance(value, (np.ndarray, ExtensionArray)) + # value = np.asarray(value) + # assert isinstance(value, np.ndarray) + assert len(value) == len(self._axes[0]) + self.arrays[loc] = value + return + + # TODO + raise Exception + + def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False): + """ + Insert item at selected position. + + Parameters + ---------- + loc : int + item : hashable + value : array_like + allow_duplicates: bool + If False, trying to insert non-unique item will raise + + """ + if not allow_duplicates and item in self.items: + # Should this be a different kind of error?? + raise ValueError(f"cannot insert {item}, already exists") + + if not isinstance(loc, int): + raise TypeError("loc must be int") + + # insert to the axis; this could possibly raise a TypeError + new_axis = self.items.insert(loc, item) + + value = extract_array(value, extract_numpy=True) + if value.ndim == 2: + value = value[0, :] + # TODO self.arrays can be empty + # assert len(value) == len(self.arrays[0]) + + # TODO is this copy needed? 
+ arrays = self.arrays.copy() + arrays.insert(loc, value) + + self.arrays = arrays + self._axes[1] = new_axis + + def reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + # ignored keywords + consolidate: bool = True, + only_slice: bool = False, + ) -> T: + axis = self._normalize_axis(axis) + return self._reindex_indexer( + new_axis, indexer, axis, fill_value, allow_dups, copy + ) + + def _reindex_indexer( + self: T, + new_axis, + indexer, + axis: int, + fill_value=None, + allow_dups: bool = False, + copy: bool = True, + ) -> T: + """ + Parameters + ---------- + new_axis : Index + indexer : ndarray of int64 or None + axis : int + fill_value : object, default None + allow_dups : bool, default False + copy : bool, default True + + + pandas-indexer with -1's only. + """ + if indexer is None: + if new_axis is self._axes[axis] and not copy: + return self + + result = self.copy(deep=copy) + result._axes = list(self._axes) + result._axes[axis] = new_axis + return result + + # some axes don't allow reindexing with dups + if not allow_dups: + self._axes[axis]._can_reindex(indexer) + + # if axis >= self.ndim: + # raise IndexError("Requested axis not found in manager") + + if axis == 1: + new_arrays = [] + for i in indexer: + if i == -1: + arr = self._make_na_array(fill_value=fill_value) + else: + arr = self.arrays[i] + new_arrays.append(arr) + + else: + new_arrays = [ + algos.take( + arr, + indexer, + allow_fill=True, + fill_value=fill_value, + # if fill_value is not None else blk.fill_value + ) + for arr in self.arrays + ] + + new_axes = list(self._axes) + new_axes[axis] = new_axis + + return type(self)(new_arrays, new_axes) + + def take(self, indexer, axis: int = 1, verify: bool = True, convert: bool = True): + """ + Take items along any axis. 
def unstack(self, unstacker, fill_value) -> "ArrayManager":
    """
    Return a new manager of the same type with every array unstacked.

    Parameters
    ----------
    unstacker : reshape._Unstacker
    fill_value : Any
        fill_value for newly introduced missing values.

    Returns
    -------
    unstacked : same type as ``self``
        Built with ``do_integrity_check=False``. Each original array
        contributes ``unstacker.full_shape[1]`` consecutive arrays.
    """
    positions, _ = unstacker._indexer_and_to_sort

    # Start from "all missing" (-1) and fill in the source position for
    # every cell the mask marks as present.
    flat = np.full(unstacker.mask.shape, -1)
    flat[unstacker.mask] = positions
    grid = flat.reshape(*unstacker.full_shape)

    # One output array per (input array, grid column) pair; -1 entries are
    # replaced by fill_value because allow_fill=True.
    unstacked = [
        algos.take(column, grid[:, j], allow_fill=True, fill_value=fill_value)
        for column in self.arrays
        for j in range(unstacker.full_shape[1])
    ]

    axes = [unstacker.new_index, unstacker.get_new_columns(self._axes[1])]
    return type(self)(unstacked, axes, do_integrity_check=False)
class DataManager(PandasObject):
    """
    Base class for the internal managers.

    Subclasses provide ``axes``, ``items`` and ``reindex_indexer``; this
    class derives ``__len__``, ``ndim`` and ``reindex_axis`` from them.
    """

    # TODO share more methods/attributes

    axes: List[Index]

    @property
    def items(self) -> Index:
        """Abstract: must be overridden by subclasses."""
        raise AbstractMethodError(self)

    def __len__(self) -> int:
        """Return the length of ``items``."""
        labels = self.items
        return len(labels)

    @property
    def ndim(self) -> int:
        """Return the number of axes."""
        axes = self.axes
        return len(axes)

    def reindex_indexer(
        self: T,
        new_axis,
        indexer,
        axis: int,
        fill_value=None,
        allow_dups: bool = False,
        copy: bool = True,
        consolidate: bool = True,
        only_slice: bool = False,
    ) -> T:
        """Abstract: must be overridden by subclasses."""
        raise AbstractMethodError(self)

    def reindex_axis(
        self,
        new_index,
        axis: int,
        method=None,
        limit=None,
        fill_value=None,
        copy: bool = True,
        consolidate: bool = True,
        only_slice: bool = False,
    ):
        """
        Conform data manager to new index.

        The requested labels are coerced with ``ensure_index``, reindexed
        against ``self.axes[axis]``, and the resulting (index, indexer)
        pair is handed to ``reindex_indexer`` with the remaining options.
        """
        target = ensure_index(new_index)
        current_axis = self.axes[axis]
        target, indexer = current_axis.reindex(target, method=method, limit=limit)

        # Options forwarded unchanged to the subclass implementation.
        forwarded = {
            "axis": axis,
            "fill_value": fill_value,
            "copy": copy,
            "consolidate": consolidate,
            "only_slice": only_slice,
        }
        return self.reindex_indexer(target, indexer, **forwarded)
def mgr_to_mgr(mgr, typ: str):
    """
    Convert to specific type of Manager. Does not copy if the type is already
    correct. Does not guarantee a copy otherwise.

    Parameters
    ----------
    mgr : ArrayManager or BlockManager
        Manager to convert.
    typ : {"block", "array"}
        Kind of manager to return.

    Returns
    -------
    ArrayManager or BlockManager
        `mgr` itself when it already has the requested type, otherwise a
        newly constructed manager.

    Raises
    ------
    ValueError
        If `typ` is neither "block" nor "array".
    """
    # Validate up front so that bad input fails before any import or
    # conversion work, and report the offending value (not the builtin
    # ``type``).
    if typ not in ("block", "array"):
        raise ValueError(f"'typ' needs to be one of {{'block', 'array'}}, got '{typ}'")

    from pandas.core.internals import ArrayManager, BlockManager

    new_mgr: Manager

    if typ == "block":
        if isinstance(mgr, BlockManager):
            new_mgr = mgr
        else:
            new_mgr = arrays_to_mgr(
                mgr.arrays, mgr.axes[0], mgr.axes[1], mgr.axes[0], dtype=None
            )
    else:
        if isinstance(mgr, ArrayManager):
            new_mgr = mgr
        else:
            # Copy each column's values out of the source manager.
            arrays = [mgr.iget_values(i).copy() for i in range(len(mgr.axes[0]))]
            new_mgr = ArrayManager(arrays, [mgr.axes[1], mgr.axes[0]])
    return new_mgr
- """ - new_index = ensure_index(new_index) - new_index, indexer = self.axes[axis].reindex( - new_index, method=method, limit=limit - ) - - return self.reindex_indexer( - new_index, - indexer, - axis=axis, - fill_value=fill_value, - copy=copy, - consolidate=consolidate, - only_slice=only_slice, - ) - def reindex_indexer( self: T, new_axis, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index d1275590306ef..f1d0af60e1c7f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -21,6 +21,7 @@ Tuple, Type, Union, + cast, ) import warnings @@ -67,6 +68,7 @@ from pandas.core.computation.pytables import PyTablesExpr, maybe_expression from pandas.core.construction import extract_array from pandas.core.indexes.api import ensure_index +from pandas.core.internals import BlockManager from pandas.io.common import stringify_path from pandas.io.formats.printing import adjoin, pprint_thing @@ -3983,19 +3985,21 @@ def _get_blocks_and_items( def get_blk_items(mgr): return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks] - blocks: List["Block"] = list(frame._mgr.blocks) - blk_items: List[Index] = get_blk_items(frame._mgr) + mgr = frame._mgr + mgr = cast(BlockManager, mgr) + blocks: List["Block"] = list(mgr.blocks) + blk_items: List[Index] = get_blk_items(mgr) if len(data_columns): axis, axis_labels = new_non_index_axes[0] new_labels = Index(axis_labels).difference(Index(data_columns)) mgr = frame.reindex(new_labels, axis=axis)._mgr - blocks = list(mgr.blocks) + blocks = list(mgr.blocks) # type: ignore[union-attr] blk_items = get_blk_items(mgr) for c in data_columns: mgr = frame.reindex([c], axis=axis)._mgr - blocks.extend(mgr.blocks) + blocks.extend(mgr.blocks) # type: ignore[union-attr] blk_items.extend(get_blk_items(mgr)) # reorder the blocks in the same order as the existing table if we can diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 356dc800d9662..36c875b8abe6f 100644 --- 
a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -1,10 +1,15 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Series, Timestamp, date_range, timedelta_range import pandas._testing as tm +# TODO td.skip_array_manager_not_yet_implemented +# appending with reindexing not yet working + class TestDataFrameAppend: def test_append_multiindex(self, multiindex_dataframe_random_data, frame_or_series): @@ -32,6 +37,7 @@ def test_append_empty_list(self): tm.assert_frame_equal(result, expected) assert result is not df # .append() should return a new object + @td.skip_array_manager_not_yet_implemented def test_append_series_dict(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -72,6 +78,7 @@ def test_append_series_dict(self): expected = df.append(df[-1:], ignore_index=True) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented def test_append_list_of_series_dicts(self): df = DataFrame(np.random.randn(5, 4), columns=["foo", "bar", "baz", "qux"]) @@ -90,6 +97,7 @@ def test_append_list_of_series_dicts(self): expected = df.append(DataFrame(dicts), ignore_index=True, sort=True) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented def test_append_missing_cols(self): # GH22252 # exercise the conditional branch in append method where the data @@ -134,6 +142,7 @@ def test_append_empty_dataframe(self): expected = df1.copy() tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented def test_append_dtypes(self): # GH 5754 @@ -193,6 +202,7 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): expected = Series(Timestamp(timestamp, tz=tz), name=0) tm.assert_series_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize( "data, dtype", [ diff --git 
a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 3c65551aafd0f..a4da77548b920 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( Categorical, @@ -90,6 +92,7 @@ def test_astype_mixed_type(self, mixed_type_frame): casted = mn.astype("O") _check_cast(casted, "object") + @td.skip_array_manager_not_yet_implemented def test_astype_with_exclude_string(self, float_frame): df = float_frame.copy() expected = float_frame.astype(int) @@ -124,6 +127,7 @@ def test_astype_with_view_mixed_float(self, mixed_float_frame): casted = tf.astype(np.int64) casted = tf.astype(np.float32) # noqa + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", [np.int32, np.int64]) @pytest.mark.parametrize("val", [np.nan, np.inf]) def test_astype_cast_nan_inf_int(self, val, dtype): @@ -382,6 +386,7 @@ def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit): tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["ns", "us", "ms", "s", "h", "m", "D"]) def test_astype_to_datetime_unit(self, unit): # tests all units from datetime origination @@ -406,6 +411,7 @@ def test_astype_to_timedelta_unit_ns(self, unit): tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"]) def test_astype_to_timedelta_unit(self, unit): # coerce to float @@ -429,6 +435,7 @@ def test_astype_to_incorrect_datetimelike(self, unit): msg = ( fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to " fr"\[timedelta64\[{unit}\]\]" + fr"|(Cannot cast DatetimeArray to dtype timedelta64\[{unit}\])" ) with pytest.raises(TypeError, match=msg): df.astype(other) @@ -436,11 +443,13 @@ def 
test_astype_to_incorrect_datetimelike(self, unit): msg = ( fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to " fr"\[datetime64\[{unit}\]\]" + fr"|(Cannot cast TimedeltaArray to dtype datetime64\[{unit}\])" ) df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) + @td.skip_array_manager_not_yet_implemented def test_astype_arg_for_errors(self): # GH#14878 @@ -567,6 +576,7 @@ def test_astype_empty_dtype_dict(self): tm.assert_frame_equal(result, df) assert result is not df + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) ignore keyword @pytest.mark.parametrize( "df", [ diff --git a/pandas/tests/frame/methods/test_count.py b/pandas/tests/frame/methods/test_count.py index d738c7139093c..1727a76c191ee 100644 --- a/pandas/tests/frame/methods/test_count.py +++ b/pandas/tests/frame/methods/test_count.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, Index, Series import pandas._testing as tm @@ -103,6 +105,7 @@ def test_count_index_with_nan(self): ) tm.assert_frame_equal(res, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_count_level( self, multiindex_year_month_day_dataframe_random_data, diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 6cea5abcac6d0..f8d729a215ba8 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -191,14 +191,15 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method): expected = DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) - def test_corr_item_cache(self): + def test_corr_item_cache(self, using_array_manager): # Check that corr does not lead to incorrect entries in item_cache df = DataFrame({"A": range(10)}) df["B"] = range(10)[::-1] ser = df["A"] # 
populate item_cache - assert len(df._mgr.blocks) == 2 + if not using_array_manager: + assert len(df._mgr.blocks) == 2 _ = df.corr() diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 148263bad0eb0..1de270fc72fb2 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -1,10 +1,15 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import Categorical, DataFrame, Series, Timestamp, date_range import pandas._testing as tm +# TODO(ArrayManager) quantile is needed for describe() +pytestmark = td.skip_array_manager_not_yet_implemented + class TestDataFrameDescribe: def test_describe_bool_in_mixed_frame(self): diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index 58e1bd146191f..bc2b7a4655b8e 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -5,6 +5,7 @@ from pandas.compat import is_numpy_dev from pandas.errors import PerformanceWarning +import pandas.util._test_decorators as td import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series, Timestamp @@ -156,6 +157,7 @@ def test_drop(self): assert return_value is None tm.assert_frame_equal(df, expected) + @td.skip_array_manager_not_yet_implemented def test_drop_multiindex_not_lexsorted(self): # GH#11640 diff --git a/pandas/tests/frame/methods/test_equals.py b/pandas/tests/frame/methods/test_equals.py index de2509ed91be2..dc45c9eb97ae4 100644 --- a/pandas/tests/frame/methods/test_equals.py +++ b/pandas/tests/frame/methods/test_equals.py @@ -1,8 +1,13 @@ import numpy as np +import pandas.util._test_decorators as td + from pandas import DataFrame, date_range import pandas._testing as tm +# TODO(ArrayManager) implement equals +pytestmark = td.skip_array_manager_not_yet_implemented + class TestEquals: def test_dataframe_not_equal(self): diff --git 
a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index bd0901387eeed..be80dd49ff1fb 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -1,9 +1,14 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm +# TODO(ArrayManager) concat with reindexing +pytestmark = td.skip_array_manager_not_yet_implemented + def test_error(): df = pd.DataFrame( diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b427611099be3..58016be82c405 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( Categorical, DataFrame, @@ -230,6 +232,7 @@ def test_fillna_categorical_nan(self): df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=NaT), df) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) implement downcast def test_fillna_downcast(self): # GH#15277 # infer int64 from float64 @@ -244,6 +247,7 @@ def test_fillna_downcast(self): expected = DataFrame({"a": [1, 0]}) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) object upcasting def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) @@ -268,6 +272,7 @@ def test_fillna_dtype_conversion(self): result = df.fillna(v) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_invalid_test def test_fillna_datetime_columns(self): # GH#7095 df = DataFrame( @@ -335,13 +340,13 @@ def test_frame_pad_backfill_limit(self): result = df[:2].reindex(index, method="pad", limit=5) expected = df[:2].reindex(index).fillna(method="pad") - expected.values[-3:] = np.nan + 
expected.iloc[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index, method="backfill", limit=5) expected = df[-2:].reindex(index).fillna(method="backfill") - expected.values[:3] = np.nan + expected.iloc[:3] = np.nan tm.assert_frame_equal(result, expected) def test_frame_fillna_limit(self): @@ -352,14 +357,14 @@ def test_frame_fillna_limit(self): result = result.fillna(method="pad", limit=5) expected = df[:2].reindex(index).fillna(method="pad") - expected.values[-3:] = np.nan + expected.iloc[-3:] = np.nan tm.assert_frame_equal(result, expected) result = df[-2:].reindex(index) result = result.fillna(method="backfill", limit=5) expected = df[-2:].reindex(index).fillna(method="backfill") - expected.values[:3] = np.nan + expected.iloc[:3] = np.nan tm.assert_frame_equal(result, expected) def test_fillna_skip_certain_blocks(self): diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 6b86a13fcf1b9..2477ad79d8a2c 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -324,6 +324,7 @@ def test_interp_string_axis(self, axis_name, axis_number): expected = df.interpolate(method="linear", axis=axis_number) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) support axis=1 @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"]) def test_interp_fillna_methods(self, axis, method): # GH 12918 diff --git a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py index 0fca4e988b775..126c78a657c58 100644 --- a/pandas/tests/frame/methods/test_is_homogeneous_dtype.py +++ b/pandas/tests/frame/methods/test_is_homogeneous_dtype.py @@ -1,8 +1,13 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import Categorical, DataFrame +# _is_homogeneous_type always returns True for 
ArrayManager +pytestmark = td.skip_array_manager_invalid_test + @pytest.mark.parametrize( "data, expected", diff --git a/pandas/tests/frame/methods/test_join.py b/pandas/tests/frame/methods/test_join.py index eba92cc71a6d0..42694dc3ff37c 100644 --- a/pandas/tests/frame/methods/test_join.py +++ b/pandas/tests/frame/methods/test_join.py @@ -3,10 +3,15 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Index, MultiIndex, date_range, period_range import pandas._testing as tm +# TODO(ArrayManager) concat with reindexing +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.fixture def frame_with_period_index(): diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py index 6ddba8b5e7064..3f7f2e51add96 100644 --- a/pandas/tests/frame/methods/test_quantile.py +++ b/pandas/tests/frame/methods/test_quantile.py @@ -1,10 +1,14 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Series, Timestamp import pandas._testing as tm +pytestmark = td.skip_array_manager_not_yet_implemented + class TestDataFrameQuantile: @pytest.mark.parametrize( diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 4255c1cb5e65f..5b66f58b8f069 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -238,6 +238,7 @@ def test_rank_methods_frame(self): expected = DataFrame(sprank, columns=cols).astype("float64") tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("dtype", ["O", "f8", "i8"]) def test_rank_descending(self, method, dtype): diff --git a/pandas/tests/frame/methods/test_reorder_levels.py b/pandas/tests/frame/methods/test_reorder_levels.py index 6bfbf089a6108..451fc9a5cf717 100644 --- a/pandas/tests/frame/methods/test_reorder_levels.py 
+++ b/pandas/tests/frame/methods/test_reorder_levels.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, MultiIndex import pandas._testing as tm @@ -47,6 +49,7 @@ def test_reorder_levels(self, frame_or_series): result = obj.reorder_levels(["L0", "L0", "L0"]) tm.assert_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_reorder_levels_swaplevel_equivalence( self, multiindex_year_month_day_dataframe_random_data ): diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py index 00d4a4277a42f..e43eb3fb47b7e 100644 --- a/pandas/tests/frame/methods/test_reset_index.py +++ b/pandas/tests/frame/methods/test_reset_index.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas as pd @@ -518,6 +520,7 @@ def test_reset_index_delevel_infer_dtype(self): assert is_integer_dtype(deleveled["prm1"]) assert is_float_dtype(deleveled["prm2"]) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_reset_index_with_drop( self, multiindex_year_month_day_dataframe_random_data ): @@ -616,6 +619,7 @@ def test_reset_index_empty_frame_with_datetime64_multiindex(): tm.assert_frame_equal(result, expected) +@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby(): # https://github.com/pandas-dev/pandas/issues/35657 df = DataFrame({"c1": [10.0], "c2": ["a"], "c3": pd.to_datetime("2020-01-01")}) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index f2dbe4a799a17..434df5ccccaf7 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -42,6 +42,9 @@ def 
__len__(self) -> int: def __getitem__(self, item): pass + def copy(self): + return self + class TestSelectDtypes: def test_select_dtypes_include_using_list_like(self): diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 40b3f1e89c015..aefc407d0c432 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import CategoricalIndex, DataFrame, Index, Series, date_range, offsets import pandas._testing as tm @@ -145,12 +147,13 @@ def test_shift_duplicate_columns(self): tm.assert_frame_equal(shifted[0], shifted[1]) tm.assert_frame_equal(shifted[0], shifted[2]) - def test_shift_axis1_multiple_blocks(self): + def test_shift_axis1_multiple_blocks(self, using_array_manager): # GH#35488 df1 = DataFrame(np.random.randint(1000, size=(5, 3))) df2 = DataFrame(np.random.randint(1000, size=(5, 2))) df3 = pd.concat([df1, df2], axis=1) - assert len(df3._mgr.blocks) == 2 + if not using_array_manager: + assert len(df3._mgr.blocks) == 2 result = df3.shift(2, axis=1) @@ -163,7 +166,8 @@ def test_shift_axis1_multiple_blocks(self): # Case with periods < 0 # rebuild df3 because `take` call above consolidated df3 = pd.concat([df1, df2], axis=1) - assert len(df3._mgr.blocks) == 2 + if not using_array_manager: + assert len(df3._mgr.blocks) == 2 result = df3.shift(-2, axis=1) expected = df3.take([2, 3, 4, -1, -1], axis=1) @@ -272,6 +276,7 @@ def test_datetime_frame_shift_with_freq_error(self, datetime_frame): with pytest.raises(ValueError, match=msg): no_freq.shift(freq="infer") + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support def test_shift_dt64values_int_fill_deprecated(self): # GH#31971 ser = Series([pd.Timestamp("2020-01-01"), pd.Timestamp("2020-01-02")]) diff --git a/pandas/tests/frame/methods/test_sort_index.py 
b/pandas/tests/frame/methods/test_sort_index.py index 3be6a8453420e..221296bfd6d76 100644 --- a/pandas/tests/frame/methods/test_sort_index.py +++ b/pandas/tests/frame/methods/test_sort_index.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( CategoricalDtype, @@ -371,6 +373,7 @@ def test_sort_index_multiindex(self, level): result = df.sort_index(level=level, sort_remaining=False) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) groupby def test_sort_index_intervalindex(self): # this is a de-facto sort via unstack # confirming that we sort in the order of the bins diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 987848ec697d1..cd3286fa38056 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -566,12 +566,13 @@ def test_sort_values_nat_na_position_default(self): result = expected.sort_values(["A", "date"]) tm.assert_frame_equal(result, expected) - def test_sort_values_item_cache(self): + def test_sort_values_item_cache(self, using_array_manager): # previous behavior incorrect retained an invalid _item_cache entry df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"]) df["D"] = df["A"] * 2 ser = df["A"] - assert len(df._mgr.blocks) == 2 + if not using_array_manager: + assert len(df._mgr.blocks) == 2 df.sort_values(by="A") ser.values[0] = 99 diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index 0257a5d43170f..8de47cb17d7d3 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -1,9 +1,13 @@ import numpy as np +import pandas.util._test_decorators as td + from pandas import DataFrame, MultiIndex import pandas._testing as tm from pandas.core.arrays 
import PandasArray +pytestmark = td.skip_array_manager_invalid_test + class TestToDictOfBlocks: def test_copy_blocks(self, float_frame): diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index 3d69c004db6bb..0682989294457 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -1,5 +1,7 @@ import numpy as np +import pandas.util._test_decorators as td + from pandas import DataFrame, Timestamp import pandas._testing as tm @@ -17,6 +19,7 @@ def test_to_numpy_dtype(self): result = df.to_numpy(dtype="int64") tm.assert_numpy_array_equal(result, expected) + @td.skip_array_manager_invalid_test def test_to_numpy_copy(self): arr = np.random.randn(4, 3) df = DataFrame(arr) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 8635168f1eb03..548842e653a63 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, date_range import pandas._testing as tm @@ -79,6 +81,7 @@ def test_transpose_float(self, float_frame): for col, s in mixed_T.items(): assert s.dtype == np.object_ + @td.skip_array_manager_invalid_test def test_transpose_get_view(self, float_frame): dft = float_frame.T dft.values[:, 5:10] = 5 diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index fb0c5d31f692b..5426e4368722e 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -1,11 +1,14 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, NaT, Series, Timestamp, date_range, period_range import pandas._testing as tm class TestDataFrameValues: + @td.skip_array_manager_invalid_test def test_values(self, float_frame): 
float_frame.values[:, 0] = 5.0 assert (float_frame.values[:, 0] == 5).all() diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 22eb642ed8512..afc25c48beb5f 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -972,7 +972,7 @@ def test_align_frame(self): result = ts + ts[::2] expected = ts + ts - expected.values[1::2] = np.nan + expected.iloc[1::2] = np.nan tm.assert_frame_equal(result, expected) half = ts[::2] diff --git a/pandas/tests/internals/test_managers.py b/pandas/tests/internals/test_managers.py new file mode 100644 index 0000000000000..333455875904a --- /dev/null +++ b/pandas/tests/internals/test_managers.py @@ -0,0 +1,40 @@ +""" +Testing interaction between the different managers (BlockManager, ArrayManager) +""" +from pandas.core.dtypes.missing import array_equivalent + +import pandas as pd +import pandas._testing as tm +from pandas.core.internals import ArrayManager, BlockManager + + +def test_dataframe_creation(): + + with pd.option_context("mode.data_manager", "block"): + df_block = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + assert isinstance(df_block._mgr, BlockManager) + + with pd.option_context("mode.data_manager", "array"): + df_array = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + assert isinstance(df_array._mgr, ArrayManager) + + # also ensure both are seen as equal + tm.assert_frame_equal(df_block, df_array) + + # conversion from one manager to the other + result = df_block._as_manager("block") + assert isinstance(result._mgr, BlockManager) + result = df_block._as_manager("array") + assert isinstance(result._mgr, ArrayManager) + tm.assert_frame_equal(result, df_block) + assert all( + array_equivalent(left, right) + for left, right in zip(result._mgr.arrays, df_array._mgr.arrays) + ) + + result = df_array._as_manager("array") + assert isinstance(result._mgr, ArrayManager) + result = 
df_array._as_manager("block") + assert isinstance(result._mgr, BlockManager) + tm.assert_frame_equal(result, df_array) + assert len(result._mgr.blocks) == 2 diff --git a/pandas/tests/io/formats/test_printing.py b/pandas/tests/io/formats/test_printing.py index f0d5ef19c4468..2339e21288bb5 100644 --- a/pandas/tests/io/formats/test_printing.py +++ b/pandas/tests/io/formats/test_printing.py @@ -3,6 +3,8 @@ import pandas._config.config as cf +import pandas.util._test_decorators as td + import pandas as pd import pandas.io.formats.format as fmt @@ -119,6 +121,7 @@ def test_ambiguous_width(self): assert adjoined == expected +@td.skip_array_manager_not_yet_implemented class TestTableSchemaRepr: @classmethod def setup_class(cls): diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 5faca6bd89dad..6ead81db1fab0 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -5,6 +5,8 @@ import pandas as pd import pandas._testing as tm +pytestmark = td.skip_array_manager_not_yet_implemented + def test_compression_roundtrip(compression): df = pd.DataFrame( diff --git a/pandas/tests/io/json/test_deprecated_kwargs.py b/pandas/tests/io/json/test_deprecated_kwargs.py index 79245bc9d34a8..7367aaefb1c1e 100644 --- a/pandas/tests/io/json/test_deprecated_kwargs.py +++ b/pandas/tests/io/json/test_deprecated_kwargs.py @@ -2,11 +2,15 @@ Tests for the deprecated keyword arguments for `read_json`. 
""" +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm from pandas.io.json import read_json +pytestmark = td.skip_array_manager_not_yet_implemented + def test_deprecated_kwargs(): df = pd.DataFrame({"A": [2, 4, 6], "B": [3, 6, 9]}, index=[0, 1, 2]) diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py index 215d663e68d8f..e25964f556e4e 100644 --- a/pandas/tests/io/json/test_json_table_schema.py +++ b/pandas/tests/io/json/test_json_table_schema.py @@ -6,6 +6,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.dtypes import CategoricalDtype, DatetimeTZDtype, PeriodDtype import pandas as pd @@ -20,6 +22,8 @@ set_default_names, ) +pytestmark = td.skip_array_manager_not_yet_implemented + class TestBuildSchema: def setup_method(self, method): diff --git a/pandas/tests/io/json/test_normalize.py b/pandas/tests/io/json/test_normalize.py index b232c827f5ece..d7fc1257d8396 100644 --- a/pandas/tests/io/json/test_normalize.py +++ b/pandas/tests/io/json/test_normalize.py @@ -3,11 +3,15 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame, Index, Series, json_normalize import pandas._testing as tm from pandas.io.json._normalize import nested_to_record +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.fixture def deep_nested(): diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index dba3cb4db3ab8..c3ada52eba5aa 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -15,6 +15,9 @@ from pandas import DataFrame, DatetimeIndex, Series, Timestamp, compat, read_json import pandas._testing as tm +pytestmark = td.skip_array_manager_not_yet_implemented + + _seriesd = tm.getSeriesData() _frame = DataFrame(_seriesd) diff --git a/pandas/tests/io/json/test_readlines.py 
b/pandas/tests/io/json/test_readlines.py index 099d99507e136..2484c12f42600 100644 --- a/pandas/tests/io/json/test_readlines.py +++ b/pandas/tests/io/json/test_readlines.py @@ -3,12 +3,16 @@ import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, read_json import pandas._testing as tm from pandas.io.json._json import JsonReader +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.fixture def lines_json_df(): diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 74adb397d91f4..dff506809ee4f 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -16,10 +16,13 @@ import pandas._libs.json as ujson from pandas._libs.tslib import Timestamp from pandas.compat import IS64, is_platform_windows +import pandas.util._test_decorators as td from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range import pandas._testing as tm +pytestmark = td.skip_array_manager_not_yet_implemented + def _clean_dict(d): """ diff --git a/pandas/tests/io/pytables/test_complex.py b/pandas/tests/io/pytables/test_complex.py index 71bb6584889aa..72e8b4aea5ede 100644 --- a/pandas/tests/io/pytables/test_complex.py +++ b/pandas/tests/io/pytables/test_complex.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import DataFrame, Series import pandas._testing as tm @@ -10,6 +12,9 @@ from pandas.io.pytables import read_hdf +# TODO(ArrayManager) HDFStore relies on accessing the blocks +pytestmark = td.skip_array_manager_not_yet_implemented + def test_complex_fixed(setup_path): df = DataFrame( diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 3f0fd6e7483f8..131711a32d114 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -55,6 +55,10 @@ from pandas.io.pytables import 
TableIterator # isort:skip +# TODO(ArrayManager) HDFStore relies on accessing the blocks +pytestmark = td.skip_array_manager_not_yet_implemented + + _default_compressor = "blosc" ignore_natural_naming_warning = pytest.mark.filterwarnings( "ignore:object name:tables.exceptions.NaturalNameWarning" diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 9ee44b58d6ced..a106a579d7e52 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -14,6 +14,9 @@ ensure_clean_store, ) +# TODO(ArrayManager) HDFStore relies on accessing the blocks +pytestmark = td.skip_array_manager_not_yet_implemented + def _compare_with_tz(a, b): tm.assert_frame_equal(a, b) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 725c14f410357..d31bee9aca135 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -272,7 +272,9 @@ def test_read_fspath_all(self, reader, module, path, datapath): ("to_excel", {"engine": "xlwt"}, "xlwt"), ("to_feather", {}, "pyarrow"), ("to_html", {}, "os"), - ("to_json", {}, "os"), + pytest.param( + "to_json", {}, "os", marks=td.skip_array_manager_not_yet_implemented + ), ("to_latex", {}, "os"), ("to_pickle", {}, "os"), ("to_stata", {"time_stamp": pd.to_datetime("2019-01-01 00:00")}, "os"), diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index 158504082e657..76bc188afdd1f 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -8,11 +8,15 @@ import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm import pandas.io.common as icom +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.mark.parametrize( "obj", diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index b1038b6d28083..d9575a6ad81e5 100644 --- a/pandas/tests/io/test_fsspec.py +++ 
b/pandas/tests/io/test_fsspec.py @@ -247,6 +247,7 @@ def test_pickle_options(fsspectest): tm.assert_frame_equal(df, out) +@td.skip_array_manager_not_yet_implemented def test_json_options(fsspectest): df = DataFrame({"a": [0]}) df.to_json("testmem://afile", storage_options={"test": "json_write"}) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 24944281419c3..035460185fa81 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -12,6 +12,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd @@ -29,6 +31,9 @@ read_stata, ) +# TODO(ArrayManager) the stata code relies on BlockManager internals (eg blknos) +pytestmark = td.skip_array_manager_not_yet_implemented + @pytest.fixture() def mixed_frame(): diff --git a/pandas/tests/io/test_user_agent.py b/pandas/tests/io/test_user_agent.py index 32399c7de7a68..fd3ca3919d416 100644 --- a/pandas/tests/io/test_user_agent.py +++ b/pandas/tests/io/test_user_agent.py @@ -8,6 +8,8 @@ import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -180,13 +182,25 @@ def do_GET(self): "responder, read_method, port, parquet_engine", [ (CSVUserAgentResponder, pd.read_csv, 34259, None), - (JSONUserAgentResponder, pd.read_json, 34260, None), + pytest.param( + JSONUserAgentResponder, + pd.read_json, + 34260, + None, + marks=td.skip_array_manager_not_yet_implemented, + ), (ParquetPyArrowUserAgentResponder, pd.read_parquet, 34268, "pyarrow"), (ParquetFastParquetUserAgentResponder, pd.read_parquet, 34273, "fastparquet"), (PickleUserAgentResponder, pd.read_pickle, 34271, None), (StataUserAgentResponder, pd.read_stata, 34272, None), (GzippedCSVUserAgentResponder, pd.read_csv, 34261, None), - (GzippedJSONUserAgentResponder, pd.read_json, 34262, None), + pytest.param( + GzippedJSONUserAgentResponder, + pd.read_json, + 34262, + None, + 
marks=td.skip_array_manager_not_yet_implemented, + ), ], ) def test_server_and_default_headers(responder, read_method, port, parquet_engine): @@ -212,13 +226,25 @@ def test_server_and_default_headers(responder, read_method, port, parquet_engine "responder, read_method, port, parquet_engine", [ (CSVUserAgentResponder, pd.read_csv, 34263, None), - (JSONUserAgentResponder, pd.read_json, 34264, None), + pytest.param( + JSONUserAgentResponder, + pd.read_json, + 34264, + None, + marks=td.skip_array_manager_not_yet_implemented, + ), (ParquetPyArrowUserAgentResponder, pd.read_parquet, 34270, "pyarrow"), (ParquetFastParquetUserAgentResponder, pd.read_parquet, 34275, "fastparquet"), (PickleUserAgentResponder, pd.read_pickle, 34273, None), (StataUserAgentResponder, pd.read_stata, 34274, None), (GzippedCSVUserAgentResponder, pd.read_csv, 34265, None), - (GzippedJSONUserAgentResponder, pd.read_json, 34266, None), + pytest.param( + GzippedJSONUserAgentResponder, + pd.read_json, + 34266, + None, + marks=td.skip_array_manager_not_yet_implemented, + ), ], ) def test_server_and_custom_headers(responder, read_method, port, parquet_engine): diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index a15dc0751aa7d..e479e5c1416db 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,8 +1,13 @@ import numpy as np +import pandas.util._test_decorators as td + from pandas import Period, Series, Timedelta, Timestamp, date_range import pandas._testing as tm +# TODO(ArrayManager) quantile is needed for describe() +pytestmark = td.skip_array_manager_not_yet_implemented + class TestSeriesDescribe: def test_describe(self): diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 1d3e91d07afe3..5771d8e2b8a47 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -1,6 +1,8 @@ 
import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import is_integer import pandas as pd @@ -8,6 +10,8 @@ import pandas._testing as tm from pandas.core.indexes.datetimes import Timestamp +pytestmark = td.skip_array_manager_not_yet_implemented + class TestSeriesQuantile: def test_quantile(self, datetime_series): diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py index 209a4233fc3b7..95ef2f6c00fe8 100644 --- a/pandas/util/_test_decorators.py +++ b/pandas/util/_test_decorators.py @@ -274,3 +274,18 @@ def async_mark(): async_mark = pytest.mark.skip(reason="Missing dependency pytest-asyncio") return async_mark + + +# Note: we are using a string as condition (and not for example +# `get_option("mode.data_manager") == "array"`) because this needs to be +# evaluated at test time (otherwise this boolean condition gets evaluated +# at import time, when the pd.options.mode.data_manager has not yet been set) + +skip_array_manager_not_yet_implemented = pytest.mark.skipif( + "config.getoption('--array-manager')", reason="Not yet implemented for ArrayManager" +) + +skip_array_manager_invalid_test = pytest.mark.skipif( + "config.getoption('--array-manager')", + reason="Test that relies on BlockManager internals or specific behaviour", +)