diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 0ec8ee698d2..c7410ebbace 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -29,7 +29,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v2
         with:
-          python-version: "3.7.x"
+          python-version: "3.8.x"
           architecture: "x64"
       - run: pip install black
       - run: black --check --diff modin/ asv_bench/benchmarks scripts/doc_checker.py
@@ -43,7 +43,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v2
         with:
-          python-version: "3.7.x"
+          python-version: "3.8.x"
           architecture: "x64"
       - run: pip install -r docs/requirements-doc.txt
       - run: cd docs && sphinx-build -T -E -b html . build
@@ -57,7 +57,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v2
         with:
-          python-version: "3.7.x"
+          python-version: "3.8.x"
           architecture: "x64"
       - run: pip install pytest pytest-cov pydocstyle numpydoc==1.1.0 xgboost
       - run: pytest scripts/test
@@ -132,7 +132,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v2
         with:
-          python-version: "3.7.x"
+          python-version: "3.8.x"
           architecture: "x64"
       - run: pip install flake8 flake8-print
       - run: flake8 --enable=T modin/ asv_bench/benchmarks scripts/doc_checker.py
@@ -152,7 +152,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      # Miniconda setup sometimes fails because of an HTTP error. Retry
@@ -185,7 +185,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      # Miniconda setup sometimes fails because of an http error. retry
@@ -214,7 +214,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v2
         with:
-          python-version: "3.7.x"
+          python-version: "3.8.x"
           architecture: "x64"
       - name: Clean install and run
         run: |
@@ -235,7 +235,7 @@ jobs:
           fetch-depth: 1
       - uses: actions/setup-python@v2
         with:
-          python-version: "3.7.x"
+          python-version: "3.8.x"
           architecture: "x64"
       - name: Clean install and run
         run: |
@@ -258,7 +258,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      # Miniconda setup sometimes fails because of an http error. retry
@@ -294,7 +294,7 @@ jobs:
     env:
       MODIN_MEMORY: 1000000000
       MODIN_TEST_DATASET_SIZE: "small"
-    name: Test ${{ matrix.execution }} execution, Python 3.7
+    name: Test ${{ matrix.execution }} execution, Python 3.8
     steps:
       - uses: actions/checkout@v2
         with:
@@ -303,7 +303,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      # Miniconda setup sometimes fails because of an http error. retry
@@ -355,7 +355,7 @@ jobs:
         shell: bash -l {0}
     env:
       MODIN_STORAGE_FORMAT: "omnisci"
-    name: Test OmniSci storage format, Python 3.7
+    name: Test OmniSci storage format, Python 3.8
     steps:
       - uses: actions/checkout@v2
         with:
@@ -365,7 +365,7 @@ jobs:
        with:
          activate-environment: modin_on_omnisci
          environment-file: requirements/env_omnisci.yml
-          python-version: 3.7
+          python-version: 3.8
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      # Miniconda setup sometimes fails because of an http error. retry
      # it once if it fails. todo(https://github.com/conda-incubator/setup-miniconda/issues/129):
@@ -469,7 +469,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8"]
         engine: ["python", "ray", "dask"]
     env:
       MODIN_ENGINE: ${{matrix.engine}}
@@ -556,7 +556,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      # Miniconda setup sometimes fails because of an http error. retry
@@ -602,7 +602,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      # Miniconda setup sometimes fails because of an http error. retry
@@ -640,7 +640,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8"]
         engine: ["ray", "dask"]
         test-task:
           - modin/pandas/test/dataframe/test_binary.py
@@ -702,7 +702,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8"]
     env:
       MODIN_STORAGE_FORMAT: pyarrow
       MODIN_EXPERIMENTAL: "True"
@@ -739,7 +739,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: [ "3.7", "3.8" ]
+        python-version: ["3.8"]
         engine: ["ray", "dask"]
     env:
       MODIN_EXPERIMENTAL: "True"
diff --git a/.github/workflows/push-to-master.yml b/.github/workflows/push-to-master.yml
index cfc28ab42c4..db6310d06b9 100644
--- a/.github/workflows/push-to-master.yml
+++ b/.github/workflows/push-to-master.yml
@@ -18,11 +18,11 @@ jobs:
        with:
          activate-environment: modin
          environment-file: requirements/requirements-no-engine.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      - name: install Ray nightly build
-        run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
+        run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
      - name: Conda environment
        run: |
          conda info
@@ -63,7 +63,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      - name: Conda environment
@@ -81,7 +81,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8"]
         test-task:
           - modin/pandas/test/dataframe/test_binary.py
           - modin/pandas/test/dataframe/test_default.py
@@ -118,8 +118,6 @@ jobs:
      - run: pip install -r requirements-dev.txt --use-deprecated=legacy-resolver
      # Use a ray master commit that includes the fix here: https://github.com/ray-project/ray/pull/16278
      # Can be changed after a Ray version > 1.4 is released.
-      - run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/master/c8e3ed9eec30119092ef966ee7b8982c8954c333/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl
-        if: matrix.python-version == '3.7'
      - run: pip install https://s3-us-west-2.amazonaws.com/ray-wheels/master/c8e3ed9eec30119092ef966ee7b8982c8954c333/ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl
        if: matrix.python-version == '3.8'
      - name: Install HDF5
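
Note on the wheel renames above: the cp37-cp37m → cp38-cp38 switch is forced by the interpreter bump, not by Ray itself. CPython 3.7 wheels carry the "m" (pymalloc) ABI suffix, which CPython 3.8 dropped, so 3.8 wheels are tagged cp38-cp38. A quick way to see which tag the running interpreter expects, using the packaging library modin already depends on (the snippet is illustrative and not part of the patch):

    # Prints the interpreter/ABI prefix of wheels pip would accept here;
    # on CPython 3.8 this prints "cp38", matching the
    # ray-2.0.0.dev0-cp38-cp38-manylinux2014_x86_64.whl name above.
    from packaging.tags import interpreter_name, interpreter_version

    print(f"{interpreter_name()}{interpreter_version()}")
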
diff --git a/.github/workflows/push.yml b/.github/workflows/push.yml
index 3e89ae3cc4b..c205aa05b9d 100644
--- a/.github/workflows/push.yml
+++ b/.github/workflows/push.yml
@@ -15,7 +15,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      - name: Conda environment
@@ -41,7 +41,7 @@ jobs:
     env:
       MODIN_MEMORY: 1000000000
       MODIN_TEST_DATASET_SIZE: "small"
-    name: Test ${{ matrix.execution }} execution, Python 3.7
+    name: Test ${{ matrix.execution }} execution, Python 3.8
     steps:
       - uses: actions/checkout@v2
         with:
@@ -50,7 +50,7 @@ jobs:
        with:
          activate-environment: modin
          environment-file: environment-dev.yml
-          python-version: 3.7
+          python-version: 3.8
          channel-priority: strict
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      - name: Conda environment
@@ -96,7 +96,7 @@ jobs:
       MODIN_EXPERIMENTAL: "True"
       MODIN_ENGINE: "native"
       MODIN_STORAGE_FORMAT: "omnisci"
-    name: Test OmniSci storage format, Python 3.7
+    name: Test OmniSci storage format, Python 3.8
     steps:
       - uses: actions/checkout@v2
         with:
@@ -106,7 +106,7 @@ jobs:
        with:
          activate-environment: modin_on_omnisci
          environment-file: requirements/env_omnisci.yml
-          python-version: 3.7
+          python-version: 3.8
          use-only-tar-bz2: true # IMPORTANT: This needs to be set for caching to work properly!
      - name: Conda environment
        run: |
@@ -135,7 +135,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8"]
         engine: ["python", "ray", "dask"]
     env:
       MODIN_ENGINE: ${{matrix.engine}}
@@ -202,7 +202,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8"]
         engine: ["ray", "dask"]
         test-task:
           - modin/pandas/test/dataframe/test_binary.py
@@ -257,7 +257,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: ["3.7", "3.8"]
+        python-version: ["3.8"]
     env:
       MODIN_STORAGE_FORMAT: pyarrow
       MODIN_EXPERIMENTAL: "True"
@@ -287,7 +287,7 @@ jobs:
     shell: bash -l {0}
     strategy:
       matrix:
-        python-version: [ "3.7", "3.8" ]
+        python-version: ["3.8"]
         engine: ["ray", "dask"]
     env:
       MODIN_EXPERIMENTAL: "True"
diff --git a/environment-dev.yml b/environment-dev.yml
index 05ffcb23609..283393d2890 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -2,8 +2,8 @@ name: modin
 channels:
   - conda-forge
 dependencies:
-  - pandas==1.3.5
-  - numpy>=1.16.5
+  - pandas==1.4.0
+  - numpy>=1.18.5
   - pyarrow>=4.0.1
   - dask[complete]>=2.22.0
   - distributed>=2.22.0
diff --git a/modin/core/io/io.py b/modin/core/io/io.py
index 10b61d88032..813a094a002 100644
--- a/modin/core/io/io.py
+++ b/modin/core/io/io.py
@@ -23,7 +23,7 @@
 
 import pandas
 import pandas._libs.lib as lib
-from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions
+from pandas._typing import CompressionOptions, StorageOptions
 from pandas.util._decorators import doc
 
 from modin.db_conn import ModinDatabaseConnection
@@ -826,7 +826,7 @@ def to_sql(
     def to_pickle(
         cls,
         obj: Any,
-        filepath_or_buffer: FilePathOrBuffer,
+        filepath_or_buffer,
         compression: CompressionOptions = "infer",
         protocol: int = pickle.HIGHEST_PROTOCOL,
         storage_options: StorageOptions = None,
diff --git a/modin/core/io/text/fwf_dispatcher.py b/modin/core/io/text/fwf_dispatcher.py
index 63440776cfa..388ae096e8e 100644
--- a/modin/core/io/text/fwf_dispatcher.py
+++ b/modin/core/io/text/fwf_dispatcher.py
@@ -14,7 +14,6 @@
 """Module houses `FWFDispatcher` class, that is used for reading of tables with fixed-width formatted lines."""
 
 import pandas
-from pandas._typing import FilePathOrBuffer
 
 from modin.core.io.text.text_file_dispatcher import TextFileDispatcher
 
@@ -27,7 +26,7 @@ class FWFDispatcher(TextFileDispatcher):
     @classmethod
     def check_parameters_support(
         cls,
-        filepath_or_buffer: FilePathOrBuffer,
+        filepath_or_buffer,
         read_kwargs: dict,
     ):
         """
diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py
index e8620e07abf..cf695792cdf 100644
--- a/modin/core/io/text/text_file_dispatcher.py
+++ b/modin/core/io/text/text_file_dispatcher.py
@@ -27,7 +27,6 @@
 import numpy as np
 import pandas
 import pandas._libs.lib as lib
-from pandas._typing import FilePathOrBuffer
 from pandas.core.dtypes.common import is_list_like
 
 from modin.core.io.file_dispatcher import FileDispatcher, OpenFile
@@ -36,7 +35,7 @@
 from modin.core.io.text.utils import CustomNewlineIterator
 from modin.config import NPartitions
 
-ColumnNamesTypes = Tuple[Union[pandas.Index, pandas.MultiIndex, pandas.Int64Index]]
+ColumnNamesTypes = Tuple[Union[pandas.Index, pandas.MultiIndex]]
 IndexColType = Union[int, str, bool, Sequence[int], Sequence[str], None]
 
 
@@ -614,7 +613,7 @@ def _launch_tasks(cls, splits: list, **partition_kwargs) -> Tuple[list, list, li
     @classmethod
     def check_parameters_support(
         cls,
-        filepath_or_buffer: FilePathOrBuffer,
+        filepath_or_buffer,
         read_kwargs: dict,
     ) -> bool:
         """
@@ -912,7 +911,7 @@ def _get_new_qc(
         return new_query_compiler
 
     @classmethod
-    def _read(cls, filepath_or_buffer: FilePathOrBuffer, **kwargs):
+    def _read(cls, filepath_or_buffer, **kwargs):
         """
         Read data from `filepath_or_buffer` according to `kwargs` parameters.
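
The FilePathOrBuffer removals in the three modules above (and in several more later in this diff) share one cause: pandas 1.4 reworked pandas._typing around the new FilePath/ReadBuffer/WriteBuffer aliases, and FilePathOrBuffer can no longer be imported from there. The patch simply drops the annotations; a project that wanted to straddle both pandas lines could instead use a guarded import along these lines (a hypothetical shim, not what this diff does — the Union fallback only approximates the old alias):

    import os
    from typing import IO, Union

    try:
        from pandas._typing import FilePathOrBuffer  # pandas <= 1.3
    except ImportError:
        # Rough stand-in for the removed alias.
        FilePathOrBuffer = Union[str, "os.PathLike[str]", IO[bytes], IO[str]]
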
diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py
index 3ecf048c175..d828c74fd0c 100644
--- a/modin/core/storage_formats/pandas/parsers.py
+++ b/modin/core/storage_formats/pandas/parsers.py
@@ -189,6 +189,9 @@ def generic_parse(fname, **kwargs):
             bio.seek(start)
             to_read = header + bio.read(end - start)
+        if "memory_map" in kwargs:
+            kwargs = kwargs.copy()
+            del kwargs["memory_map"]
         pandas_df = callback(BytesIO(to_read), **kwargs)
         index = (
             pandas_df.index
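
The memory_map deletion in generic_parse guards the per-partition re-parse: each worker re-reads its chunk from an in-memory BytesIO, and memory mapping only makes sense for real files on disk, so forwarding a caller's memory_map=True to the pandas callback is at best meaningless and, under pandas 1.4's stricter handle management, can fail outright. The same defensive pattern in isolation (a sketch; parse_chunk is a made-up name):

    from io import BytesIO

    import pandas

    def parse_chunk(raw: bytes, **kwargs):
        # memory_map applies to files on disk, not to an in-memory chunk,
        # so drop it from the forwarded kwargs; copy instead of mutating
        # the caller's dict.
        kwargs = {k: v for k, v in kwargs.items() if k != "memory_map"}
        return pandas.read_csv(BytesIO(raw), **kwargs)
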
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py
index b5d03b32b50..616f707173d 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/io/io.py
@@ -37,7 +37,6 @@
 
 import pandas
 import pandas._libs.lib as lib
-from pandas._typing import FilePathOrBuffer
 from pandas.io.common import is_url
 
 ReadCsvKwargsType = Dict[
@@ -51,7 +50,6 @@
         Sequence,
         Callable,
         Dialect,
-        FilePathOrBuffer,
         None,
     ],
 ]
diff --git a/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py
index f58ffc7f23b..6e3c003b6f7 100644
--- a/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/omnisci_on_native/test/test_dataframe.py
@@ -1187,11 +1187,12 @@ def applier(df, **kwargs):
         # At the end of reduce function it does inevitable `transpose`, which
         # is defaulting to pandas. The following logic check that `transpose` is the only
         # function that falling back to pandas in the reduce operation flow.
+        # Another warning comes from deprecated pandas.Int64Index usage.
         with pytest.warns(UserWarning) as warns:
             res = getattr(df, method)()
         assert (
-            len(warns) == 1
-        ), f"More than one warning were arisen: len(warns) != 1 ({len(warns)} != 1)"
+            len(warns) == 2
+        ), f"More than two warnings were raised: len(warns) != 2 ({len(warns)} != 2)"
         message = warns[0].message.args[0]
         assert (
             re.match(r".*transpose.*defaulting to pandas", message) is not None
diff --git a/modin/experimental/pandas/io.py b/modin/experimental/pandas/io.py
index 74614b08624..68333577a06 100644
--- a/modin/experimental/pandas/io.py
+++ b/modin/experimental/pandas/io.py
@@ -20,7 +20,7 @@
 
 import pandas
 import pandas._libs.lib as lib
-from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions
+from pandas._typing import CompressionOptions, StorageOptions
 
 from . import DataFrame
 from modin.config import IsExperimental, Engine
@@ -245,7 +245,7 @@ def _read(**kwargs) -> DataFrame:
 
 
 def read_pickle_distributed(
-    filepath_or_buffer: FilePathOrBuffer,
+    filepath_or_buffer,
     compression: Optional[str] = "infer",
     storage_options: StorageOptions = None,
 ):
@@ -289,7 +289,7 @@ def read_pickle_distributed(
 
 def to_pickle_distributed(
     self,
-    filepath_or_buffer: FilePathOrBuffer,
+    filepath_or_buffer,
     compression: CompressionOptions = "infer",
     protocol: int = pickle.HIGHEST_PROTOCOL,
     storage_options: StorageOptions = None,
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index 74d2d7cbf35..8396462f96f 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -12,12 +12,11 @@
 # governing permissions and limitations under the License.
 
 import pandas
+import warnings
 
-__pandas_version__ = "1.3.5"
+__pandas_version__ = "1.4.0"
 
 if pandas.__version__ != __pandas_version__:
-    import warnings
-
     warnings.warn(
         "The pandas version installed {} does not match the supported pandas version in"
         " Modin {}. This may cause undesired side effects!".format(
@@ -25,67 +24,69 @@
         )
     )
 
-from pandas import (
-    eval,
-    cut,
-    factorize,
-    test,
-    qcut,
-    date_range,
-    period_range,
-    Index,
-    MultiIndex,
-    CategoricalIndex,
-    bdate_range,
-    DatetimeIndex,
-    Timedelta,
-    Timestamp,
-    to_timedelta,
-    set_eng_float_format,
-    options,
-    Flags,
-    set_option,
-    NaT,
-    PeriodIndex,
-    Categorical,
-    Interval,
-    UInt8Dtype,
-    UInt16Dtype,
-    UInt32Dtype,
-    UInt64Dtype,
-    SparseDtype,
-    Int8Dtype,
-    Int16Dtype,
-    Int32Dtype,
-    Int64Dtype,
-    Float32Dtype,
-    Float64Dtype,
-    StringDtype,
-    BooleanDtype,
-    CategoricalDtype,
-    DatetimeTZDtype,
-    IntervalDtype,
-    PeriodDtype,
-    RangeIndex,
-    Int64Index,
-    UInt64Index,
-    Float64Index,
-    TimedeltaIndex,
-    IntervalIndex,
-    IndexSlice,
-    Grouper,
-    array,
-    Period,
-    DateOffset,
-    timedelta_range,
-    infer_freq,
-    interval_range,
-    ExcelWriter,
-    datetime,
-    NamedAgg,
-    NA,
-    api,
-)
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    from pandas import (
+        eval,
+        cut,
+        factorize,
+        test,
+        qcut,
+        date_range,
+        period_range,
+        Index,
+        MultiIndex,
+        CategoricalIndex,
+        bdate_range,
+        DatetimeIndex,
+        Timedelta,
+        Timestamp,
+        to_timedelta,
+        set_eng_float_format,
+        options,
+        Flags,
+        set_option,
+        NaT,
+        PeriodIndex,
+        Categorical,
+        Interval,
+        UInt8Dtype,
+        UInt16Dtype,
+        UInt32Dtype,
+        UInt64Dtype,
+        SparseDtype,
+        Int8Dtype,
+        Int16Dtype,
+        Int32Dtype,
+        Int64Dtype,
+        Float32Dtype,
+        Float64Dtype,
+        StringDtype,
+        BooleanDtype,
+        CategoricalDtype,
+        DatetimeTZDtype,
+        IntervalDtype,
+        PeriodDtype,
+        RangeIndex,
+        Int64Index,
+        UInt64Index,
+        Float64Index,
+        TimedeltaIndex,
+        IntervalIndex,
+        IndexSlice,
+        Grouper,
+        array,
+        Period,
+        DateOffset,
+        timedelta_range,
+        infer_freq,
+        interval_range,
+        ExcelWriter,
+        datetime,
+        NamedAgg,
+        NA,
+        api,
+    )
 
 import os
 import multiprocessing
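
Wrapping the re-exports in warnings.catch_warnings() matters because pandas 1.4 deprecates Int64Index, UInt64Index and Float64Index, which modin still exposes for API compatibility: without the suppression, a bare `import modin.pandas` would emit FutureWarnings the user did nothing to cause. The pattern in miniature:

    import warnings

    with warnings.catch_warnings():
        # Silence only the warnings raised during the import itself; user
        # code that touches the deprecated classes afterwards still sees them.
        warnings.simplefilter("ignore")
        from pandas import Int64Index  # noqa: F401  (FutureWarning in pandas 1.4)
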
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index 5736a28310b..8558838fa3b 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -30,12 +30,15 @@
 import pandas.core.resample
 import pandas.core.generic
 from pandas.core.indexing import convert_to_index_sliceable
-from pandas.util._validators import validate_bool_kwarg, validate_percentile
+from pandas.util._validators import (
+    validate_bool_kwarg,
+    validate_percentile,
+    validate_ascending,
+)
 from pandas._libs.lib import no_default
 from pandas._typing import (
     CompressionOptions,
     IndexKeyFunc,
-    FilePathOrBuffer,
     StorageOptions,
     TimedeltaConvertibleTypes,
     TimestampConvertibleTypes,
@@ -521,7 +524,7 @@ def _get_axis_number(cls, axis):
         Parameters
         ----------
-        axis : int, str or pandas._libs.lib.NoDefault
             Axis name ('index' or 'columns') or number to be converted to axis index.
 
         Returns
@@ -529,6 +532,9 @@ def _get_axis_number(cls, axis):
         int
             0 or 1 - axis index in the array of axes stored in the dataframe.
         """
+        if axis is no_default:
+            axis = None
+
         return cls._pandas_class._get_axis_number(axis) if axis is not None else 0
 
     def __constructor__(self, *args, **kwargs):
@@ -706,6 +712,7 @@ def align(
     )
 
     def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if axis is not None:
             axis = self._get_axis_number(axis)
             if bool_only and axis == 0:
@@ -763,6 +770,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         return result
 
     def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if axis is not None:
             axis = self._get_axis_number(axis)
             if bool_only and axis == 0:
@@ -932,7 +940,13 @@ def at_time(self, time, asof=False, axis=None):
         return self.loc[indexer] if axis == 0 else self.loc[:, indexer]
 
     def between_time(
-        self, start_time, end_time, include_start=True, include_end=True, axis=None
+        self: "BasePandasDataset",
+        start_time,
+        end_time,
+        include_start: "bool_t | NoDefault" = no_default,
+        include_end: "bool_t | NoDefault" = no_default,
+        inclusive: "str | None" = None,
+        axis=None,
     ):
         axis = self._get_axis_number(axis)
         idx = self.index if axis == 0 else self.columns
@@ -943,6 +957,7 @@ def between_time(
                 end_time,
                 include_start=include_start,
                 include_end=include_end,
+                inclusive=inclusive,
             )
             .index
         )
@@ -1280,16 +1295,17 @@ def explode(self, column, ignore_index: bool = False):
 
     def ewm(
         self,
-        com=None,
-        span=None,
-        halflife=None,
-        alpha=None,
-        min_periods=0,
-        adjust=True,
-        ignore_na=False,
-        axis=0,
-        times=None,
-    ):
+        com: "float | None" = None,
+        span: "float | None" = None,
+        halflife: "float | TimedeltaConvertibleTypes | None" = None,
+        alpha: "float | None" = None,
+        min_periods: "int | None" = 0,
+        adjust: "bool_t" = True,
+        ignore_na: "bool_t" = False,
+        axis: "Axis" = 0,
+        times: "str | np.ndarray | BasePandasDataset | None" = None,
+        method: "str" = "single",
+    ) -> "ExponentialMovingWindow":
         return self._default_to_pandas(
             "ewm",
             com=com,
@@ -1301,6 +1317,7 @@ def ewm(
             ignore_na=ignore_na,
             axis=axis,
             times=times,
+            method=method,
         )
 
     def expanding(self, min_periods=1, center=None, axis=0, method="single"):
@@ -1528,10 +1545,16 @@ def iloc(self):
         return _iLocIndexer(self)
 
-    def kurt(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+    def kurt(
+        self,
+        axis: "Axis | None | NoDefault" = no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
+    ):
         axis = self._get_axis_number(axis)
-        if skipna is None:
-            skipna = True
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if level is not None:
             func_kwargs = {
                 "skipna": skipna,
@@ -1582,10 +1605,9 @@ def loc(self):
         return _LocIndexer(self)
 
-    def mad(self, axis=None, skipna=None, level=None):
+    def mad(self, axis=None, skipna=True, level=None):
         axis = self._get_axis_number(axis)
-        if skipna is None:
-            skipna = True
+        validate_bool_kwarg(skipna, "skipna", none_allowed=True)
         if level is not None:
             if (
                 not self._query_compiler.has_multiindex(axis=axis)
@@ -1621,9 +1643,15 @@ def mask(
             try_cast=try_cast,
         )
 
-    def max(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
-        if skipna is None:
-            skipna = True
+    def max(
+        self,
+        axis: "int | None | NoDefault" = no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
+    ):
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if level is not None:
             return self._default_to_pandas(
                 "max",
@@ -1683,8 +1711,7 @@ def _stat_operation(
             `DataFrame` - self is DataFrame and level is specified.
         """
         axis = self._get_axis_number(axis)
-        if skipna is None:
-            skipna = True
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if level is not None:
             return self._default_to_pandas(
                 op_name,
@@ -1721,10 +1748,24 @@ def _stat_operation(
         )
         return self._reduce_dimension(result_qc)
 
-    def mean(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+    def mean(
+        self,
+        axis: "int | None | NoDefault" = no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
+    ):
         return self._stat_operation("mean", axis, skipna, level, numeric_only, **kwargs)
 
-    def median(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+    def median(
+        self,
+        axis: "int | None | NoDefault" = no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
+    ):
         return self._stat_operation(
             "median", axis, skipna, level, numeric_only, **kwargs
         )
@@ -1734,9 +1775,15 @@ def memory_usage(self, index=True, deep=False):
         return self._reduce_dimension(
             self._query_compiler.memory_usage(index=index, deep=deep)
         )
 
-    def min(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
-        if skipna is None:
-            skipna = True
+    def min(
+        self,
+        axis: "int | None | NoDefault" = no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
+    ):
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if level is not None:
             return self._default_to_pandas(
                 "min",
@@ -1873,13 +1920,13 @@ def check_dtype(t):
         return result
 
     def rank(
-        self,
+        self: "BasePandasDataset",
         axis=0,
-        method="average",
-        numeric_only=None,
-        na_option="keep",
-        ascending=True,
-        pct=False,
+        method: "str" = "average",
+        numeric_only: "bool_t | None | NoDefault" = no_default,
+        na_option: "str" = "keep",
+        ascending: "bool_t" = True,
+        pct: "bool_t" = False,
     ):
         axis = self._get_axis_number(axis)
         return self.__constructor__(
@@ -2258,7 +2305,13 @@ def sample(
         return self.__constructor__(query_compiler=query_compiler)
 
     def sem(
-        self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+        self,
+        axis=None,
+        skipna=True,
+        level=None,
+        ddof=1,
+        numeric_only=None,
+        **kwargs,
     ):
         return self._stat_operation(
             "sem", axis, skipna, level, numeric_only, ddof=ddof, **kwargs
@@ -2377,7 +2430,14 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=no_default):
         else:
             return self.tshift(periods, freq)
 
-    def skew(self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs):
+    def skew(
+        self,
+        axis: "int | None | NoDefault" = no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
+    ):
         return self._stat_operation("skew", axis, skipna, level, numeric_only, **kwargs)
 
     def sort_index(
@@ -2425,6 +2485,7 @@ def sort_values(
     ):
         axis = self._get_axis_number(axis)
         inplace = validate_bool_kwarg(inplace, "inplace")
+        ascending = validate_ascending(ascending)
         if axis == 0:
             result = self._query_compiler.sort_rows_by_column_values(
                 by,
@@ -2446,7 +2507,13 @@ def sort_values(
         return self._create_or_update_from_compiler(result, inplace)
 
     def std(
-        self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+        self,
+        axis=None,
+        skipna=True,
+        level=None,
+        ddof=1,
+        numeric_only=None,
+        **kwargs,
     ):
         return self._stat_operation(
             "std", axis, skipna, level, numeric_only, ddof=ddof, **kwargs
@@ -2703,7 +2770,7 @@ def to_period(self, freq=None, axis=0, copy=True):  # pragma: no cover
     def to_pickle(
         self,
-        path: FilePathOrBuffer,
+        path,
         compression: CompressionOptions = "infer",
         protocol: int = pkl.HIGHEST_PROTOCOL,
         storage_options: StorageOptions = None,
@@ -2909,7 +2976,7 @@ def value_counts(
         return counted_values
 
     def var(
-        self, axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs
+        self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None, **kwargs
     ):
         return self._stat_operation(
             "var", axis, skipna, level, numeric_only, ddof=ddof, **kwargs
@@ -3156,7 +3223,7 @@ def _get_new_resampler(key):
         if isinstance(
             key, (list, tuple, Series, pandas.Series, pandas.Index, np.ndarray)
         ):
-            if len(self._dataframe.columns.intersection(key)) != len(key):
+            if len(self._dataframe.columns.intersection(key)) != len(set(key)):
                 missed_keys = list(set(key).difference(self._dataframe.columns))
                 raise KeyError(f"Columns not found: {str(sorted(missed_keys))[1:-1]}")
             return _get_new_resampler(list(key))
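
The edit repeated throughout base.py — replacing `if skipna is None: skipna = True` with `validate_bool_kwarg(skipna, "skipna", none_allowed=False)` — mirrors pandas 1.4, where reductions default to skipna=True and reject None instead of silently coercing it (mad is the one exception above, keeping none_allowed=True). The visible behavior change, sketched:

    import modin.pandas as pd

    s = pd.Series([1.0, None, 3.0])
    s.sum(skipna=True)  # 4.0, as before
    s.sum(skipna=None)  # now raises ValueError; previously treated as skipna=True
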
""" axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) axis_to_apply = self.columns if axis else self.index if ( skipna is not False @@ -2211,17 +2211,19 @@ def to_records( def to_stata( self, - path, - convert_dates=None, - write_index=True, - byteorder=None, - time_stamp=None, - data_label=None, - variable_labels=None, - version=114, - convert_strl=None, - compression: Union[str, Mapping[str, str], None] = "infer", - storage_options: StorageOptions = None, + path: "FilePath | WriteBuffer[bytes]", + convert_dates: "dict[Hashable, str] | None" = None, + write_index: "bool" = True, + byteorder: "str | None" = None, + time_stamp: "datetime.datetime | None" = None, + data_label: "str | None" = None, + variable_labels: "dict[Hashable, str] | None" = None, + version: "int | None" = 114, + convert_strl: "Sequence[Hashable] | None" = None, + compression: "CompressionOptions" = "infer", + storage_options: "StorageOptions" = None, + *, + value_labels: "dict[Hashable, dict[float | int, str]] | None" = None, ): # pragma: no cover # noqa: PR01, RT01, D200 """ Export ``DataFrame`` object to Stata data format. @@ -2239,6 +2241,7 @@ def to_stata( convert_strl=convert_strl, compression=compression, storage_options=storage_options, + value_labels=value_labels, ) def to_timestamp( @@ -2329,7 +2332,7 @@ def update( def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 892b21b364d..dc8a8996f31 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -16,8 +16,7 @@ import pandas import numpy as np -from typing import Hashable, Iterable, Mapping, Optional, Union -from pandas._typing import FrameOrSeriesUnion +from typing import Hashable, Iterable, Mapping, Union from pandas.core.dtypes.common import is_list_like from modin.error_message import ErrorMessage @@ -359,9 +358,7 @@ def value_counts( @_inherit_docstrings(pandas.concat) def concat( - objs: Union[ - Iterable[FrameOrSeriesUnion], Mapping[Optional[Hashable], FrameOrSeriesUnion] - ], + objs: "Iterable[DataFrame | Series] | Mapping[Hashable, DataFrame | Series]", axis=0, join="outer", ignore_index: bool = False, @@ -371,7 +368,7 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: bool = True, -) -> FrameOrSeriesUnion: +) -> "DataFrame | Series": if isinstance(objs, (pandas.Series, Series, DataFrame, str, pandas.DataFrame)): raise TypeError( "first argument must be an iterable of pandas " diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 0e60fe4c514..391d5179019 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -17,7 +17,7 @@ import pandas import pandas.core.groupby from pandas.core.dtypes.common import is_list_like, is_numeric_dtype -from pandas.core.aggregation import reconstruct_func +from pandas.core.apply import reconstruct_func from pandas._libs.lib import no_default import pandas.core.common as com from types import BuiltinFunctionType diff --git a/modin/pandas/io.py b/modin/pandas/io.py index ac40341067c..15f2160f3de 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -26,7 +26,7 @@ import pathlib import re from collections import OrderedDict -from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions +from pandas._typing import CompressionOptions, StorageOptions from typing import Union, IO, AnyStr, Sequence, Dict, List, Optional, Any from modin.error_message 
diff --git a/modin/pandas/io.py b/modin/pandas/io.py
index ac40341067c..15f2160f3de 100644
--- a/modin/pandas/io.py
+++ b/modin/pandas/io.py
@@ -26,7 +26,7 @@
 import pathlib
 import re
 from collections import OrderedDict
-from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions
+from pandas._typing import CompressionOptions, StorageOptions
 from typing import Union, IO, AnyStr, Sequence, Dict, List, Optional, Any
 
 from modin.error_message import ErrorMessage
@@ -73,18 +73,18 @@ def _read(**kwargs):
 
 @_inherit_docstrings(pandas.read_csv)
 def read_csv(
-    filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
+    filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]",
     sep=lib.no_default,
     delimiter=None,
     header="infer",
     names=lib.no_default,
     index_col=None,
     usecols=None,
-    squeeze=False,
+    squeeze=None,
     prefix=lib.no_default,
     mangle_dupe_cols=True,
-    dtype=None,
-    engine=None,
+    dtype: "DtypeArg | None" = None,
+    engine: "CSVEngine | None" = None,
     converters=None,
     true_values=None,
     false_values=None,
@@ -96,7 +96,7 @@ def read_csv(
     na_filter=True,
     verbose=False,
     skip_blank_lines=True,
-    parse_dates=False,
+    parse_dates=None,
     infer_datetime_format=False,
     keep_date_col=False,
     date_parser=None,
@@ -104,16 +104,16 @@ def read_csv(
     cache_dates=True,
     iterator=False,
     chunksize=None,
-    compression="infer",
+    compression: "CompressionOptions" = "infer",
     thousands=None,
-    decimal: str = ".",
+    decimal: "str" = ".",
     lineterminator=None,
     quotechar='"',
     quoting=0,
     escapechar=None,
     comment=None,
     encoding=None,
-    encoding_errors="strict",
+    encoding_errors: "str | None" = "strict",
     dialect=None,
     error_bad_lines=None,
     warn_bad_lines=None,
@@ -124,7 +124,7 @@ def read_csv(
     low_memory=True,
     memory_map=False,
     float_precision=None,
-    storage_options: StorageOptions = None,
+    storage_options: "StorageOptions" = None,
 ):
     # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args
     _pd_read_csv_signature = {
@@ -137,23 +137,24 @@ def read_csv(
 
 @_inherit_docstrings(pandas.read_table)
 def read_table(
-    filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
+    filepath_or_buffer: "FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]",
     sep=lib.no_default,
     delimiter=None,
     header="infer",
     names=lib.no_default,
     index_col=None,
     usecols=None,
-    squeeze=False,
+    squeeze=None,
     prefix=lib.no_default,
     mangle_dupe_cols=True,
-    dtype=None,
-    engine=None,
+    dtype: "DtypeArg | None" = None,
+    engine: "CSVEngine | None" = None,
     converters=None,
     true_values=None,
     false_values=None,
     skipinitialspace=False,
     skiprows=None,
+    skipfooter=0,
     nrows=None,
     na_values=None,
     keep_default_na=True,
@@ -168,26 +169,26 @@ def read_table(
     cache_dates=True,
     iterator=False,
     chunksize=None,
-    compression="infer",
+    compression: "CompressionOptions" = "infer",
     thousands=None,
-    decimal: str = ".",
+    decimal: "str" = ".",
     lineterminator=None,
     quotechar='"',
     quoting=0,
+    doublequote=True,
     escapechar=None,
     comment=None,
     encoding=None,
-    encoding_errors="strict",
+    encoding_errors: "str | None" = "strict",
     dialect=None,
     error_bad_lines=None,
     warn_bad_lines=None,
     on_bad_lines=None,
-    skipfooter=0,
-    doublequote=True,
     delim_whitespace=False,
     low_memory=True,
     memory_map=False,
     float_precision=None,
+    storage_options: "StorageOptions" = None,
 ):
     # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args
     _pd_read_csv_signature = {
@@ -317,32 +318,33 @@ def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
 
 @_inherit_docstrings(pandas.read_excel)
 def read_excel(
     io,
-    sheet_name=0,
-    header=0,
+    sheet_name: "str | int | list[IntStrT] | None" = 0,
+    header: "int | Sequence[int] | None" = 0,
     names=None,
-    index_col=None,
+    index_col: "int | Sequence[int] | None" = None,
     usecols=None,
-    squeeze=False,
-    dtype=None,
-    engine=None,
+    squeeze: "bool | None" = None,
+    dtype: "DtypeArg | None" = None,
+    engine: "Literal[('xlrd', 'openpyxl', 'odf', 'pyxlsb')] | None" = None,
     converters=None,
-    true_values=None,
-    false_values=None,
-    skiprows=None,
-    nrows=None,
+    true_values: "Iterable[Hashable] | None" = None,
+    false_values: "Iterable[Hashable] | None" = None,
+    skiprows: "Sequence[int] | int | Callable[[int], object] | None" = None,
+    nrows: "int | None" = None,
     na_values=None,
-    keep_default_na=True,
-    na_filter=True,
-    verbose=False,
+    keep_default_na: "bool" = True,
+    na_filter: "bool" = True,
+    verbose: "bool" = False,
     parse_dates=False,
     date_parser=None,
-    thousands=None,
-    comment=None,
-    skipfooter=0,
-    convert_float=None,
-    mangle_dupe_cols=True,
-    storage_options: StorageOptions = None,
-):
+    thousands: "str | None" = None,
+    decimal: "str" = ".",
+    comment: "str | None" = None,
+    skipfooter: "int" = 0,
+    convert_float: "bool | None" = None,
+    mangle_dupe_cols: "bool" = True,
+    storage_options: "StorageOptions" = None,
+) -> "DataFrame | dict[IntStrT, DataFrame]":
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
 
     Engine.subscribe(_update_engine)
@@ -438,7 +440,7 @@ def read_sas(
 
 @_inherit_docstrings(pandas.read_pickle)
 def read_pickle(
-    filepath_or_buffer: FilePathOrBuffer,
+    filepath_or_buffer,
     compression: Optional[str] = "infer",
     storage_options: StorageOptions = None,
 ):
@@ -557,7 +559,7 @@ def read_spss(
 
 @_inherit_docstrings(pandas.to_pickle)
 def to_pickle(
     obj: Any,
-    filepath_or_buffer: FilePathOrBuffer,
+    filepath_or_buffer,
     compression: CompressionOptions = "infer",
     protocol: int = pickle.HIGHEST_PROTOCOL,
     storage_options: StorageOptions = None,
@@ -597,9 +599,7 @@ def json_normalize(
 
 @_inherit_docstrings(pandas.read_orc)
-def read_orc(
-    path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs
-) -> DataFrame:
+def read_orc(path, columns: Optional[List[str]] = None, **kwargs) -> DataFrame:
     ErrorMessage.default_to_pandas("read_orc")
     Engine.subscribe(_update_engine)
     return DataFrame(pandas.read_orc(path, columns, **kwargs))
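
The read_csv/read_table/read_excel rewrites above are pure signature synchronization with pandas 1.4: new defaults (squeeze=None, parse_dates=None), repositioned parameters (skipfooter, doublequote), newly threaded-through keywords (storage_options on read_table, decimal on read_excel), and stringified annotations copied from pandas. A quick parity check of the kind that motivates such edits (a sketch relying only on the stdlib inspect module):

    import inspect

    import pandas
    import modin.pandas as mpd

    pd_params = set(inspect.signature(pandas.read_table).parameters)
    mpd_params = set(inspect.signature(mpd.read_table).parameters)
    # Print any keyword present in one signature but not the other;
    # ideally this prints an empty set.
    print(pd_params ^ mpd_params)
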
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index f6a868d8281..ab955ae8cb7 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -453,8 +453,8 @@ def __repr__(self):
         -------
         str
         """
-        num_rows = pandas.get_option("max_rows") or 60
-        num_cols = pandas.get_option("max_columns") or 20
+        num_rows = pandas.get_option("display.max_rows") or 60
+        num_cols = pandas.get_option("display.max_columns") or 20
         temp_df = self._build_repr_df(num_rows, num_cols)
         if isinstance(temp_df, pandas.DataFrame) and not temp_df.empty:
             temp_df = temp_df.iloc[:, 0]
@@ -1161,6 +1161,23 @@ def idxmin(self, axis=0, skipna=True, *args, **kwargs):  # noqa: PR01, RT01, D20
             skipna = True
         return super(Series, self).idxmin(axis=axis, skipna=skipna, *args, **kwargs)
 
+    def info(
+        self,
+        verbose: "bool | None" = None,
+        buf: "IO[str] | None" = None,
+        max_cols: "int | None" = None,
+        memory_usage: "bool | str | None" = None,
+        show_counts: "bool" = True,
+    ):
+        return self._default_to_pandas(
+            pandas.Series.info,
+            verbose=verbose,
+            buf=buf,
+            max_cols=max_cols,
+            memory_usage=memory_usage,
+            show_counts=show_counts,
+        )
+
     def interpolate(
         self,
         method="linear",
@@ -1218,7 +1235,12 @@ def keys(self):  # noqa: RT01, D200
         return self.index
 
     def kurt(
-        self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs
+        self,
+        axis: "Axis | None | NoDefault" = no_default,
+        skipna=True,
+        level=None,
+        numeric_only=None,
+        **kwargs,
     ):  # noqa: PR01, RT01, D200
         """
         Return unbiased kurtosis over requested axis.
@@ -1262,6 +1284,27 @@ def arg(s):
             )
         )
 
+    def mask(
+        self,
+        cond,
+        other=np.nan,
+        inplace=False,
+        axis=None,
+        level=None,
+        errors=no_default,
+        try_cast=no_default,
+    ):
+        return self._default_to_pandas(
+            pandas.Series.mask,
+            cond,
+            other=other,
+            inplace=inplace,
+            axis=axis,
+            level=level,
+            errors=errors,
+            try_cast=try_cast,
+        )
+
     def memory_usage(self, index=True, deep=False):  # noqa: PR01, RT01, D200
         """
         Return the memory usage of the Series.
@@ -1409,7 +1452,7 @@ def pow(self, other, level=None, fill_value=None, axis=0):  # noqa: PR01, RT01,
     def prod(
         self,
         axis=None,
-        skipna=None,
+        skipna=True,
         level=None,
         numeric_only=None,
         min_count=0,
@@ -1419,8 +1462,7 @@ def prod(
         Return the product of the values over the requested `axis`.
         """
         axis = self._get_axis_number(axis)
-        if skipna is None:
-            skipna = True
+        validate_bool_kwarg(skipna, "skipna", none_allowed=False)
         if level is not None:
             if (
                 not self._query_compiler.has_multiindex(axis=axis)
@@ -1476,10 +1518,19 @@ def ravel(self, order="C"):  # noqa: PR01, RT01, D200
 
         return data
 
-    def reindex(self, index=None, **kwargs):  # noqa: PR01, RT01, D200
+    def reindex(self, *args, **kwargs):  # noqa: PR01, RT01, D200
         """
         Conform Series to new index with optional filling logic.
         """
+        if args:
+            if len(args) > 1:
+                raise TypeError("Only one positional argument ('index') is allowed")
+            if "index" in kwargs:
+                raise TypeError(
+                    "'index' passed as both positional and keyword argument"
+                )
+            kwargs.update({"index": args[0]})
+        index = kwargs.pop("index", None)
         method = kwargs.pop("method", None)
         level = kwargs.pop("level", None)
         copy = kwargs.pop("copy", True)
@@ -1543,16 +1594,20 @@ def repeat(self, repeats, axis=None):  # noqa: PR01, RT01, D200
         return self.__constructor__(query_compiler=self._query_compiler.repeat(repeats))
 
     def reset_index(
-        self, level=None, drop=False, name=None, inplace=False
+        self, level=None, drop=False, name=no_default, inplace=False
     ):  # noqa: PR01, RT01, D200
         """
         Generate a new Series with the index reset.
         """
+        if name is no_default:
+            # For backwards compatibility, keep columns as [0] instead of
+            # [None] when self.name is None
+            name = 0 if self.name is None else self.name
+
         if drop and level is None:
             new_idx = pandas.RangeIndex(len(self.index))
             if inplace:
                 self.index = new_idx
-                self.name = name or self.name
             else:
                 result = self.copy()
                 result.index = new_idx
@@ -1563,8 +1618,7 @@ def reset_index(
             )
         else:
             obj = self.copy()
-            if name is not None:
-                obj.name = name
+            obj.name = name
             from .dataframe import DataFrame
 
             return DataFrame(obj).reset_index(level=level, drop=drop, inplace=inplace)
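
The no_default sentinel in reset_index keeps the long-standing output for unnamed Series while still honoring an explicit name=None. Intended behavior, inferred from the comment in the patch:

    import modin.pandas as pd

    s = pd.Series([10, 20])                   # s.name is None
    s.reset_index().columns                   # ['index', 0]: unnamed -> column 0
    s.rename("x").reset_index().columns       # ['index', 'x']
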
""" axis = self._get_axis_number(axis) - if skipna is None: - skipna = True + validate_bool_kwarg(skipna, "skipna", none_allowed=False) if numeric_only is True: raise NotImplementedError("Series.sum does not implement numeric_only") if level is not None: @@ -1843,15 +1896,21 @@ def to_dict(self, into=dict): # pragma: no cover # noqa: PR01, RT01, D200 """ return self._default_to_pandas("to_dict", into=into) - def to_frame(self, name=None): # noqa: PR01, RT01, D200 + def to_frame( + self, name: "Hashable" = no_default + ) -> "DataFrame": # noqa: PR01, RT01, D200 """ Convert Series to {label -> value} dict or dict-like object. """ from .dataframe import DataFrame + if name is None: + name = no_default + self_cp = self.copy() - if name is not None: + if name is not no_default: self_cp.name = name + return DataFrame(self_cp) def to_list(self): # noqa: RT01, D200 @@ -2011,11 +2070,11 @@ def view(self, dtype=None): # noqa: PR01, RT01, D200 def where( self, cond, - other=np.nan, + other=no_default, inplace=False, axis=None, level=None, - errors="raise", + errors=no_default, try_cast=no_default, ): # noqa: PR01, RT01, D200 """ diff --git a/modin/pandas/test/dataframe/test_join_sort.py b/modin/pandas/test/dataframe/test_join_sort.py index 8d949e26429..1d277d45173 100644 --- a/modin/pandas/test/dataframe/test_join_sort.py +++ b/modin/pandas/test/dataframe/test_join_sort.py @@ -305,13 +305,23 @@ def test_merge(test_data, test_data2): ) df_equals(modin_result, pandas_result) - # Named Series promoted to DF - s = pd.Series(frame_data2.get("col1")) - with pytest.raises(ValueError): - modin_df.merge(s) + # Cannot merge a Series without a name + ps = pandas.Series(frame_data2.get("col1")) + ms = pd.Series(frame_data2.get("col1")) + eval_general( + modin_df, + pandas_df, + lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps), + ) - s = pd.Series(frame_data2.get("col1"), name="col1") - df_equals(modin_df.merge(s), modin_df.merge(modin_df2[["col1"]])) + # merge a Series with a name + ps = pandas.Series(frame_data2.get("col1"), name="col1") + ms = pd.Series(frame_data2.get("col1"), name="col1") + eval_general( + modin_df, + pandas_df, + lambda df: df.merge(ms if isinstance(df, pd.DataFrame) else ps), + ) with pytest.raises(TypeError): modin_df.merge("Non-valid type") @@ -449,6 +459,8 @@ def test_sort_multiindex(sort_remaining): def test_sort_values( data, by, axis, ascending, inplace, kind, na_position, ignore_index, key ): + if ascending is None: + pytest.skip("None is not a valid value for ascending.") if (axis == 1 or axis == "columns") and ignore_index: pytest.skip("Pandas bug #39426 which is fixed in Pandas 1.3") diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py index 692173c1c4f..53358805d3e 100644 --- a/modin/pandas/test/dataframe/test_map_metadata.py +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -410,15 +410,6 @@ def test_append(data): modin_df.append(list(modin_df.iloc[-1])) else: modin_result = modin_df.append(list(modin_df.iloc[-1])) - # Pandas has bug where sort=False is ignored - # (https://github.com/pandas-dev/pandas/issues/35092), but Modin - # now does the right thing, so for now manually sort to workaround - # this. Once the Pandas bug is fixed and Modin upgrades to that - # Pandas release, this sort will cause the test to fail, and the - # next three lines should be deleted. 
@@ -449,6 +459,8 @@ def test_sort_multiindex(sort_remaining):
 def test_sort_values(
     data, by, axis, ascending, inplace, kind, na_position, ignore_index, key
 ):
+    if ascending is None:
+        pytest.skip("None is not a valid value for ascending.")
     if (axis == 1 or axis == "columns") and ignore_index:
         pytest.skip("Pandas bug #39426 which is fixed in Pandas 1.3")
diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py
index 692173c1c4f..53358805d3e 100644
--- a/modin/pandas/test/dataframe/test_map_metadata.py
+++ b/modin/pandas/test/dataframe/test_map_metadata.py
@@ -410,15 +410,6 @@ def test_append(data):
             modin_df.append(list(modin_df.iloc[-1]))
     else:
         modin_result = modin_df.append(list(modin_df.iloc[-1]))
-        # Pandas has bug where sort=False is ignored
-        # (https://github.com/pandas-dev/pandas/issues/35092), but Modin
-        # now does the right thing, so for now manually sort to workaround
-        # this. Once the Pandas bug is fixed and Modin upgrades to that
-        # Pandas release, this sort will cause the test to fail, and the
-        # next three lines should be deleted.
-        if get_current_execution() != "BaseOnPython":
-            assert list(modin_result.columns) == list(modin_df.columns) + [0]
-            modin_result = modin_result[[0] + sorted(modin_df.columns)]
         df_equals(modin_result, pandas_result)
 
 verify_integrity_values = [True, False]
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index 88fe29704f1..da63097f2cd 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -316,6 +316,9 @@ def test_simple_row_groupby(by, as_index, col1_category):
     if col1_category:
         pandas_df = pandas_df.astype({"col1": "category"})
+        # As of pandas 1.4.0 operators like min cause TypeErrors to be raised on unordered
+        # categorical columns. We need to specify the categorical column as ordered to bypass this.
+        pandas_df["col1"] = pandas_df["col1"].cat.as_ordered()
     modin_df = from_pandas(pandas_df)
 
     n = 1
@@ -1397,9 +1400,10 @@ def test_groupby_with_kwarg_dropna(groupby_kwargs, dropna):
         modin_df = modin_df.T
         pandas_df = pandas_df.T
 
-    md_grp, pd_grp = modin_df.groupby(
-        **groupby_kwargs, dropna=dropna
-    ), pandas_df.groupby(**groupby_kwargs, dropna=dropna)
+    md_grp, pd_grp = (
+        modin_df.groupby(**groupby_kwargs, dropna=dropna),
+        pandas_df.groupby(**groupby_kwargs, dropna=dropna),
+    )
     modin_groupby_equals_pandas(md_grp, pd_grp)
 
     by_kwarg = groupby_kwargs.get("by", [])
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index e2190c353fc..a94ff9e1cfe 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -1944,19 +1944,7 @@ def test_fwf_file_chunksize(self, make_fwf_file):
 
         df_equals(modin_df, pd_df)
 
-    @pytest.mark.parametrize(
-        "nrows",
-        [
-            pytest.param(
-                13,
-                marks=pytest.mark.xfail(
-                    Engine.get() == "Ray",
-                    reason="read_fwf bug on pandas side: pandas-dev/pandas#44021",
-                ),
-            ),
-            None,
-        ],
-    )
+    @pytest.mark.parametrize("nrows", [13, None])
     def test_fwf_file_skiprows(self, make_fwf_file, nrows):
         unique_filename = make_fwf_file()
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 65b5f09b888..0e6bfb976e1 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -769,8 +769,7 @@ def test_align(data):
     "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
 )
 def test_all(data, skipna):
-    modin_series, pandas_series = create_test_series(data)
-    df_equals(modin_series.all(skipna=skipna), pandas_series.all(skipna=skipna))
+    eval_general(*create_test_series(data), lambda df: df.all(skipna=skipna))
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -778,8 +777,7 @@ def test_all(data, skipna):
     "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
 )
 def test_any(data, skipna):
-    modin_series, pandas_series = create_test_series(data)
-    df_equals(modin_series.any(skipna=skipna), pandas_series.any(skipna=skipna))
+    eval_general(*create_test_series(data), lambda df: df.any(skipna=skipna))
 
 
 @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -2135,10 +2133,9 @@ def test_lt(data):
 @pytest.mark.parametrize("skipna", [None, True, False])
 @pytest.mark.parametrize("level", [0, -1, None])
 def test_mad(level, data, axis, skipna):
-    modin_series, pandas_series = create_test_series(data)
-    df_equals(
-        modin_series.mad(axis=axis, skipna=skipna, level=level),
-        pandas_series.mad(axis=axis, skipna=skipna, level=level),
+    eval_general(
+        *create_test_series(data),
+        lambda df: df.mad(axis=axis, skipna=skipna, level=level),
     )
 
 
@@ -2182,8 +2179,7 @@ def test_mask():
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_max(data, skipna): - modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.max(skipna=skipna), pandas_series.max(skipna=skipna)) + eval_general(*create_test_series(data), lambda df: df.max(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2191,8 +2187,7 @@ def test_max(data, skipna): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_mean(data, skipna): - modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.mean(skipna=skipna), pandas_series.mean(skipna=skipna)) + eval_general(*create_test_series(data), lambda df: df.mean(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2200,8 +2195,7 @@ def test_mean(data, skipna): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_median(data, skipna): - modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.median(skipna=skipna), pandas_series.median(skipna=skipna)) + eval_general(*create_test_series(data), lambda df: df.median(skipna=skipna)) @pytest.mark.parametrize( @@ -2233,8 +2227,7 @@ def test_memory_usage(data, index): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_min(data, skipna): - modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.min(skipna=skipna), pandas_series.min(skipna=skipna)) + eval_general(*create_test_series(data), lambda df: df.min(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -2948,8 +2941,7 @@ def test_size(data): "skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys) ) def test_skew(data, skipna): - modin_series, pandas_series = create_test_series(data) - df_equals(modin_series.skew(skipna=skipna), pandas_series.skew(skipna=skipna)) + eval_general(*create_test_series(data), lambda df: df.skew(skipna=skipna)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -3512,8 +3504,8 @@ def test_var(data, skipna, ddof): try: pandas_result = pandas_series.var(skipna=skipna, ddof=ddof) - except Exception: - with pytest.raises(TypeError): + except Exception as e: + with pytest.raises(type(e)): modin_series.var(skipna=skipna, ddof=ddof) else: modin_result = modin_series.var(skipna=skipna, ddof=ddof) diff --git a/requirements-dev.txt b/requirements-dev.txt index d321683183a..1e8e356f7d6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,5 @@ -pandas==1.3.5 -numpy>=1.16.5 +pandas==1.4.0 +numpy>=1.18.5 pyarrow>=4.0.1 dask[complete]>=2.22.0 distributed>=2.22.0 diff --git a/requirements/env_omnisci.yml b/requirements/env_omnisci.yml index 9370c1d44de..b6aa6b580c9 100644 --- a/requirements/env_omnisci.yml +++ b/requirements/env_omnisci.yml @@ -2,9 +2,9 @@ name: modin_on_omnisci channels: - conda-forge dependencies: - - pandas==1.3.5 + - pandas==1.4.0 - pyarrow=6 - - numpy>=1.16.5 + - numpy>=1.18.5 - fsspec - pip - pytest>=6.0.1 diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml index 79a65081e4e..814ce0fd0cd 100644 --- a/requirements/requirements-no-engine.yml +++ b/requirements/requirements-no-engine.yml @@ -1,8 +1,8 @@ channels: - conda-forge dependencies: - - pandas==1.3.5 - - numpy>=1.16.5 + - pandas==1.4.0 + - numpy>=1.18.5 - pyarrow>=4.0.1 - fsspec - xarray diff --git a/setup.cfg b/setup.cfg index 76605da2173..589734da50a 100644 --- a/setup.cfg +++ b/setup.cfg @@ 
diff --git a/setup.cfg b/setup.cfg
index 76605da2173..589734da50a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,7 +22,7 @@ filterwarnings =
 
 [flake8]
 max-line-length = 88
-ignore = E203, E266, E501, W503
+ignore = E203, E266, E501, W503, F821
 select = B,C,E,F,W,T4,B9
 per-file-ignores =
     modin/pandas/__init__.py:E402,F401
diff --git a/setup.py b/setup.py
index e42555d77d7..a1935df70a3 100644
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@
     url="https://github.com/modin-project/modin",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    install_requires=["pandas==1.3.5", "packaging", "numpy>=1.16.5", "fsspec"],
+    install_requires=["pandas==1.4.0", "packaging", "numpy>=1.18.5", "fsspec"],
     extras_require={
         # can be installed by pip install modin[dask]
         "dask": dask_deps,
@@ -32,5 +32,5 @@
         "sql": sql_deps,
         "all": all_deps,
     },
-    python_requires=">=3.7.1",
+    python_requires=">=3.8",
 )
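
A closing note on the new F821 entry in setup.cfg: it is connected to the stringified annotations introduced throughout this diff. pyflakes resolves names inside string annotations, and names such as DtypeArg, CSVEngine or FilePath are deliberately left unimported (they exist only to mirror pandas 1.4's documented signatures), so every such annotation would otherwise be flagged as an undefined name. A minimal reproduction of the silenced lint error (hypothetical file, not part of the repo):

    # Without "ignore = ..., F821", flake8 reports here:
    #     F821 undefined name 'DtypeArg'
    # because pyflakes checks names inside string annotations too.
    def read_csv_like(filepath_or_buffer, dtype: "DtypeArg | None" = None):
        ...
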